eval-framework 0.2.13__tar.gz → 0.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.13 → eval_framework-0.2.14}/PKG-INFO +1 -1
- {eval_framework-0.2.13 → eval_framework-0.2.14}/pyproject.toml +1 -1
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/aleph_alpha.py +7 -2
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/base.py +5 -2
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/huggingface.py +3 -1
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/openai.py +15 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/vllm.py +10 -1
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/math_reasoning.py +19 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mbpp.py +101 -6
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/task_names.py +2 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/generate_task_docs.py +8 -4
- {eval_framework-0.2.13 → eval_framework-0.2.14}/LICENSE +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/README.md +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/run.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/py.typed +0 -0
|
@@ -200,11 +200,16 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
200
200
|
stop_sequences: list[str] | None = None,
|
|
201
201
|
max_tokens: int | None = None,
|
|
202
202
|
temperature: float | None = None,
|
|
203
|
+
top_p: float | None = None,
|
|
203
204
|
) -> list[RawCompletion]:
|
|
204
205
|
effective_temperature = temperature if temperature is not None else self._temperature
|
|
206
|
+
if effective_temperature is not None and not (0 <= effective_temperature <= 2):
|
|
207
|
+
raise ValueError(f"temperature must be between 0 and 2, got {effective_temperature}")
|
|
208
|
+
effective_top_p = top_p if top_p is not None else self._top_p
|
|
209
|
+
if effective_top_p is not None and not (0 < effective_top_p <= 1):
|
|
210
|
+
raise ValueError(f"top_p must be between 0 and 1 (exclusive), got {effective_top_p}")
|
|
205
211
|
|
|
206
212
|
requests: list[CompletionRequest] = []
|
|
207
|
-
|
|
208
213
|
# Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
|
|
209
214
|
scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
|
|
210
215
|
|
|
@@ -215,7 +220,7 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
215
220
|
maximum_tokens=scaled_max_tokens,
|
|
216
221
|
stop_sequences=stop_sequences,
|
|
217
222
|
temperature=effective_temperature,
|
|
218
|
-
top_p=
|
|
223
|
+
top_p=effective_top_p,
|
|
219
224
|
)
|
|
220
225
|
)
|
|
221
226
|
|
|
@@ -24,6 +24,7 @@ class BaseLLM(ABC):
|
|
|
24
24
|
stop_sequences: list[str] | None = None,
|
|
25
25
|
max_tokens: int | None = None,
|
|
26
26
|
temperature: float | None = None,
|
|
27
|
+
top_p: float | None = None,
|
|
27
28
|
) -> list[RawCompletion]:
|
|
28
29
|
"""
|
|
29
30
|
stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
|
|
@@ -47,6 +48,7 @@ class BaseLLM(ABC):
|
|
|
47
48
|
stop_sequences: list[str] | None = None,
|
|
48
49
|
max_tokens: int | None = None,
|
|
49
50
|
temperature: float | None = None,
|
|
51
|
+
top_p: float | None = None,
|
|
50
52
|
) -> list[RawCompletion]:
|
|
51
53
|
"""
|
|
52
54
|
stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
|
|
@@ -79,6 +81,7 @@ class BaseLLM(ABC):
|
|
|
79
81
|
stop_sequences: list[str] | None = None,
|
|
80
82
|
max_tokens: int | None = None,
|
|
81
83
|
temperature: float | None = None,
|
|
84
|
+
top_p: float | None = None,
|
|
82
85
|
) -> list[RawCompletion]:
|
|
83
86
|
"""Generates a model response for each sample.
|
|
84
87
|
|
|
@@ -86,10 +89,10 @@ class BaseLLM(ABC):
|
|
|
86
89
|
otherwise falls back to 'generate_from_messages'.
|
|
87
90
|
"""
|
|
88
91
|
try:
|
|
89
|
-
return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature)
|
|
92
|
+
return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature, top_p)
|
|
90
93
|
except NotImplementedError:
|
|
91
94
|
messages: list[Sequence[Message]] = [sample.messages for sample in samples]
|
|
92
|
-
return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature)
|
|
95
|
+
return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)
|
|
93
96
|
|
|
94
97
|
def post_process_completion(self, completion: str, sample: Sample) -> str:
|
|
95
98
|
"""
|
|
@@ -146,6 +146,7 @@ class BaseHFLLM(BaseLLM):
|
|
|
146
146
|
stop_sequences: list[str] | None = None,
|
|
147
147
|
max_tokens: int | None = None,
|
|
148
148
|
temperature: float | None = None,
|
|
149
|
+
top_p: float | None = None,
|
|
149
150
|
) -> list[RawCompletion]:
|
|
150
151
|
if temperature is None:
|
|
151
152
|
effective_temperature = 0.0 # Current default, TODO: refactor to use model's default
|
|
@@ -154,7 +155,8 @@ class BaseHFLLM(BaseLLM):
|
|
|
154
155
|
)
|
|
155
156
|
else:
|
|
156
157
|
effective_temperature = temperature
|
|
157
|
-
|
|
158
|
+
if top_p is not None:
|
|
159
|
+
logger.warning("Huggingface LLM does not support top_p. Ignoring top_p value.")
|
|
158
160
|
raw_completions = []
|
|
159
161
|
for single_messages in messages:
|
|
160
162
|
# format
|
|
@@ -34,6 +34,7 @@ class OpenAIModel(BaseLLM):
|
|
|
34
34
|
model_name: str | None = None,
|
|
35
35
|
formatter: BaseFormatter | None = None,
|
|
36
36
|
temperature: float | None = None,
|
|
37
|
+
top_p: float | None = None,
|
|
37
38
|
api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
|
|
38
39
|
organization: str | None = None,
|
|
39
40
|
base_url: str | None = None,
|
|
@@ -46,6 +47,7 @@ class OpenAIModel(BaseLLM):
|
|
|
46
47
|
model_name: OpenAI model name (e.g., "gpt-4o", "gpt-3.5-turbo"). If None, uses LLM_NAME class attribute.
|
|
47
48
|
formatter: Optional message formatter.
|
|
48
49
|
temperature: Sampling temperature used when not passed to generate methods (from 0.0 to 2.0).
|
|
50
|
+
top_p: Nucleus sampling probability mass (from 0.0 to 1.0). If None, the API default is used.
|
|
49
51
|
api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable).
|
|
50
52
|
organization: Optional OpenAI organization ID.
|
|
51
53
|
base_url: Optional API base URL for Azure or alternate endpoints.
|
|
@@ -59,6 +61,10 @@ class OpenAIModel(BaseLLM):
|
|
|
59
61
|
self._temperature = temperature if temperature is not None else 0.0
|
|
60
62
|
assert 0.0 <= self._temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
|
|
61
63
|
|
|
64
|
+
if top_p is not None:
|
|
65
|
+
assert 0.0 <= top_p <= 1.0, "top_p must be between 0.0 and 1.0"
|
|
66
|
+
self._top_p = top_p
|
|
67
|
+
|
|
62
68
|
self._client = OpenAI(
|
|
63
69
|
api_key=api_key,
|
|
64
70
|
organization=organization,
|
|
@@ -97,6 +103,7 @@ class OpenAIModel(BaseLLM):
|
|
|
97
103
|
stop_sequences: list[str] | None = None,
|
|
98
104
|
max_tokens: int | None = None,
|
|
99
105
|
temperature: float | None = None,
|
|
106
|
+
top_p: float | None = None,
|
|
100
107
|
) -> list[RawCompletion]:
|
|
101
108
|
"""
|
|
102
109
|
Generate completions for a list of message sequences concurrently.
|
|
@@ -108,6 +115,7 @@ class OpenAIModel(BaseLLM):
|
|
|
108
115
|
stop_sequences: Optional list of stop sequences.
|
|
109
116
|
max_tokens: Optional maximum number of tokens to generate.
|
|
110
117
|
temperature: Sampling temperature.
|
|
118
|
+
top_p: Nucleus sampling probability mass (0.0 to 1.0). Overrides instance default if provided.
|
|
111
119
|
|
|
112
120
|
Returns:
|
|
113
121
|
List of RawCompletion objects containing prompts and completions.
|
|
@@ -116,6 +124,10 @@ class OpenAIModel(BaseLLM):
|
|
|
116
124
|
effective_temperature = temperature if temperature is not None else self._temperature
|
|
117
125
|
assert 0.0 <= effective_temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
|
|
118
126
|
|
|
127
|
+
effective_top_p = top_p if top_p is not None else self._top_p
|
|
128
|
+
if effective_top_p is not None:
|
|
129
|
+
assert 0.0 <= effective_top_p <= 1.0, "top_p must be between 0.0 and 1.0"
|
|
130
|
+
|
|
119
131
|
def _process_one(single_messages: Sequence[Message]) -> RawCompletion:
|
|
120
132
|
# Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
|
|
121
133
|
scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
|
|
@@ -129,6 +141,7 @@ class OpenAIModel(BaseLLM):
|
|
|
129
141
|
model=self._model_name,
|
|
130
142
|
prompt=prompt,
|
|
131
143
|
temperature=effective_temperature,
|
|
144
|
+
top_p=effective_top_p,
|
|
132
145
|
max_tokens=scaled_max_tokens,
|
|
133
146
|
stop=stop_sequences,
|
|
134
147
|
)
|
|
@@ -158,6 +171,7 @@ class OpenAIModel(BaseLLM):
|
|
|
158
171
|
model=self._model_name,
|
|
159
172
|
messages=chat_messages,
|
|
160
173
|
temperature=effective_temperature,
|
|
174
|
+
top_p=effective_top_p,
|
|
161
175
|
max_tokens=scaled_max_tokens,
|
|
162
176
|
stop=stop_sequences,
|
|
163
177
|
)
|
|
@@ -300,6 +314,7 @@ class OpenAIEmbeddingModel(BaseLLM):
|
|
|
300
314
|
stop_sequences: list[str] | None = None,
|
|
301
315
|
max_tokens: int | None = None,
|
|
302
316
|
temperature: float | None = None,
|
|
317
|
+
top_p: float | None = None,
|
|
303
318
|
) -> list[RawCompletion]:
|
|
304
319
|
raise NotImplementedError(
|
|
305
320
|
"Embedding model does not support generate_from_messages. Use generate_embeddings instead."
|
|
@@ -226,6 +226,7 @@ class BaseVLLMModel(BaseLLM):
|
|
|
226
226
|
stop_sequences: list[str] | None = None,
|
|
227
227
|
max_tokens: int | None = None,
|
|
228
228
|
temperature: float | None = None,
|
|
229
|
+
top_p: float | None = None,
|
|
229
230
|
) -> list[RawCompletion]:
|
|
230
231
|
raw_completions: list[RawCompletion | None] = [None] * len(messages)
|
|
231
232
|
prompt_objs = []
|
|
@@ -235,7 +236,7 @@ class BaseVLLMModel(BaseLLM):
|
|
|
235
236
|
scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
|
|
236
237
|
|
|
237
238
|
sampling_params = self._resolve_sampling_params(
|
|
238
|
-
self.sampling_params, scaled_max_tokens, stop_sequences, temperature
|
|
239
|
+
self.sampling_params, scaled_max_tokens, stop_sequences, temperature, top_p
|
|
239
240
|
)
|
|
240
241
|
|
|
241
242
|
for i, single_messages in enumerate(messages):
|
|
@@ -295,6 +296,7 @@ class BaseVLLMModel(BaseLLM):
|
|
|
295
296
|
max_tokens: int | None,
|
|
296
297
|
stop_sequences: list[str] | None,
|
|
297
298
|
temperature: float | None,
|
|
299
|
+
top_p: float | None = None,
|
|
298
300
|
) -> SamplingParams:
|
|
299
301
|
sampling_params.max_tokens = max_tokens
|
|
300
302
|
sampling_params.stop = stop_sequences
|
|
@@ -308,6 +310,13 @@ class BaseVLLMModel(BaseLLM):
|
|
|
308
310
|
f"Using sampling params temperature value: {sampling_params.temperature} "
|
|
309
311
|
f"as no custom temperature value was provided"
|
|
310
312
|
)
|
|
313
|
+
if top_p is not None:
|
|
314
|
+
logger.warning(f"Overriding sampling params top_p {sampling_params.top_p} with custom value {top_p}")
|
|
315
|
+
sampling_params.top_p = top_p
|
|
316
|
+
else:
|
|
317
|
+
logger.info(
|
|
318
|
+
f"Using sampling params top_p value: {sampling_params.top_p} as no custom top_p value was provided"
|
|
319
|
+
)
|
|
311
320
|
return sampling_params
|
|
312
321
|
|
|
313
322
|
def _model_generate(
|
|
@@ -394,6 +394,25 @@ class AIME2025(AIME2024):
|
|
|
394
394
|
return item["answer"]
|
|
395
395
|
|
|
396
396
|
|
|
397
|
+
class AIME2026(AIME2024):
|
|
398
|
+
"""AIME 2026 dataset: https://huggingface.co/datasets/math-ai/aime26
|
|
399
|
+
|
|
400
|
+
This dataset contains a single test split of 30 questions.
|
|
401
|
+
Data contains
|
|
402
|
+
problem | answer | id
|
|
403
|
+
|
|
404
|
+
pass@1 evaluation
|
|
405
|
+
"""
|
|
406
|
+
|
|
407
|
+
NAME = "AIME2026"
|
|
408
|
+
DATASET_PATH = "math-ai/aime26"
|
|
409
|
+
SAMPLE_SPLIT = "test"
|
|
410
|
+
FEWSHOT_SPLIT = "test"
|
|
411
|
+
|
|
412
|
+
def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
|
|
413
|
+
return item["answer"]
|
|
414
|
+
|
|
415
|
+
|
|
397
416
|
class MATH500(MATHReasoning):
|
|
398
417
|
"""MATH500 dataset: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
|
|
399
418
|
|
|
@@ -105,9 +105,7 @@ class MBPP(BaseTask[str]):
|
|
|
105
105
|
def _get_context(self, item: dict[str, Any]) -> MBPPMetricContext:
|
|
106
106
|
return MBPPMetricContext(tests_code="\n".join(item["test_list"]))
|
|
107
107
|
|
|
108
|
-
def post_process_generated_completion(self, completion_text: str, sample: Sample
|
|
109
|
-
assert sample is not None
|
|
110
|
-
|
|
108
|
+
def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str: # type: ignore[override]
|
|
111
109
|
if BEGIN in completion_text:
|
|
112
110
|
completion_text = completion_text.split(f"{BEGIN}\n")[1]
|
|
113
111
|
|
|
@@ -193,9 +191,7 @@ class MBPP_PROMPT_WITHOUT_TESTS(MBPP):
|
|
|
193
191
|
return postfix
|
|
194
192
|
return f"{postfix.strip()}:"
|
|
195
193
|
|
|
196
|
-
def post_process_generated_completion(self, completion_text: str, sample: Sample
|
|
197
|
-
assert sample is not None
|
|
198
|
-
|
|
194
|
+
def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str: # type: ignore[override]
|
|
199
195
|
if BEGIN in completion_text:
|
|
200
196
|
completion_text = completion_text.split(BEGIN)[1]
|
|
201
197
|
|
|
@@ -212,3 +208,102 @@ class MBPP_PROMPT_WITHOUT_TESTS(MBPP):
|
|
|
212
208
|
class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
    # Inherits everything from MBPP_PROMPT_WITHOUT_TESTS; only the registered
    # task name and the subject list (just "sanitized") are overridden here.
    NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
    SUBJECTS = ["sanitized"]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
_OLMES_FEWSHOT_EXAMPLES: list[dict[str, Any]] = [
|
|
214
|
+
{
|
|
215
|
+
"text": "Write a function to find the similar elements from the given two tuple lists.",
|
|
216
|
+
"code": (
|
|
217
|
+
"def similar_elements(test_tup1, test_tup2):\n"
|
|
218
|
+
" res = tuple(set(test_tup1) & set(test_tup2))\n return (res)"
|
|
219
|
+
),
|
|
220
|
+
"test_list": [
|
|
221
|
+
"assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
|
|
222
|
+
"assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
|
|
223
|
+
"assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
|
|
224
|
+
],
|
|
225
|
+
},
|
|
226
|
+
{
|
|
227
|
+
"text": "Write a python function to identify non-prime numbers.",
|
|
228
|
+
"code": (
|
|
229
|
+
"import math\ndef is_not_prime(n):\n result = False\n"
|
|
230
|
+
" for i in range(2,int(math.sqrt(n)) + 1):\n"
|
|
231
|
+
" if n % i == 0:\n result = True\n return result"
|
|
232
|
+
),
|
|
233
|
+
"test_list": [
|
|
234
|
+
"assert is_not_prime(2) == False",
|
|
235
|
+
"assert is_not_prime(10) == True",
|
|
236
|
+
"assert is_not_prime(35) == True",
|
|
237
|
+
],
|
|
238
|
+
},
|
|
239
|
+
{
|
|
240
|
+
"text": (
|
|
241
|
+
"Write a function to find the largest integers from a given list of numbers using heap queue algorithm."
|
|
242
|
+
),
|
|
243
|
+
"code": (
|
|
244
|
+
"import heapq as hq\ndef heap_queue_largest(nums,n):\n"
|
|
245
|
+
" largest_nums = hq.nlargest(n, nums)\n return largest_nums"
|
|
246
|
+
),
|
|
247
|
+
"test_list": [
|
|
248
|
+
"assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
|
|
249
|
+
"assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
|
|
250
|
+
"assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
|
|
251
|
+
],
|
|
252
|
+
},
|
|
253
|
+
]
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
class MBPP_OLMES(MBPP):
    """
    OLMES flavour of MBPP, mirroring oe_eval's ``mbpp:3shot::olmo3:n32:v2``.

    Prompts follow the EvalPlus layout and are always preceded by the same
    three hardcoded few-shot demonstrations, taken from the original MBPP
    "prompt" split in oe_eval's order. Only the first test case of a problem
    is shown in its prompt, rather than the whole test list.

    Recommended EvalConfig settings for full replication::

        split: test
        num_fewshot: 3 (hardcoded, prompt split)
        metric: pass_at_1
        temperature: 0.6
        top_p: 0.6
        repeats: 32
    """

    NAME = "MBPP_OLMES"
    FEWSHOT_SPLIT = "test"

    def __init__(self, num_fewshot: int = 3) -> None:
        """Set up the task; any ``num_fewshot`` other than 3 is rejected."""
        super().__init__(num_fewshot)
        assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples"
        # Generation is cut at the first of these markers; the same list is
        # re-applied to the raw completion during post-processing.
        self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the instruction: task description plus its first test, fenced."""
        # Records expose the description under "text" or, failing that, "prompt".
        if "text" in item:
            description = item["text"]
        else:
            description = item["prompt"]
        first_test = item["test_list"][0]
        preamble = (
            "Please provide a self-contained Python script that solves the following problem"
            " in a markdown code block:"
        )
        return f"{preamble}\n```\n{description.strip()}\n{first_test}\n```\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed answer lead-in that opens a ``python`` code fence."""
        cue = "Here is the completed function:\n\n```python\n"
        return cue

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the example's reference code followed by a newline."""
        solution = item["code"]
        return solution + "\n"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return a fresh list holding the three hardcoded examples; ``item`` is unused."""
        return _OLMES_FEWSHOT_EXAMPLES.copy()

    def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str:  # type: ignore[override]
        """Trim the completion at the stop sequences and hand it to ``_code_expander``.

        The text is cut before the first occurrence of each stop sequence in
        turn, a trailing newline is appended, and the result is passed to
        ``self._code_expander`` together with the stringified ground truth.
        """
        assert self.stop_sequences is not None

        truncated = completion_text
        for marker in self.stop_sequences:
            # partition() leaves the text untouched when the marker is absent.
            truncated, _, _ = truncated.partition(marker)

        return self._code_expander(truncated + "\n", str(sample.ground_truth))
|
|
@@ -19,6 +19,7 @@ def register_all_tasks() -> None:
|
|
|
19
19
|
"""Register all the benchmark tasks with the eval framework."""
|
|
20
20
|
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2024")
|
|
21
21
|
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025")
|
|
22
|
+
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2026")
|
|
22
23
|
register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC")
|
|
23
24
|
register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK")
|
|
24
25
|
register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_OLMES")
|
|
@@ -88,6 +89,7 @@ def register_all_tasks() -> None:
|
|
|
88
89
|
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_SANITIZED")
|
|
89
90
|
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS")
|
|
90
91
|
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED")
|
|
92
|
+
register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES")
|
|
91
93
|
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU")
|
|
92
94
|
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK")
|
|
93
95
|
register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES")
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
@@ -74,13 +74,17 @@ def generate_docs_for_task(
|
|
|
74
74
|
try:
|
|
75
75
|
num_fewshot = 1
|
|
76
76
|
task = task_class(num_fewshot=num_fewshot)
|
|
77
|
-
except
|
|
77
|
+
except (TypeError, ValueError, AssertionError):
|
|
78
78
|
try:
|
|
79
79
|
num_fewshot = 0
|
|
80
80
|
task = task_class(num_fewshot=num_fewshot)
|
|
81
|
-
except
|
|
82
|
-
|
|
83
|
-
|
|
81
|
+
except (TypeError, ValueError, AssertionError):
|
|
82
|
+
try:
|
|
83
|
+
task = task_class()
|
|
84
|
+
num_fewshot = task.num_fewshot
|
|
85
|
+
except Exception as e:
|
|
86
|
+
print(f"Failed to instantiate task {task_name}: {e}")
|
|
87
|
+
return
|
|
84
88
|
|
|
85
89
|
with open(f"{output_docs_directory}/{task_name}.md", "w") as f:
|
|
86
90
|
f.write(f"# {task_name}\n\n")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/bleu.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/chrf.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ter.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/base.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_fi.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/balancedcopa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/global_mmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/goldenswag.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gsm8k.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/lab_bench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/medqa.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmmlu.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/pawsx.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/social_iqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sphyr.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/squad.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winox.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.13 → eval_framework-0.2.14}/src/template_formatting/mistral_formatter.py
RENAMED
|
File without changes
|
|
File without changes
|