eval-framework 0.3.8__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.8 → eval_framework-0.5.1}/PKG-INFO +11 -15
- {eval_framework-0.3.8 → eval_framework-0.5.1}/README.md +0 -1
- {eval_framework-0.3.8 → eval_framework-0.5.1}/pyproject.toml +17 -38
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/evaluation_generator.py +5 -9
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/openai.py +2 -2
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/base.py +1 -1
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_assertion.py +4 -14
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/response_generator.py +8 -14
- eval_framework-0.5.1/src/eval_framework/tasks/__init__.py +12 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/base.py +3 -3
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +2 -2
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores200.py +3 -3
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +1 -2
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +30 -1
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +34 -40
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/squad.py +26 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +32 -1
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/wmt.py +2 -2
- {eval_framework-0.3.8/src/eval_framework/tasks/benchmarks → eval_framework-0.5.1/src/eval_framework/tasks}/dataset_revisions.py +30 -7
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/eval_config.py +2 -3
- eval_framework-0.5.1/src/eval_framework/tasks/registry.py +301 -0
- eval_framework-0.5.1/src/eval_framework/tasks/task-dataset-revisions.json +62 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/task_names.py +4 -122
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/task_style.py +64 -2
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/generate_task_docs.py +6 -12
- eval_framework-0.3.8/src/eval_framework/metrics/completion/comet.py +0 -56
- eval_framework-0.3.8/src/eval_framework/tasks/__init__.py +0 -6
- eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/task-dataset-revisions.json +0 -179
- eval_framework-0.3.8/src/eval_framework/tasks/registry.py +0 -186
- {eval_framework-0.3.8 → eval_framework-0.5.1}/LICENSE +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/llm/vllm_local_server.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.1}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Evaluation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -212,16 +212,15 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
|
212
212
|
Classifier: Topic :: Software Development :: Libraries
|
|
213
213
|
Classifier: Typing :: Typed
|
|
214
214
|
Requires-Dist: pyyaml>=6.0.3,<7
|
|
215
|
-
Requires-Dist: xmltodict>=0.
|
|
215
|
+
Requires-Dist: xmltodict>=1.0.4,<1.1
|
|
216
216
|
Requires-Dist: pydantic>=2.13.4,<3
|
|
217
|
-
Requires-Dist: datasets>=
|
|
217
|
+
Requires-Dist: datasets>=5.0.0,<6
|
|
218
218
|
Requires-Dist: sacrebleu>=2.6.0,<3
|
|
219
|
-
Requires-Dist: pycountry>=
|
|
219
|
+
Requires-Dist: pycountry>=26.2.16,<27
|
|
220
220
|
Requires-Dist: nltk>=3.9.4,<4
|
|
221
221
|
Requires-Dist: python-dotenv>=1.2.2,<2
|
|
222
222
|
Requires-Dist: lingua-language-detector>=2.2.0,<3
|
|
223
223
|
Requires-Dist: google-crc32c>=1.8.0,<2
|
|
224
|
-
Requires-Dist: kubernetes>=31.0.0,<32
|
|
225
224
|
Requires-Dist: langdetect>=1.0.9,<2
|
|
226
225
|
Requires-Dist: spacy>=3.8.14,<4
|
|
227
226
|
Requires-Dist: jsonschema>=4.26.0,<5
|
|
@@ -232,18 +231,17 @@ Requires-Dist: llm-sandbox[docker]==0.3.39
|
|
|
232
231
|
Requires-Dist: jsonlines>=4,<5
|
|
233
232
|
Requires-Dist: lxml>=6.1.1,<7
|
|
234
233
|
Requires-Dist: python-iso639>=2026.4.20
|
|
235
|
-
Requires-Dist: wandb>=0.27.
|
|
236
|
-
Requires-Dist: boto3>=1.43.
|
|
237
|
-
Requires-Dist: numpy>=
|
|
234
|
+
Requires-Dist: wandb>=0.27.2,<1
|
|
235
|
+
Requires-Dist: boto3>=1.43.19,<2
|
|
236
|
+
Requires-Dist: numpy>=2.2.6
|
|
238
237
|
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
239
|
-
Requires-Dist: scipy>=1.
|
|
238
|
+
Requires-Dist: scipy>=1.18.0,<2
|
|
240
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
241
|
-
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,
|
|
240
|
+
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
|
|
242
241
|
Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
|
|
243
|
-
Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
|
|
244
242
|
Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
|
|
245
243
|
Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
|
|
246
|
-
Requires-Dist: mistral-common>=1.11.
|
|
244
|
+
Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
|
|
247
245
|
Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
|
|
248
246
|
Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
|
|
249
247
|
Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
|
|
@@ -253,7 +251,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
|
|
|
253
251
|
Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
|
|
254
252
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
|
|
255
253
|
Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
|
|
256
|
-
Requires-Dist: accelerate>=
|
|
254
|
+
Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
|
|
257
255
|
Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
|
|
258
256
|
Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
|
|
259
257
|
Requires-Python: >=3.12, <3.13
|
|
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
|
|
|
261
259
|
Provides-Extra: accelerate
|
|
262
260
|
Provides-Extra: all
|
|
263
261
|
Provides-Extra: api
|
|
264
|
-
Provides-Extra: comet
|
|
265
262
|
Provides-Extra: determined
|
|
266
263
|
Provides-Extra: mistral
|
|
267
264
|
Provides-Extra: openai
|
|
@@ -319,7 +316,6 @@ pip install eval_framework
|
|
|
319
316
|
|
|
320
317
|
There are optional extras available to unlock specific features of the library:
|
|
321
318
|
- `api` for inference using the aleph-alpha client.
|
|
322
|
-
- `comet` for the COMET metric.
|
|
323
319
|
- `determined` for running jobs via determined.
|
|
324
320
|
- `mistral` for inference on Mistral models.
|
|
325
321
|
- `transformers` for inference using the transformers library.
|
|
@@ -47,7 +47,6 @@ pip install eval_framework
|
|
|
47
47
|
|
|
48
48
|
There are optional extras available to unlock specific features of the library:
|
|
49
49
|
- `api` for inference using the aleph-alpha client.
|
|
50
|
-
- `comet` for the COMET metric.
|
|
51
50
|
- `determined` for running jobs via determined.
|
|
52
51
|
- `mistral` for inference on Mistral models.
|
|
53
52
|
- `transformers` for inference using the transformers library.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.3.8"
|
|
3
|
+
version = "0.5.1"
|
|
4
4
|
description = "Evaluation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -19,16 +19,15 @@ classifiers = [
|
|
|
19
19
|
]
|
|
20
20
|
dependencies = [
|
|
21
21
|
"pyyaml>=6.0.3,<7",
|
|
22
|
-
"xmltodict>=0.
|
|
22
|
+
"xmltodict>=1.0.4,<1.1",
|
|
23
23
|
"pydantic>=2.13.4,<3",
|
|
24
|
-
"datasets>=
|
|
24
|
+
"datasets>=5.0.0,<6",
|
|
25
25
|
"sacrebleu>=2.6.0,<3",
|
|
26
|
-
"pycountry>=
|
|
26
|
+
"pycountry>=26.2.16,<27",
|
|
27
27
|
"nltk>=3.9.4,<4",
|
|
28
28
|
"python-dotenv>=1.2.2,<2",
|
|
29
29
|
"lingua-language-detector>=2.2.0,<3",
|
|
30
30
|
"google-crc32c>=1.8.0,<2",
|
|
31
|
-
"kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
|
|
32
31
|
"langdetect>=1.0.9,<2", # required by the original ifeval implementation
|
|
33
32
|
"spacy>=3.8.14,<4",
|
|
34
33
|
"jsonschema>=4.26.0,<5",
|
|
@@ -39,14 +38,13 @@ dependencies = [
|
|
|
39
38
|
"jsonlines>=4,<5",
|
|
40
39
|
"lxml>=6.1.1,<7",
|
|
41
40
|
"python-iso639>=2026.4.20",
|
|
42
|
-
"wandb>=0.27.
|
|
43
|
-
"boto3>=1.43.
|
|
44
|
-
"numpy>=
|
|
41
|
+
"wandb>=0.27.2,<1",
|
|
42
|
+
"boto3>=1.43.19,<2",
|
|
43
|
+
"numpy>=2.2.6",
|
|
45
44
|
# is a dependency of sympy, but not explicitly listed in the requirements.txt
|
|
46
45
|
# https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
|
|
47
46
|
"antlr4-python3-runtime==4.11.0",
|
|
48
|
-
"scipy>=1.
|
|
49
|
-
|
|
47
|
+
"scipy>=1.18.0,<2", # required for the aggregation of pass@k metrics
|
|
50
48
|
]
|
|
51
49
|
|
|
52
50
|
[project.optional-dependencies]
|
|
@@ -64,7 +62,7 @@ openai = [
|
|
|
64
62
|
transformers = [
|
|
65
63
|
"transformers>=4.45.2,<5",
|
|
66
64
|
"torch>=2.5,<3",
|
|
67
|
-
"accelerate>=
|
|
65
|
+
"accelerate>=1.14.0,<2",
|
|
68
66
|
]
|
|
69
67
|
accelerate = ["accelerate"]
|
|
70
68
|
vllm = [
|
|
@@ -72,21 +70,17 @@ vllm = [
|
|
|
72
70
|
"torch>=2.5,<3"
|
|
73
71
|
]
|
|
74
72
|
mistral = [
|
|
75
|
-
"mistral-common>=1.11.
|
|
73
|
+
"mistral-common>=1.11.3,<2",
|
|
76
74
|
"huggingface-hub>=0.36.2,<0.37",
|
|
77
75
|
"eval_framework[vllm]",
|
|
78
76
|
]
|
|
79
|
-
# Benchmark/metric specific extras
|
|
80
|
-
comet = [
|
|
81
|
-
"unbabel-comet>=2.2.7,<3",
|
|
82
|
-
]
|
|
83
77
|
# from template-formatting
|
|
84
78
|
optional = [
|
|
85
79
|
"transformers>=4.45.2,<5",
|
|
86
80
|
"jinja2>=3.1.6,<4"
|
|
87
81
|
]
|
|
88
82
|
all = [
|
|
89
|
-
"eval_framework[determined,api,openai,transformers,accelerate,vllm,
|
|
83
|
+
"eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
|
|
90
84
|
]
|
|
91
85
|
|
|
92
86
|
[project.urls]
|
|
@@ -98,24 +92,24 @@ eval_framework = "eval_framework.run:run"
|
|
|
98
92
|
[dependency-groups]
|
|
99
93
|
dev = [
|
|
100
94
|
"mypy>=2.1.0,<3",
|
|
101
|
-
"pytest>=9.0
|
|
95
|
+
"pytest>=9.1.0,<10",
|
|
102
96
|
"pytest-mock>=3.15.1",
|
|
103
97
|
"pytest-xdist>=3.8.0,<4",
|
|
104
98
|
"pytest-sugar>1.1,<2",
|
|
105
99
|
"types-pyyaml>=6.0.12.20260518,<7",
|
|
106
100
|
"types-python-dateutil>=2.9.0.20260518,<3",
|
|
107
101
|
"types-requests>=2.33.0.20260518,<3",
|
|
108
|
-
"plotly>=
|
|
109
|
-
"ruff>=0.15.
|
|
102
|
+
"plotly>=6.8.0,<7",
|
|
103
|
+
"ruff>=0.15.18",
|
|
110
104
|
"pip-licenses>=5.5.5",
|
|
111
105
|
]
|
|
112
106
|
flash-attn = [
|
|
113
|
-
"flash-attn>=2.8.3,<2.9",
|
|
107
|
+
"flash-attn>=2.8.3.post1,<2.9",
|
|
114
108
|
"torch"
|
|
115
109
|
]
|
|
116
110
|
|
|
117
111
|
[build-system]
|
|
118
|
-
requires = ["uv_build>=0.11.
|
|
112
|
+
requires = ["uv_build>=0.11.23,<0.11.24"]
|
|
119
113
|
build-backend = "uv_build"
|
|
120
114
|
|
|
121
115
|
[tool.uv.build-backend]
|
|
@@ -126,22 +120,6 @@ override-dependencies = [
|
|
|
126
120
|
"requests>=2.32,<3", # fix for determined
|
|
127
121
|
]
|
|
128
122
|
|
|
129
|
-
[tool.uv.sources]
|
|
130
|
-
torch = [
|
|
131
|
-
{ index = "pytorch-default", marker = "sys_platform != 'linux'" },
|
|
132
|
-
{ index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
|
|
133
|
-
]
|
|
134
|
-
|
|
135
|
-
[[tool.uv.index]]
|
|
136
|
-
name = "pytorch-cu124"
|
|
137
|
-
url = "https://download.pytorch.org/whl/cu124"
|
|
138
|
-
explicit = true
|
|
139
|
-
|
|
140
|
-
[[tool.uv.index]]
|
|
141
|
-
name = "pytorch-default"
|
|
142
|
-
url = "https://pypi.org/simple"
|
|
143
|
-
explicit = true
|
|
144
|
-
|
|
145
123
|
[tool.uv.extra-build-dependencies]
|
|
146
124
|
# Build flash-attn with the same torch version as in the container. Details at:
|
|
147
125
|
# https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
|
|
@@ -167,6 +145,7 @@ known-third-party = ["wandb"]
|
|
|
167
145
|
|
|
168
146
|
[tool.ruff.lint.extend-per-file-ignores]
|
|
169
147
|
"__init__.py" = ["F401"]
|
|
148
|
+
"tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
|
|
170
149
|
|
|
171
150
|
[tool.mypy]
|
|
172
151
|
plugins = "pydantic.mypy"
|
|
@@ -18,7 +18,7 @@ from eval_framework.result_processors.base import Result, ResultProcessor
|
|
|
18
18
|
from eval_framework.shared.types import Completion, Loglikelihood
|
|
19
19
|
from eval_framework.tasks.base import ResponseType
|
|
20
20
|
from eval_framework.tasks.eval_config import EvalConfig
|
|
21
|
-
from eval_framework.tasks.registry import
|
|
21
|
+
from eval_framework.tasks.registry import registry
|
|
22
22
|
from eval_framework.utils.constants import RED, RESET
|
|
23
23
|
from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
|
|
24
24
|
|
|
@@ -36,13 +36,9 @@ class EvaluationGenerator:
|
|
|
36
36
|
self.result_processor = result_processor
|
|
37
37
|
self.save_intermediate_results = config.save_intermediate_results
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
task_metrics = list(task_class.TASK_STYLER.metrics)
|
|
43
|
-
else:
|
|
44
|
-
response_type = task_class.RESPONSE_TYPE
|
|
45
|
-
task_metrics = task_class.METRICS
|
|
39
|
+
eval_ = registry()[config.task_name]
|
|
40
|
+
response_type = eval_.response_type()
|
|
41
|
+
task_metrics = eval_.metrics()
|
|
46
42
|
|
|
47
43
|
if response_type == ResponseType.COMPLETION:
|
|
48
44
|
self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
|
|
@@ -51,7 +47,7 @@ class EvaluationGenerator:
|
|
|
51
47
|
else:
|
|
52
48
|
raise NotImplementedError
|
|
53
49
|
|
|
54
|
-
self.task_name = task_class.NAME
|
|
50
|
+
self.task_name = eval_.task_class().NAME
|
|
55
51
|
|
|
56
52
|
def _run_metric_calculators(self, responses: list[Completion | Loglikelihood]) -> list[Result]:
|
|
57
53
|
results: list[Result] = self.result_processor.load_metrics_results()
|
|
@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
|
|
|
55
55
|
formatter: BaseFormatter | None = None,
|
|
56
56
|
temperature: float | None = None,
|
|
57
57
|
top_p: float | None = None,
|
|
58
|
-
api_key: str | None =
|
|
58
|
+
api_key: str | None = None,
|
|
59
59
|
organization: str | None = None,
|
|
60
60
|
base_url: str | None = None,
|
|
61
61
|
bytes_per_token: float | None = None,
|
|
@@ -86,7 +86,7 @@ class OpenAIModel(BaseLLM):
|
|
|
86
86
|
self._top_p = top_p
|
|
87
87
|
|
|
88
88
|
self._client = OpenAI(
|
|
89
|
-
api_key=api_key,
|
|
89
|
+
api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
|
|
90
90
|
organization=organization,
|
|
91
91
|
base_url=base_url,
|
|
92
92
|
)
|
|
@@ -36,7 +36,7 @@ class BaseMetric[Response](ABC):
|
|
|
36
36
|
# macro averaging the overall computation default.
|
|
37
37
|
AGGREGATORS: list[Aggregator] = []
|
|
38
38
|
# Set by the evaluation generator before calculate(); controls how infra failures are handled.
|
|
39
|
-
fail_on_error: bool =
|
|
39
|
+
fail_on_error: bool = True
|
|
40
40
|
|
|
41
41
|
@classproperty
|
|
42
42
|
def NAMES(cls) -> list[str]:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from llm_sandbox.exceptions import SandboxTimeoutError
|
|
2
2
|
|
|
3
3
|
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
4
|
-
from eval_framework.shared.types import Completion
|
|
4
|
+
from eval_framework.shared.types import Completion
|
|
5
5
|
from eval_framework.tasks.utils import run_python_code
|
|
6
6
|
|
|
7
7
|
|
|
@@ -16,7 +16,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
|
|
|
16
16
|
code = response.completion
|
|
17
17
|
try:
|
|
18
18
|
output = run_python_code(code, image="python:3.12-slim")
|
|
19
|
-
except SandboxTimeoutError
|
|
19
|
+
except SandboxTimeoutError:
|
|
20
20
|
# The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
|
|
21
21
|
# problem.
|
|
22
22
|
import traceback
|
|
@@ -26,7 +26,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
|
|
|
26
26
|
metric_name=self.NAME,
|
|
27
27
|
value=0.0,
|
|
28
28
|
higher_is_better=True,
|
|
29
|
-
|
|
29
|
+
code_execution_trace=traceback.format_exc(),
|
|
30
30
|
)
|
|
31
31
|
]
|
|
32
32
|
except Exception as e:
|
|
@@ -42,22 +42,12 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
|
|
|
42
42
|
last_output = output_parts[-1]
|
|
43
43
|
|
|
44
44
|
success = last_output == "True"
|
|
45
|
-
error = (
|
|
46
|
-
None
|
|
47
|
-
if success
|
|
48
|
-
else Error(
|
|
49
|
-
error_class="CodeCompletionAssertionError",
|
|
50
|
-
message=f"Expected 'True' but got '{last_output}'",
|
|
51
|
-
traceback=output,
|
|
52
|
-
)
|
|
53
|
-
)
|
|
54
|
-
|
|
55
45
|
return [
|
|
56
46
|
MetricResult(
|
|
57
47
|
metric_name=self.NAME,
|
|
58
48
|
value=1.0 if success else 0.0,
|
|
59
49
|
higher_is_better=True,
|
|
60
|
-
error=
|
|
50
|
+
error=None,
|
|
61
51
|
code_execution_trace=output,
|
|
62
52
|
)
|
|
63
53
|
]
|
|
@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
|
|
|
5
5
|
from datetime import UTC, datetime
|
|
6
6
|
from functools import partial
|
|
7
7
|
|
|
8
|
-
from eval_framework.tasks.registry import
|
|
8
|
+
from eval_framework.tasks.registry import registry
|
|
9
9
|
|
|
10
10
|
try:
|
|
11
11
|
from determined._info import get_cluster_info
|
|
@@ -28,7 +28,6 @@ from eval_framework.shared.types import (
|
|
|
28
28
|
)
|
|
29
29
|
from eval_framework.tasks.base import Language, ResponseType, Sample
|
|
30
30
|
from eval_framework.tasks.eval_config import EvalConfig
|
|
31
|
-
from eval_framework.tasks.perturbation import create_perturbation_class
|
|
32
31
|
from eval_framework.tasks.utils import raise_errors
|
|
33
32
|
from eval_framework.utils.constants import RED, RESET
|
|
34
33
|
from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
|
|
@@ -54,7 +53,6 @@ def map_language_to_value(
|
|
|
54
53
|
|
|
55
54
|
class ResponseGenerator:
|
|
56
55
|
def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFileProcessor) -> None:
|
|
57
|
-
self.few_shot = config.num_fewshot
|
|
58
56
|
self.task_name = config.task_name
|
|
59
57
|
self.llm = llm
|
|
60
58
|
self.config = config
|
|
@@ -62,20 +60,16 @@ class ResponseGenerator:
|
|
|
62
60
|
self.num_samples = config.num_samples
|
|
63
61
|
self.save_intermediate_results = config.save_intermediate_results
|
|
64
62
|
|
|
65
|
-
task_class = get_task(config.task_name)
|
|
66
|
-
|
|
67
63
|
if config.perturbation_config is not None:
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
64
|
+
self.task = registry()[config.task_name].create_perturbation(
|
|
65
|
+
config.perturbation_config,
|
|
66
|
+
config.num_fewshot,
|
|
67
|
+
config.task_subjects,
|
|
68
|
+
config.hf_revision,
|
|
73
69
|
)
|
|
74
70
|
else:
|
|
75
|
-
self.task =
|
|
76
|
-
|
|
77
|
-
custom_subjects=self.config.task_subjects,
|
|
78
|
-
custom_hf_revision=self.config.hf_revision,
|
|
71
|
+
self.task = registry()[config.task_name].create(
|
|
72
|
+
config.num_fewshot, config.task_subjects, config.hf_revision
|
|
79
73
|
)
|
|
80
74
|
|
|
81
75
|
self.response_type = self.task.get_response_type()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Register all tasks on import
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from .dataset_revisions import DatasetRevision
|
|
5
|
+
from .task_names import register_all_tasks
|
|
6
|
+
|
|
7
|
+
DatasetRevision.add_revision_file(Path(__file__).parent / "task-dataset-revisions.json")
|
|
8
|
+
|
|
9
|
+
register_all_tasks()
|
|
10
|
+
|
|
11
|
+
del register_all_tasks
|
|
12
|
+
del DatasetRevision
|
|
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
|
|
|
15
15
|
from pydantic import BaseModel, ConfigDict
|
|
16
16
|
|
|
17
17
|
from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
|
|
18
|
-
from eval_framework.tasks.
|
|
18
|
+
from eval_framework.tasks.dataset_revisions import DatasetRevision
|
|
19
19
|
from eval_framework.tasks.utils import classproperty, raise_errors
|
|
20
20
|
from template_formatting.formatter import Message, Role
|
|
21
21
|
|
|
@@ -118,7 +118,7 @@ class BaseTask[SubjectType](ABC):
|
|
|
118
118
|
# Applied once at instance creation; not refreshed if the pin file changes mid-run.
|
|
119
119
|
if custom_hf_revision:
|
|
120
120
|
self.HF_REVISION = custom_hf_revision
|
|
121
|
-
elif self.HF_REVISION is None and (pinned :=
|
|
121
|
+
elif self.HF_REVISION is None and (pinned := DatasetRevision.pinned_revision(self.__class__.__name__)):
|
|
122
122
|
self.HF_REVISION = pinned
|
|
123
123
|
|
|
124
124
|
@classmethod
|
|
@@ -359,7 +359,7 @@ class BaseTask[SubjectType](ABC):
|
|
|
359
359
|
samples: list[Sample],
|
|
360
360
|
stop_sequences: list[str] | None = None,
|
|
361
361
|
max_tokens: int | None = None,
|
|
362
|
-
fail_on_error: bool =
|
|
362
|
+
fail_on_error: bool = True,
|
|
363
363
|
) -> list[Completion]:
|
|
364
364
|
"""
|
|
365
365
|
Generates completions for the sample.
|
{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
@@ -109,7 +109,7 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
109
109
|
stop_sequences: list[str] | None,
|
|
110
110
|
max_tokens: int | None,
|
|
111
111
|
initial_samples: list[Sample],
|
|
112
|
-
fail_on_error: bool =
|
|
112
|
+
fail_on_error: bool = True,
|
|
113
113
|
) -> tuple[list[list[Message]], list[Union["Error", None]]]:
|
|
114
114
|
initial_messages = [s.messages for s in initial_samples]
|
|
115
115
|
samples = [(s, False) for s in initial_samples] # (sample, is_done)
|
|
@@ -170,7 +170,7 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
170
170
|
samples: list[Sample],
|
|
171
171
|
stop_sequences: list[str] | None = None,
|
|
172
172
|
max_tokens: int | None = None,
|
|
173
|
-
fail_on_error: bool =
|
|
173
|
+
fail_on_error: bool = True,
|
|
174
174
|
) -> list[Completion]:
|
|
175
175
|
assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
|
|
176
176
|
"Each sample must have exactly one USER message."
|
{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import random
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any, cast
|
|
5
5
|
|
|
6
6
|
import pycountry
|
|
7
7
|
from datasets import DatasetDict, DownloadConfig, load_dataset
|
|
@@ -100,11 +100,11 @@ class Flores200(BaseTask[str]):
|
|
|
100
100
|
|
|
101
101
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
102
102
|
source_key = item["subject"].split("-")[0]
|
|
103
|
-
source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
|
|
103
|
+
source_language = cast(Any, pycountry.languages.get(alpha_3=source_key.split("_")[0])).name
|
|
104
104
|
source = item[f"sentence_{source_key}"]
|
|
105
105
|
instruction = f"{source_language} sentence: {source}\n"
|
|
106
106
|
target_key = item["subject"].split("-")[1]
|
|
107
|
-
target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
|
|
107
|
+
target_language = cast(Any, pycountry.languages.get(alpha_3=target_key.split("_")[0])).name
|
|
108
108
|
|
|
109
109
|
return f"{instruction}{target_language} sentence:"
|
|
110
110
|
|
{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
@@ -4,7 +4,6 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from eval_framework.metrics.completion.bleu import BLEU
|
|
6
6
|
from eval_framework.metrics.completion.chrf import CHRF
|
|
7
|
-
from eval_framework.metrics.completion.comet import COMET
|
|
8
7
|
from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
|
|
9
8
|
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
10
9
|
|
|
@@ -29,7 +28,7 @@ class FloresPlus(BaseTask[str]):
|
|
|
29
28
|
SAMPLE_SPLIT = "dev"
|
|
30
29
|
FEWSHOT_SPLIT = "devtest"
|
|
31
30
|
RESPONSE_TYPE = ResponseType.COMPLETION
|
|
32
|
-
METRICS = [BLEU, CHRF
|
|
31
|
+
METRICS = [BLEU, CHRF]
|
|
33
32
|
SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
|
|
34
33
|
PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
|
|
35
34
|
LANGUAGE = {
|
|
@@ -4,6 +4,7 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion, AccuracyCompletionOLMES
|
|
6
6
|
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
7
|
+
from eval_framework.tasks.task_style import BPBStyle
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
@@ -95,7 +96,6 @@ class GSM8KEvalHarness(BaseTask[str]):
|
|
|
95
96
|
|
|
96
97
|
NAME = "GSM8KEvalHarness"
|
|
97
98
|
DATASET_PATH = "openai/gsm8k"
|
|
98
|
-
HF_REVISION = "main"
|
|
99
99
|
SAMPLE_SPLIT = "test"
|
|
100
100
|
FEWSHOT_SPLIT = "train"
|
|
101
101
|
RESPONSE_TYPE = ResponseType.COMPLETION
|
|
@@ -216,3 +216,32 @@ class GSM8K_OLMES(GSM8K):
|
|
|
216
216
|
|
|
217
217
|
def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
|
|
218
218
|
return self._clean_short_answer(completion_text)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class GSM8KBPB(GSM8K_OLMES):
|
|
222
|
+
NAME = "GSM8KBPB"
|
|
223
|
+
TASK_STYLER = BPBStyle(cue_text="Answer:", leading_space_continuations=False)
|
|
224
|
+
|
|
225
|
+
# BPBStyle already adds "Answer:" as that separate assistant message. But the methods we inherit
|
|
226
|
+
# still put "Answer:" at the end of the question text and leave it out of the fewshot answer.
|
|
227
|
+
# So we override them here: remove "Answer:" from the question, and add it back in front of the
|
|
228
|
+
# fewshot answer. Without this, the question ends in "Answer:Answer:" and fewshot answers have
|
|
229
|
+
# no "Answer:" label at all.
|
|
230
|
+
|
|
231
|
+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
232
|
+
return f"Question: {item['question']}\n"
|
|
233
|
+
|
|
234
|
+
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
|
|
235
|
+
return f"Answer:{self.normalize_answer_str(item)}"
|
|
236
|
+
|
|
237
|
+
def _get_raw_question(self, item: dict[str, Any]) -> str:
|
|
238
|
+
return item["question"]
|
|
239
|
+
|
|
240
|
+
def _get_choices(self, item: dict[str, Any]) -> list[str]:
|
|
241
|
+
return [self.normalize_answer_str(item)]
|
|
242
|
+
|
|
243
|
+
def _get_correct_index(self, item: dict[str, Any]) -> int:
|
|
244
|
+
return 0
|
|
245
|
+
|
|
246
|
+
def _get_ground_truth(self, item: dict[str, Any]) -> str:
|
|
247
|
+
return self._get_choices(item)[0]
|
{eval_framework-0.3.8 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
@@ -14,8 +14,8 @@ from eval_framework.metrics.completion.minerva_math_utils import (
|
|
|
14
14
|
extract_answers,
|
|
15
15
|
normalized_gold_from_solution,
|
|
16
16
|
)
|
|
17
|
-
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
|
|
18
17
|
from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
|
|
18
|
+
from eval_framework.tasks.task_style import BPBStyle
|
|
19
19
|
|
|
20
20
|
# Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB)
|
|
21
21
|
MATH_SUBJECTS = [
|
|
@@ -612,44 +612,6 @@ class MATH500Minerva(MATHMinerva):
|
|
|
612
612
|
super().__init__(num_fewshot)
|
|
613
613
|
|
|
614
614
|
|
|
615
|
-
class MATHMinervaBPB(MATHReasoning):
|
|
616
|
-
"""
|
|
617
|
-
MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the
|
|
618
|
-
gold answer string (bits-per-byte).
|
|
619
|
-
Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
|
|
620
|
-
"""
|
|
621
|
-
|
|
622
|
-
NAME = "MATHMinervaBPB"
|
|
623
|
-
DATASET_PATH = "EleutherAI/hendrycks_math"
|
|
624
|
-
SAMPLE_SPLIT = "test"
|
|
625
|
-
FEWSHOT_SPLIT = "train"
|
|
626
|
-
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
|
627
|
-
METRICS = [BitsPerByteLoglikelihood]
|
|
628
|
-
SUBJECTS = MATH_SUBJECTS
|
|
629
|
-
LANGUAGE = Language.ENG
|
|
630
|
-
|
|
631
|
-
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
632
|
-
return "Problem:\n" + item["problem"] + "\n\n" + "Solution:"
|
|
633
|
-
|
|
634
|
-
def _get_cue_text(self, item: dict[str, Any]) -> str:
|
|
635
|
-
return ""
|
|
636
|
-
|
|
637
|
-
def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
|
|
638
|
-
normalized = self._normalized_gold_from_solution(item["solution"])
|
|
639
|
-
if normalized is None:
|
|
640
|
-
return None
|
|
641
|
-
return " " + normalized
|
|
642
|
-
|
|
643
|
-
def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
|
|
644
|
-
normalized = self._normalized_gold_from_solution(item["solution"])
|
|
645
|
-
if normalized is None:
|
|
646
|
-
return None
|
|
647
|
-
return [" " + normalized]
|
|
648
|
-
|
|
649
|
-
def _normalized_gold_from_solution(self, solution: str) -> str | None:
|
|
650
|
-
return normalized_gold_from_solution(solution)
|
|
651
|
-
|
|
652
|
-
|
|
653
615
|
class MATHLvl5(MATH):
|
|
654
616
|
NAME = "Math Lvl 5"
|
|
655
617
|
|
|
@@ -742,7 +704,7 @@ Answer:"""
|
|
|
742
704
|
|
|
743
705
|
|
|
744
706
|
_OLMES_FEWSHOTS = [
|
|
745
|
-
|
|
707
|
+
# https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
|
|
746
708
|
{
|
|
747
709
|
"problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
|
|
748
710
|
"solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
|
|
@@ -790,3 +752,35 @@ class MATHMinerva_OLMES(MATHMinerva):
|
|
|
790
752
|
|
|
791
753
|
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
|
|
792
754
|
return _OLMES_FEWSHOTS[: self.num_fewshot]
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
class MATHMinervaBPB(MATHMinerva_OLMES):
|
|
758
|
+
NAME = "MATHMinervaBPB"
|
|
759
|
+
TASK_STYLER = BPBStyle(cue_text="Solution:")
|
|
760
|
+
|
|
761
|
+
# BPBStyle already adds "Solution:" as that separate assistant message. But the methods we inherit
|
|
762
|
+
# still put "Solution:" at the end of the question text and leave it out of the fewshot answer.
|
|
763
|
+
# So we override them here: remove "Solution:" from the question, and add it back in front of the
|
|
764
|
+
# fewshot answer. Without this, the question ends in "Solution:Solution:" and fewshot answers have
|
|
765
|
+
# no "Solution:" label at all.
|
|
766
|
+
|
|
767
|
+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
768
|
+
return "Problem:\n" + item["problem"] + "\n\n"
|
|
769
|
+
|
|
770
|
+
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
|
|
771
|
+
return f"Solution: {item['solution']}"
|
|
772
|
+
|
|
773
|
+
def _get_choices(self, item: dict[str, Any]) -> list[str]:
|
|
774
|
+
answer = normalized_gold_from_solution(item["solution"])
|
|
775
|
+
template = f"\nFinal Answer: The final answer is {answer}. I hope it is correct."
|
|
776
|
+
|
|
777
|
+
return [item["solution"] + template]
|
|
778
|
+
|
|
779
|
+
def _get_correct_index(self, item: dict[str, Any]) -> int:
|
|
780
|
+
return 0
|
|
781
|
+
|
|
782
|
+
def _get_raw_question(self, item: dict[str, Any]) -> str:
|
|
783
|
+
return item["problem"]
|
|
784
|
+
|
|
785
|
+
def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
|
|
786
|
+
return self._get_choices(item)[0]
|