eval-framework 0.2.14__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.14 → eval_framework-0.3.1}/PKG-INFO +3 -2
- {eval_framework-0.2.14 → eval_framework-0.3.1}/pyproject.toml +4 -3
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/determined.py +1 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/eval.py +2 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/local.py +1 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/evaluation_generator.py +68 -4
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/main.py +1 -1
- eval_framework-0.3.1/src/eval_framework/metrics/aggregators/aggregators.py +139 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/base.py +5 -0
- eval_framework-0.3.1/src/eval_framework/metrics/completion/accuracy_completion.py +116 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_assertion.py +13 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +60 -7
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/drop_completion.py +18 -9
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/f1.py +41 -2
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +2 -0
- eval_framework-0.3.1/src/eval_framework/metrics/completion/multipl_e_assertion.py +206 -0
- eval_framework-0.3.1/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +98 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +13 -3
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/response_generator.py +24 -9
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/run.py +33 -1
- eval_framework-0.3.1/src/eval_framework/suite.py +387 -0
- eval_framework-0.3.1/src/eval_framework/tasks/Dockerfile_codebench +9 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/base.py +39 -7
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_de.py +2 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +77 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/csqa.py +8 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/drop.py +75 -13
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +9 -2
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +68 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +5 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +2 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +53 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/medqa.py +6 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu.py +1 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +2 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +7 -1
- eval_framework-0.3.1/src/eval_framework/tasks/benchmarks/multipl_e.py +234 -0
- eval_framework-0.3.1/src/eval_framework/tasks/benchmarks/naturalqs_open.py +103 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/piqa.py +3 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sciq.py +2 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +12 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/squad.py +39 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/tablebench.py +5 -5
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogrande.py +55 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/eval_config.py +4 -1
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/task_names.py +22 -0
- eval_framework-0.3.1/src/eval_framework/tasks/task_style.py +387 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/utils.py +93 -12
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/generate_task_docs.py +9 -4
- eval_framework-0.3.1/src/template_formatting/py.typed +0 -0
- eval_framework-0.2.14/src/eval_framework/metrics/completion/accuracy_completion.py +0 -16
- eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -51
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -100
- {eval_framework-0.2.14 → eval_framework-0.3.1}/LICENSE +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/README.md +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.14/src/eval_framework/metrics/efficiency → eval_framework-0.3.1/src/eval_framework/metrics/aggregators}/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.14/src/eval_framework/metrics/llm → eval_framework-0.3.1/src/eval_framework/metrics/efficiency}/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood → eval_framework-0.3.1/src/eval_framework/metrics/llm}/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.14/src/eval_framework/result_processors → eval_framework-0.3.1/src/eval_framework/metrics/loglikelihood}/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.14/src/eval_framework/tasks/benchmarks → eval_framework-0.3.1/src/eval_framework/result_processors}/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.14/src/template_formatting → eval_framework-0.3.1/src/eval_framework/tasks/benchmarks}/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/README.md +0 -0
- eval_framework-0.2.14/src/template_formatting/py.typed → eval_framework-0.3.1/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.14 → eval_framework-0.3.1}/src/template_formatting/mistral_formatter.py +0 -0
{eval_framework-0.2.14 → eval_framework-0.3.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.14
+Version: 0.3.1
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
@@ -228,7 +228,7 @@ Requires-Dist: jsonschema>=4.23.0,<5
 Requires-Dist: mysql-connector-python>=9.0.0,<10
 Requires-Dist: psycopg2-binary>=2.9.9,<3
 Requires-Dist: sympy>=1.13.1,<2
-Requires-Dist: llm-sandbox[docker]
+Requires-Dist: llm-sandbox[docker]==0.3.37
 Requires-Dist: jsonlines>=4,<5
 Requires-Dist: lxml>=6,<7
 Requires-Dist: python-iso639>=2025.2.18
@@ -236,6 +236,7 @@ Requires-Dist: wandb>=0.23.0,<1
 Requires-Dist: boto3>=1.40.54,<2
 Requires-Dist: numpy>=1.26.4
 Requires-Dist: antlr4-python3-runtime==4.11.0
+Requires-Dist: scipy>=1.14.0,<2
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
{eval_framework-0.2.14 → eval_framework-0.3.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.14"
+version = "0.3.1"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -35,7 +35,7 @@ dependencies = [
     "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
     "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
     "sympy>=1.13.1,<2",
-    "llm-sandbox[docker]",
+    "llm-sandbox[docker]==0.3.37",
     "jsonlines>=4,<5",
     "lxml>=6,<7",
     "python-iso639>=2025.2.18",
@@ -45,6 +45,8 @@ dependencies = [
     # is a dependency of sympy, but not explicitly listed in the requirements.txt
     # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
     "antlr4-python3-runtime==4.11.0",
+    "scipy>=1.14.0,<2", # required for the aggregation of pass@k metrics
+
 ]

 [project.optional-dependencies]
@@ -105,7 +107,6 @@ dev = [
     "types-requests>=2.32.0.20250328,<3",
     "plotly>=5.24.1,<6",
     "ruff>=0.12.8",
-    "scipy>=1.14.0,<2", # for tests comparing our Hungarian implementation to scipy
 ]
 flash-attn = [
     "flash-attn>=2.7.2.post1,<2.8",
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/determined.py

@@ -148,6 +148,7 @@ class DeterminedContext(EvalContext):
             wandb_project=self.hparams.wandb_project or self.wandb_project,
             wandb_entity=self.hparams.wandb_entity or self.wandb_entity,
             wandb_run_id=self.hparams.wandb_run_id or self.wandb_run_id,
+            wandb_group=self.wandb_group,
             wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
             batch_size=self.hparams.task_args.batch_size or self.batch_size,
             description=self.hparams.description or self.description,
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/eval.py

@@ -61,6 +61,7 @@ class EvalContext(AbstractContextManager):
         wandb_project: str | None = None,
         wandb_entity: str | None = None,
         wandb_run_id: str | None = None,
+        wandb_group: str | None = None,
         wandb_upload_results: bool | None = None,
         hf_upload_dir: str | None = None,
         hf_upload_repo: str | None = None,
@@ -89,6 +90,7 @@ class EvalContext(AbstractContextManager):
         self.wandb_project = wandb_project
         self.wandb_entity = wandb_entity
         self.wandb_run_id = wandb_run_id
+        self.wandb_group = wandb_group
         self.wandb_upload_results = wandb_upload_results
         self.hf_upload_dir = hf_upload_dir
         self.hf_upload_repo = hf_upload_repo
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/context/local.py

@@ -58,6 +58,7 @@ class LocalContext(EvalContext):
             wandb_entity=self.wandb_entity,
             wandb_project=self.wandb_project,
             wandb_run_id=self.wandb_run_id,
+            wandb_group=self.wandb_group,
             wandb_upload_results=self.wandb_upload_results,
             llm_judge_class=self.llm_judge_class,
             judge_model_args=self.judge_model_args,
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/evaluation_generator.py

@@ -37,10 +37,17 @@ class EvaluationGenerator:
         self.save_intermediate_results = config.save_intermediate_results

         task_class = get_task(config.task_name)
-        if task_class
-
-
-
+        if hasattr(task_class, "TASK_STYLER"):
+            response_type = task_class.TASK_STYLER.response_type
+            task_metrics = list(task_class.TASK_STYLER.metrics)
+        else:
+            response_type = task_class.RESPONSE_TYPE
+            task_metrics = task_class.METRICS
+
+        if response_type == ResponseType.COMPLETION:
+            self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
+        elif response_type == ResponseType.LOGLIKELIHOODS:
+            self.metrics = task_metrics + [BytesLoglikelihood, SequencePositionsLoglikelihood]
         else:
             raise NotImplementedError

@@ -243,6 +250,61 @@ class EvaluationGenerator:

         return aggregated_results

+    def _aggregate_results_with_aggregators(self, results: list[Result]) -> dict[str, float | None]:
+        data = pd.DataFrame([r.model_dump() for r in results])
+        if len(data) == 0:
+            return {}
+        data = data.fillna({"key": ""})
+        aggregated_results: dict[str, float | None] = {}
+        data = data.loc[data.error.isnull()]
+
+        for (metric_name, current_metric_class), metric_group in data.groupby(["metric_name", "metric_class_name"]):
+            # The reason we groupby over both metric_name and metric_class_name is because we want to aggregate
+            # results for a single metric. Two metric classes can implement the same metric name. We want to separate
+            # those cases. We cannot group over only metric_class_name because each metric class can implement
+            # multiple metrics with different names.
+            current_metric = None
+            # now loop over the self.metrics list and find the metric class that matches the current_metric_class
+            for metric_class in self.metrics:
+                if metric_class.__name__ == current_metric_class:
+                    current_metric = metric_class
+                    break
+            if current_metric is None:
+                raise ValueError(f"Metric {metric_name} not found in metrics list")
+
+            for aggregator in current_metric.AGGREGATORS:
+                aggregated_results[f"{aggregator.name} {current_metric_class}.{metric_name}"] = (
+                    aggregator(metric_group, ["prompt"])  # Compute the aggregator, grouped by the prompt...
+                    .groupby(["key", "subject"])  # ... then group by key, subject...
+                    .agg({"value": "mean"})["value"]  # ...and average scores over each key, subject group...
+                    .mean()  # ...and lastly average the scores across all groups giving equal weight to every
+                    .item()  # key, subject group.
+                )
+
+        # Loop to additionally compute per-subject/per-key breakdown metric scores, e.g. for only subject="algebra"
+        for (key, subject, metric_name, current_metric_class), ksm_group in data.groupby(
+            ["key", "subject", "metric_name", "metric_class_name"]
+        ):
+            current_metric = None
+            # now loop over the self.metrics list and find the metric class that matches the current_metric_class
+            for metric_class in self.metrics:
+                if metric_class.__name__ == current_metric_class:
+                    current_metric = metric_class
+                    break
+
+            if current_metric is None:
+                raise ValueError(f"Metric {metric_name} not found in metrics list. This should never happen.")
+
+            for aggregator in current_metric.AGGREGATORS:
+                save_string = (
+                    f"{aggregator.name} {metric_name} - {subject}"
+                    if not key
+                    else f"{aggregator.name} {metric_name} - {key} - {subject}"
+                )
+                aggregated_results[save_string] = aggregator(ksm_group, ["prompt"])["value"].mean().mean().item()
+
+        return aggregated_results
+
     def run_eval(self) -> list[Result]:
         """Runs evaluation using saved completions."""
         logger.info("Running evaluation...")
@@ -252,6 +314,8 @@ class EvaluationGenerator:

         metrics_results = self._run_metric_calculators(responses)
         aggregated_results = self._aggregate_results(metrics_results)
+        results_with_aggregators = self._aggregate_results_with_aggregators(metrics_results)
+        aggregated_results.update(results_with_aggregators)

         wandb.log(aggregated_results)
         self.result_processor.save_aggregated_results(aggregated_results)
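The aggregation added above is a two-stage average: an aggregator first collapses repeated attempts per prompt, then the per-prompt scores are macro-averaged over (key, subject) groups. A minimal pandas-only sketch of that chain, using an inline mean aggregator as a stand-in for the framework's Aggregator classes (column names follow the hunk above; the data is made up):

```python
import pandas as pd

# One row per (prompt, attempt): two prompts, two attempts each, one subject per prompt.
data = pd.DataFrame(
    {
        "prompt": ["p1", "p1", "p2", "p2"],
        "key": ["", "", "", ""],
        "subject": ["algebra", "algebra", "geometry", "geometry"],
        "value": [1.0, 0.0, 1.0, 1.0],
    }
)


def identifier_mean(df: pd.DataFrame, identifier_columns: list[str]) -> pd.DataFrame:
    # Stand-in for IdentifierMean: average value per prompt, keep the other columns.
    agg = {"value": "mean", **{c: "first" for c in df.columns if c not in identifier_columns + ["value"]}}
    return df.groupby(identifier_columns).agg(agg).reset_index()


score = (
    identifier_mean(data, ["prompt"])  # p1 -> 0.5, p2 -> 1.0
    .groupby(["key", "subject"])       # one group per (key, subject) pair; here one per subject
    .agg({"value": "mean"})["value"]   # algebra -> 0.5, geometry -> 1.0
    .mean()                            # equal weight per group
    .item()
)
print(score)  # 0.75
```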
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/main.py

@@ -66,7 +66,7 @@ def main(
     with wandb.init(
         entity=config.wandb_entity,
         project=config.wandb_project,
-        group=llm.name[:127],
+        group=(config.wandb_group or llm.name)[:127],
         job_type=config.task_name[:63],
         id=wandb_run_id,  # (potentially resuming run after preemption)
         config=response_generator._get_metadata(),
eval_framework-0.3.1/src/eval_framework/metrics/aggregators/aggregators.py (new file)

@@ -0,0 +1,139 @@
+from typing import Any, Protocol
+
+import numpy as np
+import pandas as pd
+from scipy.special import comb
+
+
+class Aggregator(Protocol):
+    """Base class for metric aggregators.
+
+    An aggregator collapses multiple evaluation rows for the same problem (i.e. prompt) into a
+    single score per problem. The input DataFrame has one row per (problem, attempt)
+    pair; the output has one row per problem with a new ``value``.
+
+    Args:
+        response_df: DataFrame where each row is one evaluation attempt. Must contain
+            a ``value`` column (the per-attempt score) and all ``identifier_columns``.
+        identifier_columns: Columns that uniquely identify a problem (e.g. ``["prompt"]``).
+            Rows sharing the same identifier are different attempts at the same problem.
+
+    Returns:
+        DataFrame with one row per unique problem and a ``value`` column holding
+        the aggregated score. All non-identifier, non-value columns are preserved
+        (typically via ``"first"``).
+
+    Example input (``identifier_columns=["prompt"]``, 3 attempts per problem):
+
+    | prompt         | value | subject |
+    |----------------|-------|---------|
+    | "What is 2+2?" | 1.0   | algebra |
+    | "What is 2+2?" | 1.0   | algebra |
+    | "What is 2+2?" | 0.0   | algebra |
+    | "Solve x^2=4"  | 0.0   | algebra |
+    | "Solve x^2=4"  | 1.0   | algebra |
+    | "Solve x^2=4"  | 0.0   | algebra |
+    """
+
+    name: str
+
+    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame: ...
+
+
+def closed_form_passatk(n: int, c: int, k: int) -> float:
+    """Closed-form pass@k estimator (see HumanEval paper).
+
+    pass@k = 1 - C(n-c, k) / C(n, k)
+
+    Given n total samples with c correct, this is the probability that at least one of k
+    randomly chosen samples is correct. The ratio C(n-c,k)/C(n,k) is the chance all k picks
+    are wrong; subtracting from 1 gives success probability. When n-c < k there aren't enough
+    wrong samples to fill k slots, so the result is trivially 1.
+    """
+    if n < k:
+        return 1.0 if c > 0 else 0.0
+    if n - c < k:
+        return 1.0
+    return 1.0 - comb(n - c, k, exact=False) / comb(n, k, exact=False)
+
+
+class PassAtK(Aggregator):
+    """Computes pass@k: the probability that at least one of k random attempts is correct.
+
+    Groups rows by ``identifier_columns``, counts correct (``c = sum(value)``) and
+    total (``n = count(value)``) attempts per problem, then applies the closed-form
+    estimator.
+
+    Expects ``value`` to be binary (0 or 1). For k=1 this is equivalent to the mean.
+
+    Example (k=2, continuing from the Aggregator docstring example):
+        "What is 2+2?": n=3, c=2, k=2 -> 1.0 (guaranteed correct pick)
+        "Solve x^2=4":  n=3, c=1, k=2 -> 0.667 (as computed by the `closed_form_passatk`)
+
+    Output:
+    | prompt         | value | subject |
+    |----------------|-------|---------|
+    | "What is 2+2?" | 1.000 | algebra |
+    | "Solve x^2=4"  | 0.667 | algebra |
+    """
+
+    def __init__(self, k: int = 1) -> None:
+        self.k = k
+        self.name = f"Pass@{k}"
+
+    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+        # agg_dict decides how each column (`agg_dict` key) will get aggregated (`agg_dict` value).
+        # For the `value` column, we compute both the sum and the count, for all other columns we simply pick the first
+        # entry (as they are identical anyway).
+        other_cols = [c for c in response_df.columns if c not in identifier_columns and c != "value"]
+        agg_dict = {"value": ["sum", "count"], **{c: "first" for c in other_cols}}
+        agg = response_df.groupby(identifier_columns).agg(agg_dict)
+        # flatten multi-index columns from value agg: ("value", "sum") / ("value", "count")
+        c = agg[("value", "sum")].values
+        n = agg[("value", "count")].values
+        scores = np.array([closed_form_passatk(n_i, c_i, self.k) for n_i, c_i in zip(n, c)])
+        out = agg.drop(columns=[("value", "sum"), ("value", "count")])
+        if isinstance(out.columns, pd.MultiIndex):
+            out.columns = out.columns.droplevel(1)
+        return out.assign(value=scores).reset_index()
+
+
+class IdentifierMean(Aggregator):
+    """Computes the arithmetic mean of ``value`` across attempts per problem.
+
+    Example (continuing from the Aggregator docstring example):
+
+        "What is 2+2?": mean(1.0, 1.0, 0.0) = 0.667
+        "Solve x^2=4":  mean(0.0, 1.0, 0.0) = 0.333
+
+    Output:
+    | prompt         | value | subject |
+    |----------------|-------|---------|
+    | "What is 2+2?" | 0.667 | algebra |
+    | "Solve x^2=4"  | 0.333 | algebra |
+    """
+
+    def __init__(self) -> None:
+        self.name = "IdentifierMean"
+
+    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+        agg_dict = {
+            "value": "mean",
+        }
+        other_cols = [c for c in response_df.columns if c not in identifier_columns and c != "value"]
+        agg_dict.update({c: "first" for c in other_cols})
+        return response_df.groupby(identifier_columns).agg(agg_dict).reset_index()
+
+
+class Identity:
+    """No-op aggregator — returns the input unchanged.
+
+    Use for metrics where each row is already a final score and no cross-attempt
+    aggregation is needed (e.g. when ``num_samples=1``).
+    """
+
+    def __init__(self) -> None:
+        self.name = "Identity"
+
+    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
+        return response_df
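As a sanity check on the docstring numbers: for the "Solve x^2=4" group, n=3 attempts with c=1 correct give pass@2 = 1 - C(2,2)/C(3,2) = 1 - 1/3 ≈ 0.667. A short usage sketch on that same example data, assuming eval-framework 0.3.1 is installed so the module above is importable:

```python
import pandas as pd

from eval_framework.metrics.aggregators.aggregators import IdentifierMean, PassAtK

df = pd.DataFrame(
    {
        "prompt": ["What is 2+2?"] * 3 + ["Solve x^2=4"] * 3,
        "value": [1.0, 1.0, 0.0, 0.0, 1.0, 0.0],
        "subject": ["algebra"] * 6,
    }
)

# Pass@2 per prompt (groupby sorts the prompts alphabetically):
print(PassAtK(k=2)(df, ["prompt"]))
#          prompt  subject     value
# 0   Solve x^2=4  algebra  0.666667
# 1  What is 2+2?  algebra  1.000000

# Plain per-prompt mean of the same attempts:
print(IdentifierMean()(df, ["prompt"]))
#          prompt     value  subject
# 0   Solve x^2=4  0.333333  algebra
# 1  What is 2+2?  0.666667  algebra
```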
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/base.py

@@ -3,6 +3,7 @@ from typing import Any

 from pydantic import BaseModel, ConfigDict

+from eval_framework.metrics.aggregators.aggregators import Aggregator
 from eval_framework.shared.types import Error


@@ -28,6 +29,10 @@ class classproperty:
 class BaseMetric[Response](ABC):
     NAME: str
     KEYS: list[str] | None = None
+    # The aggregator determines how to aggregate the results of a metric for a single
+    # sample over multiple runs (LLM calls). We default to averaging and thus making
+    # macro averaging the overall computation default.
+    AGGREGATORS: list[Aggregator] = []

     @classproperty
     def NAMES(cls) -> list[str]:
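The empty `AGGREGATORS` default keeps existing metrics on their old single-score path; a metric opts into the new multi-sample aggregation by listing aggregator instances. An illustrative, not shipped, subclass sketch, assuming the 0.3.1 package layout shown in the file list:

```python
from eval_framework.metrics.aggregators.aggregators import IdentifierMean, PassAtK
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion


class SampledAccuracy(AccuracyCompletion):
    """Illustrative subclass: same per-attempt scoring, aggregated over repeated samples."""

    NAME = "Accuracy Completion (sampled)"
    # _aggregate_results_with_aggregators() emits one aggregated entry per aggregator,
    # keyed as f"{aggregator.name} {metric_class_name}.{metric_name}",
    # e.g. "Pass@5 SampledAccuracy.Accuracy Completion (sampled)".
    AGGREGATORS = [PassAtK(k=5), IdentifierMean()]
```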
eval_framework-0.3.1/src/eval_framework/metrics/completion/accuracy_completion.py (new file)

@@ -0,0 +1,116 @@
+import re
+import string
+from typing import Any
+
+import numpy as np
+
+from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.shared.types import Completion
+
+
+class AccuracyCompletion(BaseMetric[Completion]):
+    NAME = "Accuracy Completion"
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+        ground_truths = response.ground_truth_list
+        is_correct = any(response.completion == gt for gt in ground_truths)
+        return [
+            MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
+        ]
+
+
+class AccuracyCompletionWithEvaluate(AccuracyCompletion):
+    def __init__(self, regexes_to_ignore: list[str], ignore_case: bool = False, ignore_punctuation: bool = False):
+        self.regexes_to_ignore = regexes_to_ignore
+        self.ignore_case = ignore_case
+        self.ignore_punctuation = ignore_punctuation
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+        ground_truths = response.ground_truth_list
+        model_answer = response.completion
+
+        is_correct = exact_match_hf_evaluate(
+            predictions=[model_answer] * len(ground_truths),
+            references=ground_truths,  # type: ignore[arg-type]
+            regexes_to_ignore=self.regexes_to_ignore,
+            ignore_case=self.ignore_case,
+            ignore_punctuation=self.ignore_punctuation,
+        )["exact_match"]
+        return [
+            MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
+        ]
+
+
+class AccuracyCompletionOLMES(AccuracyCompletionWithEvaluate):
+    # If we did a functools partial, code fails as there an issubclass check that
+    # doesn't work with partial. These specific regexes are taken from
+    # https://github.com/allenai/olmes/blob/main/oe_eval/tasks/oe_eval_tasks/gsm8k.py#L70
+    def __init__(self) -> None:
+        super().__init__(regexes_to_ignore=[",", "\\$", "(?s).*#### ", "\\.$"])
+
+
+# The following code is (largely) reproduced from https://github.com/allenai/olmes/blob/main/oe_eval/dependencies/hf_evaluate/exact_match.py#L25
+# Olmes released under Apache 2.0 license and so is the HF evaluate library.
+# Some cosmetic modifications have been made to fit our codebase and linting rules.
+# -------------------------------------------------------------------------------------
+
+### Code ported from Huggingface's `evaluate` library at
+### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
+### which is under the apache license.
+### Port taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/metrics.py used
+### to fix the issue: https://github.com/EleutherAI/lm-evaluation-harness/pull/2045
+
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     http://www.apache.org/licenses/LICENSE-2.0
+
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def exact_match_hf_evaluate(
+    predictions: list[str],
+    references: list[str],
+    regexes_to_ignore: list[str] | None = None,
+    ignore_case: bool = False,
+    ignore_punctuation: bool = False,
+    ignore_numbers: bool = False,
+) -> dict[str, Any]:  # type: ignore
+    if regexes_to_ignore is not None:
+        for s in regexes_to_ignore:
+            predictions = np.array([re.sub(s, "", x) for x in predictions])  # type: ignore
+            references = np.array([re.sub(s, "", x) for x in references])  # type: ignore
+    else:
+        predictions = np.asarray(predictions)  # type: ignore
+        references = np.asarray(references)  # type: ignore
+
+    if ignore_case:
+        predictions = np.char.lower(predictions)  # type: ignore
+        references = np.char.lower(references)  # type: ignore
+
+    if ignore_punctuation:
+        repl_table = string.punctuation.maketrans("", "", string.punctuation)
+        predictions = np.char.translate(predictions, table=repl_table)  # type: ignore
+        references = np.char.translate(references, table=repl_table)  # type: ignore
+
+    if ignore_numbers:
+        repl_table = string.digits.maketrans("", "", string.digits)
+        predictions = np.char.translate(predictions, table=repl_table)  # type: ignore
+        references = np.char.translate(references, table=repl_table)  # type: ignore
+
+    # NOTE: For multiple ground-truths OLMES returns the mean over their scores. The max over
+    # it would be more meaningful, but we leave it here for parity.
+    score_list = predictions == references
+
+    return {"exact_match": np.mean(score_list)}
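The OLMES regexes are easier to read with a worked example: in order they strip commas, dollar signs, everything up to the final "#### " marker, and a trailing period, and only then compare the strings. A small self-contained sketch using just `re` (the sample strings are made up):

```python
import re

regexes_to_ignore = [",", "\\$", "(?s).*#### ", "\\.$"]  # same list AccuracyCompletionOLMES passes in


def normalize(text: str) -> str:
    # The same substitutions exact_match_hf_evaluate applies before comparing.
    for pattern in regexes_to_ignore:
        text = re.sub(pattern, "", text)
    return text


prediction = "The total is $1,234.\n#### 1,234."  # GSM8K-style chain-of-thought answer
reference = "1234"

print(normalize(prediction))                          # "1234"
print(normalize(prediction) == normalize(reference))  # True
```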
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_assertion.py

@@ -12,7 +12,19 @@ class CodeCompletionAssertion(BaseMetric[Completion]):

         # this will always be a list, if return is "" this will be an empty list
         code = response.completion
-
+        try:
+            output = run_python_code(code, image="python:3.12-slim")
+        except Exception as e:
+            import traceback
+
+            return [
+                MetricResult(
+                    metric_name=self.NAME,
+                    value=0.0,
+                    higher_is_better=True,
+                    error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
+                )
+            ]

         # Split and filter out empty strings
         output_parts = [part for part in output.split() if part.strip()]
{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py

@@ -1,3 +1,4 @@
+import importlib.resources
 import traceback
 from collections.abc import Callable
 from typing import Self
@@ -5,8 +6,17 @@ from typing import Self
 from pydantic import Field

 from eval_framework.metrics.base import BaseMetric, MetricResult
-from eval_framework.shared.types import
-
+from eval_framework.shared.types import (
+    BaseMetricContext,
+    Completion,
+    Error,
+    extract_context_metric,
+)
+from eval_framework.tasks.utils import (
+    CallableSerializer,
+    ExecutionResult,
+    execute_python_code_with_tests,
+)


 class CodeExecutionBaseContext(BaseMetricContext):
@@ -65,7 +75,14 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):

     def calculate(self, response: Completion) -> list[MetricResult]:
         if response.error is not None:
-            return [
+            return [
+                MetricResult(
+                    metric_name=self.NAME,
+                    value=None,
+                    higher_is_better=True,
+                    error=response.error,
+                )
+            ]
         try:
             context = extract_context_metric(response, CodeExecutionPassAtOneContext)
             parsed_context = RealtimeCodeExectionContext.from_context(context)
@@ -76,8 +93,19 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
         try:
             c, output = self._count_correct_samples(response.completion, parsed_context)
         except Exception as e:
-            error = Error(
-
+            error = Error(
+                error_class=e.__class__.__name__,
+                message=str(e),
+                traceback=traceback.format_exc(),
+            )
+            return [
+                MetricResult(
+                    metric_name=self.NAME,
+                    value=None,
+                    higher_is_better=True,
+                    error=error,
+                )
+            ]

         pass_at_k_value = estimate_pass_at_k(n, c, self.k)
         return [
@@ -90,15 +118,40 @@ class CodeExecutionPassAtOne(BaseMetric[Completion]):
             )
         ]

+    def _count_correct_samples(self, completion: str, context: RealtimeCodeExectionContext) -> tuple[int, str]:
+        try:
+            result = execute_python_code_with_tests(
+                code=completion,
+                test_code=context.test_code,
+                package_mapping=context.package_downloads,
+                merge_code_fn=context.snippet_merge_fn,
+                image=context.run_env,
+                timeout=context.benchmark_timeout,
+                parse_output_fn=context.output_parse_fn,
+                dockerfile=None,
+            )
+        except Exception as e:
+            return (0, str(e))
+        return (1 if result.success else 0), result.output
+
+
+class CodeExecutionPassAtOneWithCodebench(CodeExecutionPassAtOne):
+    NAME = "code-execution-pass@1-codebench"
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.dockerfile = str(importlib.resources.files("eval_framework.tasks") / "Dockerfile_codebench")
+
     def _count_correct_samples(self, completion: str, context: RealtimeCodeExectionContext) -> tuple[int, str]:
         result = execute_python_code_with_tests(
             code=completion,
             test_code=context.test_code,
-            package_mapping=
+            package_mapping={},  # the docker contains everything
             merge_code_fn=context.snippet_merge_fn,
-            image=
+            image=None,  # dockerfile provided
             timeout=context.benchmark_timeout,
             parse_output_fn=context.output_parse_fn,
+            dockerfile=self.dockerfile,
         )
         return (1 if result.success else 0), result.output

{eval_framework-0.2.14 → eval_framework-0.3.1}/src/eval_framework/metrics/completion/drop_completion.py

@@ -2,7 +2,11 @@

 from eval_framework.external.drop_process_results import process_results
 from eval_framework.metrics.base import BaseMetric, MetricResult
-from eval_framework.shared.types import
+from eval_framework.shared.types import (
+    BaseMetricContext,
+    Completion,
+    extract_context_metric,
+)


 class DropMetricContext(BaseMetricContext):
@@ -20,10 +24,13 @@ class DropF1ExactMatch(BaseMetric[Completion]):
     def calculate(self, response: Completion) -> list[MetricResult]:
         if response.error is not None:
             return [
-                MetricResult(metric_name=f"{self.NAME}/f1", value=None, higher_is_better=True, error=response.error),
                 MetricResult(
-                    metric_name=
-
+                    metric_name=name,
+                    value=None,
+                    higher_is_better=True,
+                    error=response.error,
+                )
+                for name in [n.strip() for n in self.NAME.split("/")]
             ]

         context = extract_context_metric(response, DropMetricContext)
@@ -36,12 +43,14 @@ class DropF1ExactMatch(BaseMetric[Completion]):
             pred_spans = [raw]

         doc = {"answers": answer_tuples}
-
-        out = process_results(doc, results)
+        out = process_results(doc, pred_spans)

         return [
-            MetricResult(metric_name="DROP F1", value=out["f1"], higher_is_better=True, error=response.error),
             MetricResult(
-                metric_name=
-
+                metric_name=name,
+                value=out[key],
+                higher_is_better=True,
+                error=response.error,
+            )
+            for name, key in zip([n.strip() for n in self.NAME.split("/")], self.KEYS)
         ]