eval-framework 0.5.0__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.5.0 → eval_framework-0.5.1}/PKG-INFO +2 -2
- {eval_framework-0.5.0 → eval_framework-0.5.1}/pyproject.toml +4 -4
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/evaluation_generator.py +5 -9
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/response_generator.py +8 -14
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gsm8k.py +30 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py +34 -40
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/squad.py +5 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/triviaqa.py +5 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/eval_config.py +2 -3
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/registry.py +102 -18
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_names.py +2 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/generate_task_docs.py +6 -12
- {eval_framework-0.5.0 → eval_framework-0.5.1}/LICENSE +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/README.md +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/llm/vllm_local_server.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/main.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/run.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/dataset_revisions.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task-dataset-revisions.json +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_style.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/README.md +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.5.0 → eval_framework-0.5.1}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: Evaluation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -235,7 +235,7 @@ Requires-Dist: wandb>=0.27.2,<1
|
|
|
235
235
|
Requires-Dist: boto3>=1.43.19,<2
|
|
236
236
|
Requires-Dist: numpy>=2.2.6
|
|
237
237
|
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
238
|
-
Requires-Dist: scipy>=1.
|
|
238
|
+
Requires-Dist: scipy>=1.18.0,<2
|
|
239
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
240
240
|
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
|
|
241
241
|
Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.5.0"
|
|
3
|
+
version = "0.5.1"
|
|
4
4
|
description = "Evaluation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -44,7 +44,7 @@ dependencies = [
|
|
|
44
44
|
# is a dependency of sympy, but not explicitly listed in the requirements.txt
|
|
45
45
|
# https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
|
|
46
46
|
"antlr4-python3-runtime==4.11.0",
|
|
47
|
-
"scipy>=1.
|
|
47
|
+
"scipy>=1.18.0,<2", # required for the aggregation of pass@k metrics
|
|
48
48
|
]
|
|
49
49
|
|
|
50
50
|
[project.optional-dependencies]
|
|
@@ -104,12 +104,12 @@ dev = [
|
|
|
104
104
|
"pip-licenses>=5.5.5",
|
|
105
105
|
]
|
|
106
106
|
flash-attn = [
|
|
107
|
-
"flash-attn>=2.8.3,<2.9",
|
|
107
|
+
"flash-attn>=2.8.3.post1,<2.9",
|
|
108
108
|
"torch"
|
|
109
109
|
]
|
|
110
110
|
|
|
111
111
|
[build-system]
|
|
112
|
-
requires = ["uv_build>=0.11.
|
|
112
|
+
requires = ["uv_build>=0.11.23,<0.11.24"]
|
|
113
113
|
build-backend = "uv_build"
|
|
114
114
|
|
|
115
115
|
[tool.uv.build-backend]
|
|
@@ -18,7 +18,7 @@ from eval_framework.result_processors.base import Result, ResultProcessor
|
|
|
18
18
|
from eval_framework.shared.types import Completion, Loglikelihood
|
|
19
19
|
from eval_framework.tasks.base import ResponseType
|
|
20
20
|
from eval_framework.tasks.eval_config import EvalConfig
|
|
21
|
-
from eval_framework.tasks.registry import
|
|
21
|
+
from eval_framework.tasks.registry import registry
|
|
22
22
|
from eval_framework.utils.constants import RED, RESET
|
|
23
23
|
from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
|
|
24
24
|
|
|
@@ -36,13 +36,9 @@ class EvaluationGenerator:
|
|
|
36
36
|
self.result_processor = result_processor
|
|
37
37
|
self.save_intermediate_results = config.save_intermediate_results
|
|
38
38
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
task_metrics = list(task_class.TASK_STYLER.metrics)
|
|
43
|
-
else:
|
|
44
|
-
response_type = task_class.RESPONSE_TYPE
|
|
45
|
-
task_metrics = task_class.METRICS
|
|
39
|
+
eval_ = registry()[config.task_name]
|
|
40
|
+
response_type = eval_.response_type()
|
|
41
|
+
task_metrics = eval_.metrics()
|
|
46
42
|
|
|
47
43
|
if response_type == ResponseType.COMPLETION:
|
|
48
44
|
self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
|
|
@@ -51,7 +47,7 @@ class EvaluationGenerator:
|
|
|
51
47
|
else:
|
|
52
48
|
raise NotImplementedError
|
|
53
49
|
|
|
54
|
-
self.task_name = task_class.NAME
|
|
50
|
+
self.task_name = eval_.task_class().NAME
|
|
55
51
|
|
|
56
52
|
def _run_metric_calculators(self, responses: list[Completion | Loglikelihood]) -> list[Result]:
|
|
57
53
|
results: list[Result] = self.result_processor.load_metrics_results()
|
|
@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
|
|
|
5
5
|
from datetime import UTC, datetime
|
|
6
6
|
from functools import partial
|
|
7
7
|
|
|
8
|
-
from eval_framework.tasks.registry import get_task
|
|
8
|
+
from eval_framework.tasks.registry import registry
|
|
9
9
|
|
|
10
10
|
try:
|
|
11
11
|
from determined._info import get_cluster_info
|
|
@@ -28,7 +28,6 @@ from eval_framework.shared.types import (
|
|
|
28
28
|
)
|
|
29
29
|
from eval_framework.tasks.base import Language, ResponseType, Sample
|
|
30
30
|
from eval_framework.tasks.eval_config import EvalConfig
|
|
31
|
-
from eval_framework.tasks.perturbation import create_perturbation_class
|
|
32
31
|
from eval_framework.tasks.utils import raise_errors
|
|
33
32
|
from eval_framework.utils.constants import RED, RESET
|
|
34
33
|
from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
|
|
@@ -54,7 +53,6 @@ def map_language_to_value(
|
|
|
54
53
|
|
|
55
54
|
class ResponseGenerator:
|
|
56
55
|
def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFileProcessor) -> None:
|
|
57
|
-
self.few_shot = config.num_fewshot
|
|
58
56
|
self.task_name = config.task_name
|
|
59
57
|
self.llm = llm
|
|
60
58
|
self.config = config
|
|
@@ -62,20 +60,16 @@ class ResponseGenerator:
|
|
|
62
60
|
self.num_samples = config.num_samples
|
|
63
61
|
self.save_intermediate_results = config.save_intermediate_results
|
|
64
62
|
|
|
65
|
-
task_class = get_task(config.task_name)
|
|
66
|
-
|
|
67
63
|
if config.perturbation_config is not None:
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
64
|
+
self.task = registry()[config.task_name].create_perturbation(
|
|
65
|
+
config.perturbation_config,
|
|
66
|
+
config.num_fewshot,
|
|
67
|
+
config.task_subjects,
|
|
68
|
+
config.hf_revision,
|
|
73
69
|
)
|
|
74
70
|
else:
|
|
75
|
-
self.task =
|
|
76
|
-
|
|
77
|
-
custom_subjects=self.config.task_subjects,
|
|
78
|
-
custom_hf_revision=self.config.hf_revision,
|
|
71
|
+
self.task = registry()[config.task_name].create(
|
|
72
|
+
config.num_fewshot, config.task_subjects, config.hf_revision
|
|
79
73
|
)
|
|
80
74
|
|
|
81
75
|
self.response_type = self.task.get_response_type()
|
|
@@ -4,6 +4,7 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion, AccuracyCompletionOLMES
|
|
6
6
|
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
7
|
+
from eval_framework.tasks.task_style import BPBStyle
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
@@ -215,3 +216,32 @@ class GSM8K_OLMES(GSM8K):
|
|
|
215
216
|
|
|
216
217
|
def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
|
|
217
218
|
return self._clean_short_answer(completion_text)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class GSM8KBPB(GSM8K_OLMES):
|
|
222
|
+
NAME = "GSM8KBPB"
|
|
223
|
+
TASK_STYLER = BPBStyle(cue_text="Answer:", leading_space_continuations=False)
|
|
224
|
+
|
|
225
|
+
# BPBStyle already adds "Answer:" as that separate assistant message. But the methods we inherit
|
|
226
|
+
# still put "Answer:" at the end of the question text and leave it out of the fewshot answer.
|
|
227
|
+
# So we override them here: remove "Answer:" from the question, and add it back in front of the
|
|
228
|
+
# fewshot answer. Without this, the question ends in "Answer:Answer:" and fewshot answers have
|
|
229
|
+
# no "Answer:" label at all.
|
|
230
|
+
|
|
231
|
+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
232
|
+
return f"Question: {item['question']}\n"
|
|
233
|
+
|
|
234
|
+
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
|
|
235
|
+
return f"Answer:{self.normalize_answer_str(item)}"
|
|
236
|
+
|
|
237
|
+
def _get_raw_question(self, item: dict[str, Any]) -> str:
|
|
238
|
+
return item["question"]
|
|
239
|
+
|
|
240
|
+
def _get_choices(self, item: dict[str, Any]) -> list[str]:
|
|
241
|
+
return [self.normalize_answer_str(item)]
|
|
242
|
+
|
|
243
|
+
def _get_correct_index(self, item: dict[str, Any]) -> int:
|
|
244
|
+
return 0
|
|
245
|
+
|
|
246
|
+
def _get_ground_truth(self, item: dict[str, Any]) -> str:
|
|
247
|
+
return self._get_choices(item)[0]
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
@@ -14,8 +14,8 @@ from eval_framework.metrics.completion.minerva_math_utils import (
|
|
|
14
14
|
extract_answers,
|
|
15
15
|
normalized_gold_from_solution,
|
|
16
16
|
)
|
|
17
|
-
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
|
|
18
17
|
from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
|
|
18
|
+
from eval_framework.tasks.task_style import BPBStyle
|
|
19
19
|
|
|
20
20
|
# Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB)
|
|
21
21
|
MATH_SUBJECTS = [
|
|
@@ -612,44 +612,6 @@ class MATH500Minerva(MATHMinerva):
|
|
|
612
612
|
super().__init__(num_fewshot)
|
|
613
613
|
|
|
614
614
|
|
|
615
|
-
class MATHMinervaBPB(MATHReasoning):
|
|
616
|
-
"""
|
|
617
|
-
MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the
|
|
618
|
-
gold answer string (bits-per-byte).
|
|
619
|
-
Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
|
|
620
|
-
"""
|
|
621
|
-
|
|
622
|
-
NAME = "MATHMinervaBPB"
|
|
623
|
-
DATASET_PATH = "EleutherAI/hendrycks_math"
|
|
624
|
-
SAMPLE_SPLIT = "test"
|
|
625
|
-
FEWSHOT_SPLIT = "train"
|
|
626
|
-
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
|
627
|
-
METRICS = [BitsPerByteLoglikelihood]
|
|
628
|
-
SUBJECTS = MATH_SUBJECTS
|
|
629
|
-
LANGUAGE = Language.ENG
|
|
630
|
-
|
|
631
|
-
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
632
|
-
return "Problem:\n" + item["problem"] + "\n\n" + "Solution:"
|
|
633
|
-
|
|
634
|
-
def _get_cue_text(self, item: dict[str, Any]) -> str:
|
|
635
|
-
return ""
|
|
636
|
-
|
|
637
|
-
def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
|
|
638
|
-
normalized = self._normalized_gold_from_solution(item["solution"])
|
|
639
|
-
if normalized is None:
|
|
640
|
-
return None
|
|
641
|
-
return " " + normalized
|
|
642
|
-
|
|
643
|
-
def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
|
|
644
|
-
normalized = self._normalized_gold_from_solution(item["solution"])
|
|
645
|
-
if normalized is None:
|
|
646
|
-
return None
|
|
647
|
-
return [" " + normalized]
|
|
648
|
-
|
|
649
|
-
def _normalized_gold_from_solution(self, solution: str) -> str | None:
|
|
650
|
-
return normalized_gold_from_solution(solution)
|
|
651
|
-
|
|
652
|
-
|
|
653
615
|
class MATHLvl5(MATH):
|
|
654
616
|
NAME = "Math Lvl 5"
|
|
655
617
|
|
|
@@ -742,7 +704,7 @@ Answer:"""
|
|
|
742
704
|
|
|
743
705
|
|
|
744
706
|
_OLMES_FEWSHOTS = [
|
|
745
|
-
|
|
707
|
+
# https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
|
|
746
708
|
{
|
|
747
709
|
"problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
|
|
748
710
|
"solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
|
|
@@ -790,3 +752,35 @@ class MATHMinerva_OLMES(MATHMinerva):
|
|
|
790
752
|
|
|
791
753
|
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
|
|
792
754
|
return _OLMES_FEWSHOTS[: self.num_fewshot]
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
class MATHMinervaBPB(MATHMinerva_OLMES):
|
|
758
|
+
NAME = "MATHMinervaBPB"
|
|
759
|
+
TASK_STYLER = BPBStyle(cue_text="Solution:")
|
|
760
|
+
|
|
761
|
+
# BPBStyle already adds "Solution:" as that separate assistant message. But the methods we inherit
|
|
762
|
+
# still put "Solution:" at the end of the question text and leave it out of the fewshot answer.
|
|
763
|
+
# So we override them here: remove "Solution:" from the question, and add it back in front of the
|
|
764
|
+
# fewshot answer. Without this, the question ends in "Solution:Solution:" and fewshot answers have
|
|
765
|
+
# no "Solution:" label at all.
|
|
766
|
+
|
|
767
|
+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
768
|
+
return "Problem:\n" + item["problem"] + "\n\n"
|
|
769
|
+
|
|
770
|
+
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
|
|
771
|
+
return f"Solution: {item['solution']}"
|
|
772
|
+
|
|
773
|
+
def _get_choices(self, item: dict[str, Any]) -> list[str]:
|
|
774
|
+
answer = normalized_gold_from_solution(item["solution"])
|
|
775
|
+
template = f"\nFinal Answer: The final answer is {answer}. I hope it is correct."
|
|
776
|
+
|
|
777
|
+
return [item["solution"] + template]
|
|
778
|
+
|
|
779
|
+
def _get_correct_index(self, item: dict[str, Any]) -> int:
|
|
780
|
+
return 0
|
|
781
|
+
|
|
782
|
+
def _get_raw_question(self, item: dict[str, Any]) -> str:
|
|
783
|
+
return item["problem"]
|
|
784
|
+
|
|
785
|
+
def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
|
|
786
|
+
return self._get_choices(item)[0]
|
|
@@ -244,6 +244,11 @@ class SQuAD2_MA(SQUAD2):
|
|
|
244
244
|
|
|
245
245
|
METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
|
|
246
246
|
|
|
247
|
+
def __init__(self, num_fewshot: int = 0) -> None:
|
|
248
|
+
super().__init__(num_fewshot)
|
|
249
|
+
self.stop_sequences = []
|
|
250
|
+
self.max_tokens = None
|
|
251
|
+
|
|
247
252
|
def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
|
|
248
253
|
return (
|
|
249
254
|
"You are a helpful assistant and will answer the user's questions carefully, "
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
@@ -52,6 +52,11 @@ class TriviaQA_MA(TRIVIAQA):
|
|
|
52
52
|
METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
|
|
53
53
|
PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
|
|
54
54
|
|
|
55
|
+
def __init__(self, num_fewshot: int = 0) -> None:
|
|
56
|
+
super().__init__(num_fewshot)
|
|
57
|
+
self.stop_sequences = []
|
|
58
|
+
self.max_tokens = None
|
|
59
|
+
|
|
55
60
|
def _get_context_text(self, item: dict[str, Any]) -> str:
|
|
56
61
|
return "\n\n".join(item["entity_pages"]["wiki_context"])
|
|
57
62
|
|
|
@@ -10,7 +10,7 @@ from eval_framework.llm.base import BaseLLM
|
|
|
10
10
|
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
|
|
11
11
|
from eval_framework.tasks.base import BaseTask
|
|
12
12
|
from eval_framework.tasks.perturbation import PerturbationConfig
|
|
13
|
-
from eval_framework.tasks.registry import get_task, validate_task_name
|
|
13
|
+
from eval_framework.tasks.registry import get_task, registry, validate_task_name
|
|
14
14
|
from eval_framework.utils.constants import ROOT_DIR
|
|
15
15
|
|
|
16
16
|
# Keys that don't impact actual evaluation results and should be excluded from config dumps for hashing purposes.
|
|
@@ -115,8 +115,7 @@ class EvalConfig(BaseConfig):
|
|
|
115
115
|
|
|
116
116
|
@model_validator(mode="after")
|
|
117
117
|
def validate_llm_judge_defined(self) -> "EvalConfig":
|
|
118
|
-
|
|
119
|
-
task_metrics = task(num_fewshot=0).get_metrics()
|
|
118
|
+
task_metrics = registry()[self.task_name].metrics()
|
|
120
119
|
for metric_class in task_metrics:
|
|
121
120
|
if issubclass(metric_class, BaseLLMJudgeMetric):
|
|
122
121
|
assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."
|
|
@@ -3,15 +3,19 @@ import importlib
|
|
|
3
3
|
import re
|
|
4
4
|
from abc import ABC, abstractmethod
|
|
5
5
|
from collections.abc import Generator, Iterator, Sequence
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
7
|
|
|
8
|
-
from eval_framework.tasks.base import BaseTask
|
|
8
|
+
from eval_framework.tasks.base import BaseTask, ResponseType
|
|
9
|
+
from eval_framework.tasks.perturbation import PerturbationConfig, create_perturbation_class
|
|
9
10
|
from eval_framework.utils.packaging import is_extra_installed, validate_package_extras
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from eval_framework.metrics.base import BaseMetric
|
|
14
|
+
|
|
11
15
|
__all__ = [
|
|
12
16
|
"register_task",
|
|
13
17
|
"register_lazy_task",
|
|
14
|
-
"
|
|
18
|
+
"EvalFactory",
|
|
15
19
|
"Registry",
|
|
16
20
|
"with_registry",
|
|
17
21
|
"get_task",
|
|
@@ -22,13 +26,13 @@ __all__ = [
|
|
|
22
26
|
]
|
|
23
27
|
|
|
24
28
|
|
|
25
|
-
class
|
|
26
|
-
"""Produces a registered benchmark's
|
|
29
|
+
class EvalFactory(ABC):
|
|
30
|
+
"""Produces a registered benchmark's eval.
|
|
27
31
|
|
|
28
|
-
The registry stores one factory per
|
|
29
|
-
constructed without constructing all
|
|
32
|
+
The registry stores one factory per eval. This allows the factory to be
|
|
33
|
+
constructed without constructing all evals. Going via this ABC allows
|
|
30
34
|
the factory instances to contain state specifically relevant to the
|
|
31
|
-
|
|
35
|
+
eval, as well as supporting different strategies for instantiating it.
|
|
32
36
|
E.g. eager vs lazy loading of the required dependencies.
|
|
33
37
|
"""
|
|
34
38
|
|
|
@@ -41,11 +45,33 @@ class BenchmarkFactory(ABC):
|
|
|
41
45
|
def source_module(self) -> str:
|
|
42
46
|
"""Module the task class is defined in, resolvable without importing it."""
|
|
43
47
|
|
|
48
|
+
@abstractmethod
|
|
49
|
+
def response_type(self) -> ResponseType:
|
|
50
|
+
"""The eval's response type"""
|
|
51
|
+
|
|
52
|
+
@abstractmethod
|
|
53
|
+
def metrics(self) -> list[type["BaseMetric"]]:
|
|
54
|
+
"""The eval's metrics"""
|
|
55
|
+
|
|
56
|
+
@abstractmethod
|
|
57
|
+
def create(
|
|
58
|
+
self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None
|
|
59
|
+
) -> BaseTask: ...
|
|
60
|
+
|
|
61
|
+
@abstractmethod
|
|
62
|
+
def create_perturbation(
|
|
63
|
+
self,
|
|
64
|
+
perturbation_config: PerturbationConfig,
|
|
65
|
+
num_fewshot: int,
|
|
66
|
+
custom_subjects: list[str] | None,
|
|
67
|
+
custom_hf_revision: str | None,
|
|
68
|
+
) -> BaseTask: ...
|
|
69
|
+
|
|
44
70
|
|
|
45
|
-
class _Lazy(
|
|
71
|
+
class _Lazy(EvalFactory):
|
|
46
72
|
"""
|
|
47
|
-
Create
|
|
48
|
-
|
|
73
|
+
Create eval from qualified class path; Delays importing modules until
|
|
74
|
+
eval is constructed.
|
|
49
75
|
"""
|
|
50
76
|
|
|
51
77
|
def __init__(self, class_name: str, module: str, extras: Sequence[str] = ()) -> None:
|
|
@@ -73,8 +99,35 @@ class _Lazy(BenchmarkFactory):
|
|
|
73
99
|
self._loaded = getattr(module, self._class_name)
|
|
74
100
|
return self._loaded
|
|
75
101
|
|
|
102
|
+
def create(self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None) -> BaseTask:
|
|
103
|
+
return self.task_class().with_overwrite(
|
|
104
|
+
num_fewshot=num_fewshot, custom_subjects=custom_subjects, custom_hf_revision=custom_hf_revision
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
def create_perturbation(
|
|
108
|
+
self,
|
|
109
|
+
perturbation_config: PerturbationConfig,
|
|
110
|
+
num_fewshot: int,
|
|
111
|
+
custom_subjects: list[str] | None,
|
|
112
|
+
custom_hf_revision: str | None,
|
|
113
|
+
) -> BaseTask:
|
|
114
|
+
perturbation_task_class = create_perturbation_class(self.task_class(), perturbation_config)
|
|
115
|
+
return perturbation_task_class.with_overwrite(
|
|
116
|
+
num_fewshot=num_fewshot,
|
|
117
|
+
custom_subjects=custom_subjects,
|
|
118
|
+
custom_hf_revision=custom_hf_revision,
|
|
119
|
+
)
|
|
76
120
|
|
|
77
|
-
|
|
121
|
+
def response_type(self) -> ResponseType:
|
|
122
|
+
"""The eval's response type"""
|
|
123
|
+
return self.task_class().get_response_type()
|
|
124
|
+
|
|
125
|
+
def metrics(self) -> list[type["BaseMetric"]]:
|
|
126
|
+
"""The eval's metrics"""
|
|
127
|
+
return self.task_class().get_metrics()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class _Eager(EvalFactory):
|
|
78
131
|
"""Wraps an already-imported task class."""
|
|
79
132
|
|
|
80
133
|
def __init__(self, task: type[BaseTask]) -> None:
|
|
@@ -87,6 +140,33 @@ class _Eager(BenchmarkFactory):
|
|
|
87
140
|
def task_class(self) -> type[BaseTask]:
|
|
88
141
|
return self._task
|
|
89
142
|
|
|
143
|
+
def create(self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None) -> BaseTask:
|
|
144
|
+
return self.task_class().with_overwrite(
|
|
145
|
+
num_fewshot=num_fewshot, custom_subjects=custom_subjects, custom_hf_revision=custom_hf_revision
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
def create_perturbation(
|
|
149
|
+
self,
|
|
150
|
+
perturbation_config: PerturbationConfig,
|
|
151
|
+
num_fewshot: int,
|
|
152
|
+
custom_subjects: list[str] | None,
|
|
153
|
+
custom_hf_revision: str | None,
|
|
154
|
+
) -> BaseTask:
|
|
155
|
+
perturbation_task_class = create_perturbation_class(self.task_class(), perturbation_config)
|
|
156
|
+
return perturbation_task_class.with_overwrite(
|
|
157
|
+
num_fewshot=num_fewshot,
|
|
158
|
+
custom_subjects=custom_subjects,
|
|
159
|
+
custom_hf_revision=custom_hf_revision,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
def response_type(self) -> ResponseType:
|
|
163
|
+
"""The eval's response type"""
|
|
164
|
+
return self.task_class().get_response_type()
|
|
165
|
+
|
|
166
|
+
def metrics(self) -> list[type["BaseMetric"]]:
|
|
167
|
+
"""The eval's metrics"""
|
|
168
|
+
return self.task_class().get_metrics()
|
|
169
|
+
|
|
90
170
|
|
|
91
171
|
class Registry:
|
|
92
172
|
"""A registry for tasks with support for lazy loading.
|
|
@@ -97,7 +177,7 @@ class Registry:
|
|
|
97
177
|
|
|
98
178
|
def __init__(self) -> None:
|
|
99
179
|
# TODO: Lookup only with upper names
|
|
100
|
-
self._registry: dict[str, tuple[str,
|
|
180
|
+
self._registry: dict[str, tuple[str, EvalFactory]] = dict()
|
|
101
181
|
|
|
102
182
|
def __iter__(self) -> Iterator[str]:
|
|
103
183
|
for name, _ in self._registry.values():
|
|
@@ -116,20 +196,20 @@ class Registry:
|
|
|
116
196
|
task_key = self._task_key(name)
|
|
117
197
|
return task_key in self._registry
|
|
118
198
|
|
|
119
|
-
def __getitem__(self, name: str, /) ->
|
|
199
|
+
def __getitem__(self, name: str, /) -> EvalFactory:
|
|
120
200
|
task_key = self._task_key(name)
|
|
121
201
|
try:
|
|
122
202
|
_, factory = self._registry[task_key]
|
|
123
203
|
except KeyError:
|
|
124
|
-
raise KeyError(f"Task not found: {name}")
|
|
204
|
+
raise KeyError(f"Task not found: {name=} with task_key {task_key=}")
|
|
125
205
|
|
|
126
|
-
return factory
|
|
206
|
+
return factory
|
|
127
207
|
|
|
128
208
|
def add(self, task: type[BaseTask]) -> None:
|
|
129
209
|
task_key = self._task_key(task.NAME)
|
|
130
210
|
self._registry[task_key] = (task.NAME, _Eager(task))
|
|
131
211
|
|
|
132
|
-
def __setitem__(self, name: str, factory:
|
|
212
|
+
def __setitem__(self, name: str, factory: EvalFactory) -> None:
|
|
133
213
|
task_key = self._task_key(name)
|
|
134
214
|
if task_key in self._registry:
|
|
135
215
|
raise ValueError(f"Cannot register duplicate task with key: {task_key}")
|
|
@@ -140,6 +220,10 @@ class Registry:
|
|
|
140
220
|
_REGISTRY = Registry()
|
|
141
221
|
|
|
142
222
|
|
|
223
|
+
def registry() -> Registry:
|
|
224
|
+
return _REGISTRY
|
|
225
|
+
|
|
226
|
+
|
|
143
227
|
@contextlib.contextmanager
|
|
144
228
|
def with_registry(registry: Registry) -> Generator[None, Any, None]:
|
|
145
229
|
"""Contextmanager to change the current registry."""
|
|
@@ -183,7 +267,7 @@ def get_task(name: str, /) -> type[BaseTask]:
|
|
|
183
267
|
|
|
184
268
|
Note: This method will import any lazily registered task.
|
|
185
269
|
"""
|
|
186
|
-
return _REGISTRY[name]
|
|
270
|
+
return _REGISTRY[name].task_class()
|
|
187
271
|
|
|
188
272
|
|
|
189
273
|
def register_task(task: type[BaseTask]) -> str:
|
|
@@ -30,6 +30,8 @@ def register_all_tasks() -> None:
|
|
|
30
30
|
register_lazy_task("eval_framework.tasks.benchmarks.goldenswag.GOLDENSWAG_IDK")
|
|
31
31
|
register_lazy_task("eval_framework.tasks.benchmarks.gpqa.GPQA_OLMES")
|
|
32
32
|
register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8K_OLMES")
|
|
33
|
+
register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8KBPB")
|
|
34
|
+
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinervaBPB")
|
|
33
35
|
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.GSM8KReasoning")
|
|
34
36
|
register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG")
|
|
35
37
|
register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_OLMES")
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
|
|
7
7
|
import tqdm
|
|
8
8
|
|
|
9
|
-
from eval_framework.tasks.registry import
|
|
9
|
+
from eval_framework.tasks.registry import registered_task_names, registry
|
|
10
10
|
from eval_framework.tasks.task_loader import load_extra_tasks
|
|
11
11
|
from template_formatting.formatter import BaseFormatter, ConcatFormatter, Llama3Formatter
|
|
12
12
|
|
|
@@ -69,7 +69,8 @@ def generate_docs_for_task(
|
|
|
69
69
|
output_docs_directory: Path, task_name: str, formatters: list[BaseFormatter], add_prompt_examples: bool
|
|
70
70
|
) -> None:
|
|
71
71
|
"""Generate documentation for a specific task."""
|
|
72
|
-
|
|
72
|
+
eval_ = registry()[task_name]
|
|
73
|
+
task_class = eval_.task_class()
|
|
73
74
|
|
|
74
75
|
try:
|
|
75
76
|
num_fewshot = 1
|
|
@@ -98,16 +99,9 @@ def generate_docs_for_task(
|
|
|
98
99
|
f.write(f"SAMPLE_SPLIT = {task.SAMPLE_SPLIT}".strip() + "\n")
|
|
99
100
|
if hasattr(task, "FEWSHOT_SPLIT"):
|
|
100
101
|
f.write(f"FEWSHOT_SPLIT = {task.FEWSHOT_SPLIT}".strip() + "\n")
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
|
|
105
|
-
else:
|
|
106
|
-
if hasattr(task, "RESPONSE_TYPE"):
|
|
107
|
-
f.write(f"RESPONSE_TYPE = {task.RESPONSE_TYPE.name}".strip() + "\n")
|
|
108
|
-
if hasattr(task, "METRICS"):
|
|
109
|
-
metrics_list = [f"{m.__name__}" for m in task.METRICS]
|
|
110
|
-
f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
|
|
102
|
+
f.write(f"RESPONSE_TYPE = {eval_.response_type().name}".strip() + "\n")
|
|
103
|
+
metrics_list = [f"{m.__name__}" for m in eval_.metrics()]
|
|
104
|
+
f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
|
|
111
105
|
if hasattr(task, "SUBJECTS"):
|
|
112
106
|
f.write(f"SUBJECTS = {repr(task.SUBJECTS)}".strip() + "\n")
|
|
113
107
|
if hasattr(task, "LANGUAGE"):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/drop_process_results.py
RENAMED
|
File without changes
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/aggregators/aggregators.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|