eval-framework 0.3.8__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.3.8 → eval_framework-0.5.0}/PKG-INFO +10 -14
- {eval_framework-0.3.8 → eval_framework-0.5.0}/README.md +0 -1
- {eval_framework-0.3.8 → eval_framework-0.5.0}/pyproject.toml +15 -36
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/openai.py +2 -2
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/base.py +1 -1
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_assertion.py +4 -14
- eval_framework-0.5.0/src/eval_framework/tasks/__init__.py +12 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/base.py +3 -3
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/aidanbench.py +2 -2
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores200.py +3 -3
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores_plus.py +1 -2
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -1
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/squad.py +21 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/triviaqa.py +27 -1
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/wmt.py +2 -2
- {eval_framework-0.3.8/src/eval_framework/tasks/benchmarks → eval_framework-0.5.0/src/eval_framework/tasks}/dataset_revisions.py +30 -7
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/registry.py +76 -45
- eval_framework-0.5.0/src/eval_framework/tasks/task-dataset-revisions.json +62 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/task_names.py +2 -122
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/task_style.py +64 -2
- eval_framework-0.3.8/src/eval_framework/metrics/completion/comet.py +0 -56
- eval_framework-0.3.8/src/eval_framework/tasks/__init__.py +0 -6
- eval_framework-0.3.8/src/eval_framework/tasks/benchmarks/task-dataset-revisions.json +0 -179
- {eval_framework-0.3.8 → eval_framework-0.5.0}/LICENSE +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/drop_process_results.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/llm/vllm_local_server.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/main.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/aggregators/aggregators.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/drop_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_minerva_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/minerva_math_utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/multipl_e_assertion.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/run.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/suite.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/Dockerfile_codebench +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/balancedcopa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/csqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/drop.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/global_mmlu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/goldenswag.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/lab_bench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/medqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/multipl_e.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/naturalqs_open.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/social_iqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/README.md +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.3.8 → eval_framework-0.5.0}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.3.8
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Evaluation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -212,16 +212,15 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
|
212
212
|
Classifier: Topic :: Software Development :: Libraries
|
|
213
213
|
Classifier: Typing :: Typed
|
|
214
214
|
Requires-Dist: pyyaml>=6.0.3,<7
|
|
215
|
-
Requires-Dist: xmltodict>=0.
|
|
215
|
+
Requires-Dist: xmltodict>=1.0.4,<1.1
|
|
216
216
|
Requires-Dist: pydantic>=2.13.4,<3
|
|
217
|
-
Requires-Dist: datasets>=
|
|
217
|
+
Requires-Dist: datasets>=5.0.0,<6
|
|
218
218
|
Requires-Dist: sacrebleu>=2.6.0,<3
|
|
219
|
-
Requires-Dist: pycountry>=
|
|
219
|
+
Requires-Dist: pycountry>=26.2.16,<27
|
|
220
220
|
Requires-Dist: nltk>=3.9.4,<4
|
|
221
221
|
Requires-Dist: python-dotenv>=1.2.2,<2
|
|
222
222
|
Requires-Dist: lingua-language-detector>=2.2.0,<3
|
|
223
223
|
Requires-Dist: google-crc32c>=1.8.0,<2
|
|
224
|
-
Requires-Dist: kubernetes>=31.0.0,<32
|
|
225
224
|
Requires-Dist: langdetect>=1.0.9,<2
|
|
226
225
|
Requires-Dist: spacy>=3.8.14,<4
|
|
227
226
|
Requires-Dist: jsonschema>=4.26.0,<5
|
|
@@ -232,18 +231,17 @@ Requires-Dist: llm-sandbox[docker]==0.3.39
|
|
|
232
231
|
Requires-Dist: jsonlines>=4,<5
|
|
233
232
|
Requires-Dist: lxml>=6.1.1,<7
|
|
234
233
|
Requires-Dist: python-iso639>=2026.4.20
|
|
235
|
-
Requires-Dist: wandb>=0.27.
|
|
236
|
-
Requires-Dist: boto3>=1.43.
|
|
237
|
-
Requires-Dist: numpy>=
|
|
234
|
+
Requires-Dist: wandb>=0.27.2,<1
|
|
235
|
+
Requires-Dist: boto3>=1.43.19,<2
|
|
236
|
+
Requires-Dist: numpy>=2.2.6
|
|
238
237
|
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
239
238
|
Requires-Dist: scipy>=1.17.1,<2
|
|
240
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
241
|
-
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,
|
|
240
|
+
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
|
|
242
241
|
Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
|
|
243
|
-
Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
|
|
244
242
|
Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
|
|
245
243
|
Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
|
|
246
|
-
Requires-Dist: mistral-common>=1.11.
|
|
244
|
+
Requires-Dist: mistral-common>=1.11.3,<2 ; extra == 'mistral'
|
|
247
245
|
Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
|
|
248
246
|
Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
|
|
249
247
|
Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
|
|
@@ -253,7 +251,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
|
|
|
253
251
|
Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
|
|
254
252
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
|
|
255
253
|
Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
|
|
256
|
-
Requires-Dist: accelerate>=
|
|
254
|
+
Requires-Dist: accelerate>=1.14.0,<2 ; extra == 'transformers'
|
|
257
255
|
Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
|
|
258
256
|
Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
|
|
259
257
|
Requires-Python: >=3.12, <3.13
|
|
@@ -261,7 +259,6 @@ Project-URL: repository, https://github.com/Aleph-Alpha-Research/eval-framework
|
|
|
261
259
|
Provides-Extra: accelerate
|
|
262
260
|
Provides-Extra: all
|
|
263
261
|
Provides-Extra: api
|
|
264
|
-
Provides-Extra: comet
|
|
265
262
|
Provides-Extra: determined
|
|
266
263
|
Provides-Extra: mistral
|
|
267
264
|
Provides-Extra: openai
|
|
@@ -319,7 +316,6 @@ pip install eval_framework
|
|
|
319
316
|
|
|
320
317
|
There are optional extras available to unlock specific features of the library:
|
|
321
318
|
- `api` for inference using the aleph-alpha client.
|
|
322
|
-
- `comet` for the COMET metric.
|
|
323
319
|
- `determined` for running jobs via determined.
|
|
324
320
|
- `mistral` for inference on Mistral models.
|
|
325
321
|
- `transformers` for inference using the transformers library.
|
|
@@ -47,7 +47,6 @@ pip install eval_framework
|
|
|
47
47
|
|
|
48
48
|
There are optional extras available to unlock specific features of the library:
|
|
49
49
|
- `api` for inference using the aleph-alpha client.
|
|
50
|
-
- `comet` for the COMET metric.
|
|
51
50
|
- `determined` for running jobs via determined.
|
|
52
51
|
- `mistral` for inference on Mistral models.
|
|
53
52
|
- `transformers` for inference using the transformers library.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.3.8"
|
|
3
|
+
version = "0.5.0"
|
|
4
4
|
description = "Evaluation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -19,16 +19,15 @@ classifiers = [
|
|
|
19
19
|
]
|
|
20
20
|
dependencies = [
|
|
21
21
|
"pyyaml>=6.0.3,<7",
|
|
22
|
-
"xmltodict>=0.
|
|
22
|
+
"xmltodict>=1.0.4,<1.1",
|
|
23
23
|
"pydantic>=2.13.4,<3",
|
|
24
|
-
"datasets>=
|
|
24
|
+
"datasets>=5.0.0,<6",
|
|
25
25
|
"sacrebleu>=2.6.0,<3",
|
|
26
|
-
"pycountry>=
|
|
26
|
+
"pycountry>=26.2.16,<27",
|
|
27
27
|
"nltk>=3.9.4,<4",
|
|
28
28
|
"python-dotenv>=1.2.2,<2",
|
|
29
29
|
"lingua-language-detector>=2.2.0,<3",
|
|
30
30
|
"google-crc32c>=1.8.0,<2",
|
|
31
|
-
"kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
|
|
32
31
|
"langdetect>=1.0.9,<2", # required by the original ifeval implementation
|
|
33
32
|
"spacy>=3.8.14,<4",
|
|
34
33
|
"jsonschema>=4.26.0,<5",
|
|
@@ -39,14 +38,13 @@ dependencies = [
|
|
|
39
38
|
"jsonlines>=4,<5",
|
|
40
39
|
"lxml>=6.1.1,<7",
|
|
41
40
|
"python-iso639>=2026.4.20",
|
|
42
|
-
"wandb>=0.27.
|
|
43
|
-
"boto3>=1.43.
|
|
44
|
-
"numpy>=
|
|
41
|
+
"wandb>=0.27.2,<1",
|
|
42
|
+
"boto3>=1.43.19,<2",
|
|
43
|
+
"numpy>=2.2.6",
|
|
45
44
|
# is a dependency of sympy, but not explicitly listed in the requirements.txt
|
|
46
45
|
# https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
|
|
47
46
|
"antlr4-python3-runtime==4.11.0",
|
|
48
47
|
"scipy>=1.17.1,<2", # required for the aggregation of pass@k metrics
|
|
49
|
-
|
|
50
48
|
]
|
|
51
49
|
|
|
52
50
|
[project.optional-dependencies]
|
|
@@ -64,7 +62,7 @@ openai = [
|
|
|
64
62
|
transformers = [
|
|
65
63
|
"transformers>=4.45.2,<5",
|
|
66
64
|
"torch>=2.5,<3",
|
|
67
|
-
"accelerate>=
|
|
65
|
+
"accelerate>=1.14.0,<2",
|
|
68
66
|
]
|
|
69
67
|
accelerate = ["accelerate"]
|
|
70
68
|
vllm = [
|
|
@@ -72,21 +70,17 @@ vllm = [
|
|
|
72
70
|
"torch>=2.5,<3"
|
|
73
71
|
]
|
|
74
72
|
mistral = [
|
|
75
|
-
"mistral-common>=1.11.
|
|
73
|
+
"mistral-common>=1.11.3,<2",
|
|
76
74
|
"huggingface-hub>=0.36.2,<0.37",
|
|
77
75
|
"eval_framework[vllm]",
|
|
78
76
|
]
|
|
79
|
-
# Benchmark/metric specific extras
|
|
80
|
-
comet = [
|
|
81
|
-
"unbabel-comet>=2.2.7,<3",
|
|
82
|
-
]
|
|
83
77
|
# from template-formatting
|
|
84
78
|
optional = [
|
|
85
79
|
"transformers>=4.45.2,<5",
|
|
86
80
|
"jinja2>=3.1.6,<4"
|
|
87
81
|
]
|
|
88
82
|
all = [
|
|
89
|
-
"eval_framework[determined,api,openai,transformers,accelerate,vllm,
|
|
83
|
+
"eval_framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral]"
|
|
90
84
|
]
|
|
91
85
|
|
|
92
86
|
[project.urls]
|
|
@@ -98,15 +92,15 @@ eval_framework = "eval_framework.run:run"
|
|
|
98
92
|
[dependency-groups]
|
|
99
93
|
dev = [
|
|
100
94
|
"mypy>=2.1.0,<3",
|
|
101
|
-
"pytest>=9.0
|
|
95
|
+
"pytest>=9.1.0,<10",
|
|
102
96
|
"pytest-mock>=3.15.1",
|
|
103
97
|
"pytest-xdist>=3.8.0,<4",
|
|
104
98
|
"pytest-sugar>1.1,<2",
|
|
105
99
|
"types-pyyaml>=6.0.12.20260518,<7",
|
|
106
100
|
"types-python-dateutil>=2.9.0.20260518,<3",
|
|
107
101
|
"types-requests>=2.33.0.20260518,<3",
|
|
108
|
-
"plotly>=
|
|
109
|
-
"ruff>=0.15.
|
|
102
|
+
"plotly>=6.8.0,<7",
|
|
103
|
+
"ruff>=0.15.18",
|
|
110
104
|
"pip-licenses>=5.5.5",
|
|
111
105
|
]
|
|
112
106
|
flash-attn = [
|
|
@@ -115,7 +109,7 @@ flash-attn = [
|
|
|
115
109
|
]
|
|
116
110
|
|
|
117
111
|
[build-system]
|
|
118
|
-
requires = ["uv_build>=0.11.
|
|
112
|
+
requires = ["uv_build>=0.11.22,<0.11.23"]
|
|
119
113
|
build-backend = "uv_build"
|
|
120
114
|
|
|
121
115
|
[tool.uv.build-backend]
|
|
@@ -126,22 +120,6 @@ override-dependencies = [
|
|
|
126
120
|
"requests>=2.32,<3", # fix for determined
|
|
127
121
|
]
|
|
128
122
|
|
|
129
|
-
[tool.uv.sources]
|
|
130
|
-
torch = [
|
|
131
|
-
{ index = "pytorch-default", marker = "sys_platform != 'linux'" },
|
|
132
|
-
{ index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
|
|
133
|
-
]
|
|
134
|
-
|
|
135
|
-
[[tool.uv.index]]
|
|
136
|
-
name = "pytorch-cu124"
|
|
137
|
-
url = "https://download.pytorch.org/whl/cu124"
|
|
138
|
-
explicit = true
|
|
139
|
-
|
|
140
|
-
[[tool.uv.index]]
|
|
141
|
-
name = "pytorch-default"
|
|
142
|
-
url = "https://pypi.org/simple"
|
|
143
|
-
explicit = true
|
|
144
|
-
|
|
145
123
|
[tool.uv.extra-build-dependencies]
|
|
146
124
|
# Build flash-attn with the same torch version as in the container. Details at:
|
|
147
125
|
# https://docs.astral.sh/uv/concepts/projects/config/#augmenting-build-dependencies
|
|
@@ -167,6 +145,7 @@ known-third-party = ["wandb"]
|
|
|
167
145
|
|
|
168
146
|
[tool.ruff.lint.extend-per-file-ignores]
|
|
169
147
|
"__init__.py" = ["F401"]
|
|
148
|
+
"tests/tests_eval_framework/tasks/benchmarks/test_mmlu_de.py" = ["E501"]
|
|
170
149
|
|
|
171
150
|
[tool.mypy]
|
|
172
151
|
plugins = "pydantic.mypy"
|
|
@@ -55,7 +55,7 @@ class OpenAIModel(BaseLLM):
|
|
|
55
55
|
formatter: BaseFormatter | None = None,
|
|
56
56
|
temperature: float | None = None,
|
|
57
57
|
top_p: float | None = None,
|
|
58
|
-
api_key: str | None =
|
|
58
|
+
api_key: str | None = None,
|
|
59
59
|
organization: str | None = None,
|
|
60
60
|
base_url: str | None = None,
|
|
61
61
|
bytes_per_token: float | None = None,
|
|
@@ -86,7 +86,7 @@ class OpenAIModel(BaseLLM):
|
|
|
86
86
|
self._top_p = top_p
|
|
87
87
|
|
|
88
88
|
self._client = OpenAI(
|
|
89
|
-
api_key=api_key,
|
|
89
|
+
api_key=api_key if api_key is not None else os.getenv("OPENAI_API_KEY", ""),
|
|
90
90
|
organization=organization,
|
|
91
91
|
base_url=base_url,
|
|
92
92
|
)
|
|
@@ -36,7 +36,7 @@ class BaseMetric[Response](ABC):
|
|
|
36
36
|
# macro averaging the overall computation default.
|
|
37
37
|
AGGREGATORS: list[Aggregator] = []
|
|
38
38
|
# Set by the evaluation generator before calculate(); controls how infra failures are handled.
|
|
39
|
-
fail_on_error: bool =
|
|
39
|
+
fail_on_error: bool = True
|
|
40
40
|
|
|
41
41
|
@classproperty
|
|
42
42
|
def NAMES(cls) -> list[str]:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from llm_sandbox.exceptions import SandboxTimeoutError
|
|
2
2
|
|
|
3
3
|
from eval_framework.metrics.base import BaseMetric, MetricResult
|
|
4
|
-
from eval_framework.shared.types import Completion
|
|
4
|
+
from eval_framework.shared.types import Completion
|
|
5
5
|
from eval_framework.tasks.utils import run_python_code
|
|
6
6
|
|
|
7
7
|
|
|
@@ -16,7 +16,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
|
|
|
16
16
|
code = response.completion
|
|
17
17
|
try:
|
|
18
18
|
output = run_python_code(code, image="python:3.12-slim")
|
|
19
|
-
except SandboxTimeoutError
|
|
19
|
+
except SandboxTimeoutError:
|
|
20
20
|
# The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
|
|
21
21
|
# problem.
|
|
22
22
|
import traceback
|
|
@@ -26,7 +26,7 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
|
|
|
26
26
|
metric_name=self.NAME,
|
|
27
27
|
value=0.0,
|
|
28
28
|
higher_is_better=True,
|
|
29
|
-
|
|
29
|
+
code_execution_trace=traceback.format_exc(),
|
|
30
30
|
)
|
|
31
31
|
]
|
|
32
32
|
except Exception as e:
|
|
@@ -42,22 +42,12 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
|
|
|
42
42
|
last_output = output_parts[-1]
|
|
43
43
|
|
|
44
44
|
success = last_output == "True"
|
|
45
|
-
error = (
|
|
46
|
-
None
|
|
47
|
-
if success
|
|
48
|
-
else Error(
|
|
49
|
-
error_class="CodeCompletionAssertionError",
|
|
50
|
-
message=f"Expected 'True' but got '{last_output}'",
|
|
51
|
-
traceback=output,
|
|
52
|
-
)
|
|
53
|
-
)
|
|
54
|
-
|
|
55
45
|
return [
|
|
56
46
|
MetricResult(
|
|
57
47
|
metric_name=self.NAME,
|
|
58
48
|
value=1.0 if success else 0.0,
|
|
59
49
|
higher_is_better=True,
|
|
60
|
-
error=
|
|
50
|
+
error=None,
|
|
61
51
|
code_execution_trace=output,
|
|
62
52
|
)
|
|
63
53
|
]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Register all tasks on import
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from .dataset_revisions import DatasetRevision
|
|
5
|
+
from .task_names import register_all_tasks
|
|
6
|
+
|
|
7
|
+
DatasetRevision.add_revision_file(Path(__file__).parent / "task-dataset-revisions.json")
|
|
8
|
+
|
|
9
|
+
register_all_tasks()
|
|
10
|
+
|
|
11
|
+
del register_all_tasks
|
|
12
|
+
del DatasetRevision
|
|
@@ -15,7 +15,7 @@ from huggingface_hub.errors import RevisionNotFoundError
|
|
|
15
15
|
from pydantic import BaseModel, ConfigDict
|
|
16
16
|
|
|
17
17
|
from eval_framework.shared.types import BaseMetricContext, Completion, Error, RawCompletion
|
|
18
|
-
from eval_framework.tasks.
|
|
18
|
+
from eval_framework.tasks.dataset_revisions import DatasetRevision
|
|
19
19
|
from eval_framework.tasks.utils import classproperty, raise_errors
|
|
20
20
|
from template_formatting.formatter import Message, Role
|
|
21
21
|
|
|
@@ -118,7 +118,7 @@ class BaseTask[SubjectType](ABC):
|
|
|
118
118
|
# Applied once at instance creation; not refreshed if the pin file changes mid-run.
|
|
119
119
|
if custom_hf_revision:
|
|
120
120
|
self.HF_REVISION = custom_hf_revision
|
|
121
|
-
elif self.HF_REVISION is None and (pinned :=
|
|
121
|
+
elif self.HF_REVISION is None and (pinned := DatasetRevision.pinned_revision(self.__class__.__name__)):
|
|
122
122
|
self.HF_REVISION = pinned
|
|
123
123
|
|
|
124
124
|
@classmethod
|
|
@@ -359,7 +359,7 @@ class BaseTask[SubjectType](ABC):
|
|
|
359
359
|
samples: list[Sample],
|
|
360
360
|
stop_sequences: list[str] | None = None,
|
|
361
361
|
max_tokens: int | None = None,
|
|
362
|
-
fail_on_error: bool =
|
|
362
|
+
fail_on_error: bool = True,
|
|
363
363
|
) -> list[Completion]:
|
|
364
364
|
"""
|
|
365
365
|
Generates completions for the sample.
|
{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
@@ -109,7 +109,7 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
109
109
|
stop_sequences: list[str] | None,
|
|
110
110
|
max_tokens: int | None,
|
|
111
111
|
initial_samples: list[Sample],
|
|
112
|
-
fail_on_error: bool =
|
|
112
|
+
fail_on_error: bool = True,
|
|
113
113
|
) -> tuple[list[list[Message]], list[Union["Error", None]]]:
|
|
114
114
|
initial_messages = [s.messages for s in initial_samples]
|
|
115
115
|
samples = [(s, False) for s in initial_samples] # (sample, is_done)
|
|
@@ -170,7 +170,7 @@ class AidanBenchOriginal(BaseTask[str]):
|
|
|
170
170
|
samples: list[Sample],
|
|
171
171
|
stop_sequences: list[str] | None = None,
|
|
172
172
|
max_tokens: int | None = None,
|
|
173
|
-
fail_on_error: bool =
|
|
173
|
+
fail_on_error: bool = True,
|
|
174
174
|
) -> list[Completion]:
|
|
175
175
|
assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
|
|
176
176
|
"Each sample must have exactly one USER message."
|
{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import random
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any, cast
|
|
5
5
|
|
|
6
6
|
import pycountry
|
|
7
7
|
from datasets import DatasetDict, DownloadConfig, load_dataset
|
|
@@ -100,11 +100,11 @@ class Flores200(BaseTask[str]):
|
|
|
100
100
|
|
|
101
101
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
102
102
|
source_key = item["subject"].split("-")[0]
|
|
103
|
-
source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
|
|
103
|
+
source_language = cast(Any, pycountry.languages.get(alpha_3=source_key.split("_")[0])).name
|
|
104
104
|
source = item[f"sentence_{source_key}"]
|
|
105
105
|
instruction = f"{source_language} sentence: {source}\n"
|
|
106
106
|
target_key = item["subject"].split("-")[1]
|
|
107
|
-
target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
|
|
107
|
+
target_language = cast(Any, pycountry.languages.get(alpha_3=target_key.split("_")[0])).name
|
|
108
108
|
|
|
109
109
|
return f"{instruction}{target_language} sentence:"
|
|
110
110
|
|
{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
@@ -4,7 +4,6 @@ from typing import Any
|
|
|
4
4
|
|
|
5
5
|
from eval_framework.metrics.completion.bleu import BLEU
|
|
6
6
|
from eval_framework.metrics.completion.chrf import CHRF
|
|
7
|
-
from eval_framework.metrics.completion.comet import COMET
|
|
8
7
|
from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
|
|
9
8
|
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
10
9
|
|
|
@@ -29,7 +28,7 @@ class FloresPlus(BaseTask[str]):
|
|
|
29
28
|
SAMPLE_SPLIT = "dev"
|
|
30
29
|
FEWSHOT_SPLIT = "devtest"
|
|
31
30
|
RESPONSE_TYPE = ResponseType.COMPLETION
|
|
32
|
-
METRICS = [BLEU, CHRF
|
|
31
|
+
METRICS = [BLEU, CHRF]
|
|
33
32
|
SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
|
|
34
33
|
PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
|
|
35
34
|
LANGUAGE = {
|
|
@@ -236,6 +236,27 @@ class SQUAD(SQUAD2):
|
|
|
236
236
|
return item["answers"]["text"]
|
|
237
237
|
|
|
238
238
|
|
|
239
|
+
class SQuAD2_MA(SQUAD2):
|
|
240
|
+
"""SQuAD v2 with the exact system prompt used in MA training"""
|
|
241
|
+
|
|
242
|
+
NAME = "SQuAD2_MA"
|
|
243
|
+
UNANSWERABLE_STR = "unanswerable"
|
|
244
|
+
|
|
245
|
+
METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
|
|
246
|
+
|
|
247
|
+
def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
|
|
248
|
+
return (
|
|
249
|
+
"You are a helpful assistant and will answer the user's questions carefully, "
|
|
250
|
+
"logically, accurately and well-reasoned.\n"
|
|
251
|
+
"Use the given context to answer the question faithfully. Answer only if the "
|
|
252
|
+
f"answer is present in the given context, otherwise respond with '{self.UNANSWERABLE_STR}' "
|
|
253
|
+
"if the answer is not present in the context."
|
|
254
|
+
)
|
|
255
|
+
|
|
256
|
+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
257
|
+
return f"Context:\n{item['context']}\n\nQuestion:\n{item['question']}\n"
|
|
258
|
+
|
|
259
|
+
|
|
239
260
|
class SQuAD_OLMES(SQUAD):
|
|
240
261
|
"""SQuAD variant matching OLMES implementation."""
|
|
241
262
|
|
{eval_framework-0.3.8 → eval_framework-0.5.0}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
@@ -2,7 +2,7 @@ import random
|
|
|
2
2
|
from typing import Any
|
|
3
3
|
|
|
4
4
|
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
|
|
5
|
-
from eval_framework.metrics.completion.f1 import F1
|
|
5
|
+
from eval_framework.metrics.completion.f1 import F1, F1SquadNormalized
|
|
6
6
|
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
7
7
|
|
|
8
8
|
|
|
@@ -40,3 +40,29 @@ class TRIVIAQA(BaseTask[str]):
|
|
|
40
40
|
|
|
41
41
|
def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
|
|
42
42
|
return completion_text.strip().rstrip(".")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TriviaQA_MA(TRIVIAQA):
|
|
46
|
+
"""TriviaQA with the exact system prompt used in MA training"""
|
|
47
|
+
|
|
48
|
+
NAME = "TriviaQA_MA"
|
|
49
|
+
SUBJECTS = ["rc.wikipedia"]
|
|
50
|
+
UNANSWERABLE_STR = "unanswerable"
|
|
51
|
+
|
|
52
|
+
METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
|
|
53
|
+
PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
|
|
54
|
+
|
|
55
|
+
def _get_context_text(self, item: dict[str, Any]) -> str:
|
|
56
|
+
return "\n\n".join(item["entity_pages"]["wiki_context"])
|
|
57
|
+
|
|
58
|
+
def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
|
|
59
|
+
return (
|
|
60
|
+
"You are a helpful assistant and will answer the user's questions carefully, "
|
|
61
|
+
"logically, accurately and well-reasoned.\n"
|
|
62
|
+
"Use the given context to answer the question faithfully. Answer only if the "
|
|
63
|
+
f"answer is present in the given context, otherwise respond with '{self.UNANSWERABLE_STR}' "
|
|
64
|
+
"if the answer is not present in the context."
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
68
|
+
return f"Context:\n{self._get_context_text(item)}\n\nQuestion:\n{item['question'].strip()}\n"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import random
|
|
2
2
|
from abc import ABC
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import Any, cast
|
|
4
4
|
|
|
5
5
|
import pycountry
|
|
6
6
|
import sacrebleu
|
|
@@ -38,7 +38,7 @@ class WMT(BaseTask[str], ABC):
|
|
|
38
38
|
def _code_to_language(self, code: str) -> str:
|
|
39
39
|
# key is alpha_2 or alpha_3 depending on the code length
|
|
40
40
|
key = f"alpha_{len(code)}"
|
|
41
|
-
language_tuple = pycountry.languages.get(**{key: code})
|
|
41
|
+
language_tuple = cast(Any, pycountry.languages.get(**{key: code}))
|
|
42
42
|
return language_tuple.name
|
|
43
43
|
|
|
44
44
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
@@ -25,12 +25,35 @@ def _pinned_revisions(revisions_file: Path) -> dict[str, str]:
|
|
|
25
25
|
return json.loads(revisions_file.read_text(encoding="utf-8"))
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
28
|
+
class DatasetRevision:
|
|
29
|
+
_INSTANCE: "DatasetRevision | None" = None
|
|
30
|
+
|
|
31
|
+
def __init__(self) -> None:
|
|
32
|
+
self._cache: dict[str, str] = {}
|
|
33
|
+
|
|
34
|
+
@classmethod
|
|
35
|
+
def _get_instance(cls) -> "DatasetRevision":
|
|
36
|
+
if cls._INSTANCE is None:
|
|
37
|
+
cls._INSTANCE = cls()
|
|
38
|
+
return cls._INSTANCE
|
|
39
|
+
|
|
40
|
+
@classmethod
|
|
41
|
+
def add_revision_file(cls, file_path: Path | str) -> None:
|
|
42
|
+
instance = cls._get_instance()
|
|
43
|
+
instance._append_revision_file(Path(file_path))
|
|
44
|
+
|
|
45
|
+
@classmethod
|
|
46
|
+
def pinned_revision(cls, task_class_name: str) -> str | None:
|
|
47
|
+
return cls._get_instance()._cache.get(task_class_name)
|
|
48
|
+
|
|
49
|
+
@classmethod
|
|
50
|
+
def reset(cls) -> None:
|
|
51
|
+
# for unit tests only.
|
|
52
|
+
cls._INSTANCE = None
|
|
53
|
+
|
|
54
|
+
def _append_revision_file(self, file_path: Path) -> None:
|
|
55
|
+
revisions = _pinned_revisions(file_path)
|
|
56
|
+
self._cache |= revisions
|
|
34
57
|
|
|
35
58
|
|
|
36
59
|
def _repo_sha(api: HfApi, repo_id: str, cache: dict[str, str | None]) -> str | None:
|
|
@@ -73,7 +96,7 @@ def main() -> None:
|
|
|
73
96
|
revisions = collect_dataset_revisions(registered_task_names(), HfApi())
|
|
74
97
|
REVISIONS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
|
75
98
|
REVISIONS_FILE.write_text(
|
|
76
|
-
json.dumps(dict(sorted(revisions.items())), indent=
|
|
99
|
+
json.dumps(dict(sorted(revisions.items())), indent=4, ensure_ascii=False) + "\n",
|
|
77
100
|
encoding="utf-8",
|
|
78
101
|
)
|
|
79
102
|
logger.info("Wrote %d revisions to %s", len(revisions), REVISIONS_FILE)
|