eval-framework 0.2.7__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.7 → eval_framework-0.2.9}/PKG-INFO +5 -4
- {eval_framework-0.2.7 → eval_framework-0.2.9}/README.md +3 -3
- {eval_framework-0.2.7 → eval_framework-0.2.9}/pyproject.toml +4 -1
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/determined.py +3 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/eval.py +2 -1
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/local.py +1 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/math_reasoning_completion.py +10 -9
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/response_generator.py +25 -6
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/run.py +8 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/math_reasoning.py +19 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/eval_config.py +3 -1
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/task_names.py +1 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/LICENSE +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.7 → eval_framework-0.2.9}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.2.7
|
|
3
|
+
Version: 0.2.9
|
|
4
4
|
Summary: Evalulation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -235,6 +235,7 @@ Requires-Dist: python-iso639>=2025.2.18
|
|
|
235
235
|
Requires-Dist: wandb>=0.23.0,<1
|
|
236
236
|
Requires-Dist: boto3>=1.40.54,<2
|
|
237
237
|
Requires-Dist: numpy>=1.26.4
|
|
238
|
+
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
238
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
239
240
|
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
|
|
240
241
|
Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
|
|
@@ -283,7 +284,7 @@ Description-Content-Type: text/markdown
|
|
|
283
284
|
[](https://aleph-alpha-research.github.io/eval-framework/)
|
|
284
285
|
[](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)
|
|
285
286
|
|
|
286
|
-

|
|
287
288
|
|
|
288
289
|
</div>
|
|
289
290
|
|
|
@@ -543,6 +544,6 @@ This project has received funding from the European Union’s Digital Europe Pro
|
|
|
543
544
|
The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
|
|
544
545
|
|
|
545
546
|
<p align="center">
|
|
546
|
-
<img src="
|
|
547
|
-
<img src="
|
|
547
|
+
<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
|
|
548
|
+
<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
|
|
548
549
|
</p>
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
[](https://aleph-alpha-research.github.io/eval-framework/)
|
|
14
14
|
[](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)
|
|
15
15
|
|
|
16
|
-

|
|
17
17
|
|
|
18
18
|
</div>
|
|
19
19
|
|
|
@@ -273,6 +273,6 @@ This project has received funding from the European Union’s Digital Europe Pro
|
|
|
273
273
|
The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
|
|
274
274
|
|
|
275
275
|
<p align="center">
|
|
276
|
-
<img src="
|
|
277
|
-
<img src="
|
|
276
|
+
<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
|
|
277
|
+
<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
|
|
278
278
|
</p>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.2.7"
|
|
3
|
+
version = "0.2.9"
|
|
4
4
|
description = "Evalulation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -42,6 +42,9 @@ dependencies = [
|
|
|
42
42
|
"wandb>=0.23.0,<1",
|
|
43
43
|
"boto3>=1.40.54,<2",
|
|
44
44
|
"numpy>=1.26.4",
|
|
45
|
+
# is a dependency of sympy, but not explicitly listed in the requirements.txt
|
|
46
|
+
# https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
|
|
47
|
+
"antlr4-python3-runtime==4.11.0",
|
|
45
48
|
]
|
|
46
49
|
|
|
47
50
|
[project.optional-dependencies]
|
|
@@ -31,6 +31,7 @@ class TaskArgs(BaseModel):
|
|
|
31
31
|
task_subjects: list[str] | None = None
|
|
32
32
|
hf_revision: str | None = None
|
|
33
33
|
perturbation_config: PerturbationConfig | None = None
|
|
34
|
+
repeats: int | None = None
|
|
34
35
|
|
|
35
36
|
|
|
36
37
|
class Hyperparameters(BaseModel):
|
|
@@ -110,6 +111,7 @@ class DeterminedContext(EvalContext):
|
|
|
110
111
|
"judge_model_name",
|
|
111
112
|
"judge_model_args",
|
|
112
113
|
"perturbation_config",
|
|
114
|
+
"repeats",
|
|
113
115
|
]:
|
|
114
116
|
val_cli = getattr(self, name, None)
|
|
115
117
|
val_hparams = getattr(self.hparams.task_args, name, None)
|
|
@@ -152,6 +154,7 @@ class DeterminedContext(EvalContext):
|
|
|
152
154
|
randomize_judge_order=self.randomize_judge_order,
|
|
153
155
|
delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
|
|
154
156
|
or self.delete_output_dir_after_upload,
|
|
157
|
+
repeats=self.hparams.task_args.repeats or self.repeats,
|
|
155
158
|
)
|
|
156
159
|
|
|
157
160
|
return self
|
|
@@ -75,6 +75,7 @@ class EvalContext(AbstractContextManager):
|
|
|
75
75
|
perturbation_seed: int | None = None,
|
|
76
76
|
randomize_judge_order: bool = False,
|
|
77
77
|
delete_output_dir_after_upload: bool | None = None,
|
|
78
|
+
repeats: int | None = None,
|
|
78
79
|
) -> None:
|
|
79
80
|
self.llm_name = llm_name
|
|
80
81
|
self.models_path = models_path
|
|
@@ -99,7 +100,7 @@ class EvalContext(AbstractContextManager):
|
|
|
99
100
|
self.description = description
|
|
100
101
|
self.randomize_judge_order = randomize_judge_order
|
|
101
102
|
self.delete_output_dir_after_upload = delete_output_dir_after_upload
|
|
102
|
-
|
|
103
|
+
self.repeats = repeats
|
|
103
104
|
if perturbation_type or perturbation_probability is not None:
|
|
104
105
|
perturbation = {
|
|
105
106
|
"type": perturbation_type,
|
|
@@ -204,10 +204,15 @@ class MathReasoningCompletion(BaseMetric[Completion]):
|
|
|
204
204
|
timeout = 10
|
|
205
205
|
# latex parse all ingested ground truth values for math reasoning
|
|
206
206
|
for gt in response.ground_truth_list:
|
|
207
|
+
if gt is None:
|
|
208
|
+
continue
|
|
207
209
|
signal.signal(signal.SIGALRM, timeout_handler) # Set timeout signal
|
|
208
210
|
signal.alarm(timeout) # Set timeout duration
|
|
209
211
|
try:
|
|
210
|
-
|
|
212
|
+
gt_normalized = self.normalize_expression(gt)
|
|
213
|
+
gt_parsed = parse_latex(
|
|
214
|
+
gt_normalized
|
|
215
|
+
) # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
|
|
211
216
|
ground_truths.append(gt_parsed)
|
|
212
217
|
except Exception:
|
|
213
218
|
ground_truths.append(gt)
|
|
@@ -229,15 +234,11 @@ class MathReasoningCompletion(BaseMetric[Completion]):
|
|
|
229
234
|
)
|
|
230
235
|
]
|
|
231
236
|
else:
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
assert isinstance(response.ground_truth, str)
|
|
235
|
-
str_is_correct = self._is_str_correct(normalized_response, response.ground_truth)
|
|
236
|
-
return [
|
|
237
|
-
MetricResult(
|
|
238
|
-
metric_name=self.NAME, value=float(str_is_correct), higher_is_better=True, error=response.error
|
|
239
|
-
)
|
|
237
|
+
normalized_ground_truths = [
|
|
238
|
+
self.normalize_expression(gt) for gt in response.ground_truth_list if gt is not None
|
|
240
239
|
]
|
|
240
|
+
res = self._any_str_correct([normalized_response], normalized_ground_truths)
|
|
241
|
+
return [MetricResult(metric_name=self.NAME, value=float(res), higher_is_better=True, error=response.error)]
|
|
241
242
|
|
|
242
243
|
def _any_str_correct(self, response_list: list, ground_truths: list) -> bool:
|
|
243
244
|
"""
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import time
|
|
3
3
|
import traceback
|
|
4
|
-
from collections.abc import Callable
|
|
4
|
+
from collections.abc import Callable, Iterable
|
|
5
5
|
from datetime import UTC, datetime
|
|
6
6
|
from functools import partial
|
|
7
|
-
from typing import Any
|
|
8
7
|
|
|
9
8
|
from eval_framework.tasks.registry import get_task
|
|
10
9
|
|
|
@@ -14,6 +13,8 @@ except ImportError:
|
|
|
14
13
|
get_cluster_info = None # type: ignore[assignment]
|
|
15
14
|
|
|
16
15
|
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
17
18
|
from tqdm import tqdm
|
|
18
19
|
|
|
19
20
|
from eval_framework import __version__ as eval_framework_version
|
|
@@ -234,18 +235,21 @@ class ResponseGenerator:
|
|
|
234
235
|
# If samples_batch_size = 1, samples are run sequentially; in any case, we return here after finishing each
|
|
235
236
|
# individual batch to honor preemption requests and save cached results.
|
|
236
237
|
samples_batch_size = self.config.batch_size
|
|
238
|
+
repeats = self.config.repeats
|
|
237
239
|
|
|
238
240
|
# Calculate total samples for progress bar - use num_samples or iterate to count
|
|
239
|
-
|
|
240
|
-
if total_num_samples is None:
|
|
241
|
+
if self.num_samples is None:
|
|
241
242
|
# Count samples by iterating (this might be expensive for large datasets)
|
|
242
|
-
total_num_samples = sum(1 for _ in self.task.iterate_samples(None))
|
|
243
|
+
total_num_samples = sum(1 for _ in self.task.iterate_samples(None)) * repeats
|
|
244
|
+
else:
|
|
245
|
+
total_num_samples = self.num_samples * repeats
|
|
243
246
|
|
|
244
247
|
samples_batch: list[Sample] = []
|
|
245
248
|
with tqdm(
|
|
246
249
|
total=total_num_samples, desc=f"Processing {self.response_type.value}", disable=get_disable_bar_flag()
|
|
247
250
|
) as pbar:
|
|
248
|
-
|
|
251
|
+
samples = self.task.iterate_samples(self.num_samples)
|
|
252
|
+
for i, sample in enumerate(repeat_samples(samples, repeats)):
|
|
249
253
|
subject = f" - Subject: {sample.subject}"
|
|
250
254
|
sample_index = i + 1
|
|
251
255
|
|
|
@@ -330,6 +334,7 @@ class ResponseGenerator:
|
|
|
330
334
|
"llm_name",
|
|
331
335
|
"llm_args",
|
|
332
336
|
"perturbation_config",
|
|
337
|
+
"repeats",
|
|
333
338
|
]
|
|
334
339
|
for key in keys:
|
|
335
340
|
if loaded_metadata[key] != current_metadata[key]:
|
|
@@ -349,3 +354,17 @@ class ResponseGenerator:
|
|
|
349
354
|
logger.info("Completions generated and saved.")
|
|
350
355
|
|
|
351
356
|
return responses, preempted
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def repeat_samples(samples: Iterable[Sample], repeats: int) -> Iterable[Sample]:
|
|
360
|
+
"""Flatten repeats into a single stream of samples.
|
|
361
|
+
|
|
362
|
+
After expansion original sample indices do not point to the same sample anymore. They
|
|
363
|
+
Original sample can be recovered by `original_index = expanded_index // repeats`.
|
|
364
|
+
"""
|
|
365
|
+
for sample in samples:
|
|
366
|
+
base_id = sample.id * repeats
|
|
367
|
+
for repeat_idx in range(repeats):
|
|
368
|
+
repeated_sample = sample.model_copy()
|
|
369
|
+
repeated_sample.id = base_id + repeat_idx
|
|
370
|
+
yield repeated_sample
|
|
@@ -77,6 +77,13 @@ def parse_args() -> argparse.Namespace:
|
|
|
77
77
|
parser.add_argument(
|
|
78
78
|
"--num-fewshot", type=int, required=False, default=0, help="The number of fewshot examples to use."
|
|
79
79
|
)
|
|
80
|
+
parser.add_argument(
|
|
81
|
+
"--repeats",
|
|
82
|
+
type=int,
|
|
83
|
+
required=False,
|
|
84
|
+
default=1,
|
|
85
|
+
help="The number of times to repeat each sample in the evaluation.",
|
|
86
|
+
)
|
|
80
87
|
parser.add_argument("--task-name", type=str, required=False, help="The name of the task to evaluate.")
|
|
81
88
|
parser.add_argument(
|
|
82
89
|
"--randomize-judge-order",
|
|
@@ -319,6 +326,7 @@ def run_with_kwargs(kwargs: dict) -> None:
|
|
|
319
326
|
num_samples=kwargs["num_samples"],
|
|
320
327
|
max_tokens=kwargs["max_tokens"],
|
|
321
328
|
num_fewshot=kwargs["num_fewshot"],
|
|
329
|
+
repeats=kwargs["repeats"],
|
|
322
330
|
task_name=kwargs["task_name"],
|
|
323
331
|
task_subjects=kwargs["task_subjects"],
|
|
324
332
|
hf_revision=kwargs["hf_revision"],
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED
|
@@ -355,6 +355,25 @@ class AIME2024(MATHReasoning):
|
|
|
355
355
|
return item["answer"].lstrip("0") # valid answers in this dataset range from 0-999 and have leading zeros
|
|
356
356
|
|
|
357
357
|
|
|
358
|
+
class AIME2025(AIME2024):
|
|
359
|
+
"""AIME 2025 dataset: https://huggingface.co/datasets/math-ai/aime25
|
|
360
|
+
|
|
361
|
+
This dataset contains a single test split of 30 questions.
|
|
362
|
+
Data contains
|
|
363
|
+
problem | answer | id
|
|
364
|
+
|
|
365
|
+
pass@1 evaluation
|
|
366
|
+
"""
|
|
367
|
+
|
|
368
|
+
NAME = "AIME2025"
|
|
369
|
+
DATASET_PATH = "math-ai/aime25"
|
|
370
|
+
SAMPLE_SPLIT = "test"
|
|
371
|
+
FEWSHOT_SPLIT = "test"
|
|
372
|
+
|
|
373
|
+
def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
|
|
374
|
+
return item["answer"]
|
|
375
|
+
|
|
376
|
+
|
|
358
377
|
class MATH500(MATHReasoning):
|
|
359
378
|
"""MATH500 dataset: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
|
|
360
379
|
|
|
@@ -54,7 +54,9 @@ class EvalConfig(BaseConfig):
|
|
|
54
54
|
save_intermediate_results: Annotated[bool, BeforeValidator(lambda v: True if v is None else v)] = True
|
|
55
55
|
save_logs: Annotated[bool, BeforeValidator(lambda v: True if v is None else v)] = True
|
|
56
56
|
delete_output_dir_after_upload: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
|
|
57
|
-
|
|
57
|
+
# how many times to repeat a single sample
|
|
58
|
+
# can be used to reduce variance of tasks with low number of samples, e.g. AIME24
|
|
59
|
+
repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
|
|
58
60
|
# Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
|
|
59
61
|
|
|
60
62
|
@property
|
|
@@ -18,6 +18,7 @@ class TaskNameEnum(Enum):
|
|
|
18
18
|
def register_all_tasks() -> None:
|
|
19
19
|
"""Register all the benchmark tasks with the eval framework."""
|
|
20
20
|
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2024")
|
|
21
|
+
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025")
|
|
21
22
|
register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC")
|
|
22
23
|
register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK")
|
|
23
24
|
register_lazy_task("eval_framework.tasks.benchmarks.arc_de.ARC_DE")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/niah_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/result_processors/wandb_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.7 → eval_framework-0.2.9}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|