eval-framework 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.0 → eval_framework-0.2.2}/LICENSE +1 -1
- {eval_framework-0.2.0 → eval_framework-0.2.2}/PKG-INFO +69 -76
- {eval_framework-0.2.0 → eval_framework-0.2.2}/README.md +67 -71
- {eval_framework-0.2.0 → eval_framework-0.2.2}/pyproject.toml +19 -22
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/context/determined.py +11 -12
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/context/eval.py +4 -3
- eval_framework-0.2.2/src/eval_framework/context/local.py +75 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/base.py +39 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/huggingface.py +58 -18
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/models.py +8 -3
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/vllm.py +70 -5
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/main.py +8 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/response_generator.py +3 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/run.py +30 -18
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/base.py +1 -1
- eval_framework-0.2.2/src/eval_framework/tasks/benchmarks/flores200.py +133 -0
- eval_framework-0.2.2/src/eval_framework/tasks/benchmarks/squad.py +211 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/struct_eval.py +17 -11
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/utils.py +7 -1
- eval_framework-0.2.2/src/eval_framework/utils/file_ops.py +224 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/generate_task_docs.py +6 -6
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/formatter.py +2 -1
- eval_framework-0.2.0/src/eval_framework/context/local.py +0 -52
- eval_framework-0.2.0/src/eval_framework/tasks/benchmarks/flores200.py +0 -62
- eval_framework-0.2.0/src/eval_framework/tasks/benchmarks/squad.py +0 -89
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/hf_processor.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/py.typed +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/tests/test_formatter_eval.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/tests/test_formatter_scaling.py +0 -0
- {eval_framework-0.2.0 → eval_framework-0.2.2}/src/template_formatting/tests/test_mistral_formatter.py +0 -0
|
@@ -186,7 +186,7 @@
|
|
|
186
186
|
same "printed page" as the copyright notice for easier
|
|
187
187
|
identification within third-party archives.
|
|
188
188
|
|
|
189
|
-
Copyright
|
|
189
|
+
Copyright 2025 Aleph Alpha Research GmbH
|
|
190
190
|
|
|
191
191
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
192
192
|
you may not use this file except in compliance with the License.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Evaluation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -191,7 +191,7 @@ License: Apache License
|
|
|
191
191
|
same "printed page" as the copyright notice for easier
|
|
192
192
|
identification within third-party archives.
|
|
193
193
|
|
|
194
|
-
Copyright
|
|
194
|
+
Copyright 2025 Aleph Alpha Research GmbH
|
|
195
195
|
|
|
196
196
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
197
197
|
you may not use this file except in compliance with the License.
|
|
@@ -218,8 +218,6 @@ Requires-Dist: datasets>=2.19.1,<4
|
|
|
218
218
|
Requires-Dist: sacrebleu>=2.4.3,<3
|
|
219
219
|
Requires-Dist: pycountry>=24.6.1,<25
|
|
220
220
|
Requires-Dist: nltk>=3.9.1,<4
|
|
221
|
-
Requires-Dist: types-pyyaml>=6.0.12.20240917,<7
|
|
222
|
-
Requires-Dist: psutil>=6.1,<7
|
|
223
221
|
Requires-Dist: python-dotenv>=1.0.1,<2
|
|
224
222
|
Requires-Dist: lingua-language-detector>=2.0.2,<3
|
|
225
223
|
Requires-Dist: google-crc32c>=1.5.0,<2
|
|
@@ -235,7 +233,6 @@ Requires-Dist: jsonlines>=4,<5
|
|
|
235
233
|
Requires-Dist: lxml>=6,<7
|
|
236
234
|
Requires-Dist: python-iso639>=2025.2.18
|
|
237
235
|
Requires-Dist: wandb>=0.21.1,<1
|
|
238
|
-
Requires-Dist: torch
|
|
239
236
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
240
237
|
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
|
|
241
238
|
Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
|
|
@@ -270,21 +267,73 @@ Description-Content-Type: text/markdown
|
|
|
270
267
|
# Aleph Alpha Eval-Framework
|
|
271
268
|
|
|
272
269
|
> **Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
|
|
270
|
+

|
|
273
271
|
|
|
274
|
-
##
|
|
272
|
+
## Why Choose This Framework?
|
|
273
|
+
|
|
274
|
+
- **Scalability**: Built for distributed evaluation. Currently providing an integration with Determined AI.
|
|
275
|
+
- **Extensibility**: Easily add custom models, benchmarks, and metrics with object-oriented base classes.
|
|
276
|
+
- **Comprehensive**: Comes pre-loaded with over 90 tasks covering a broad and diverse range, from reasoning and coding to safety and long-context. Also comes with a comprehensive set of metrics, including LLM-as-a-judge evaluations.
|
|
277
|
+
|
|
278
|
+
## Other features
|
|
275
279
|
|
|
276
|
-
- 90+ Benchmarks: Covers reasoning, knowledge, coding, long-context, and safety tasks.
|
|
277
|
-
- Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
|
|
278
|
-
- Distributed Evaluation: Integration with Determined AI for scalable distributed evaluation.
|
|
279
|
-
- Docker Support: Pre-configured Dockerfiles for local and distributed setups.
|
|
280
280
|
- Flexible Model Integration: Supports models loaded via HuggingFace Transformers or custom implementations using the BaseLLM class.
|
|
281
|
+
- Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
|
|
281
282
|
- Custom Metrics: Easily define new metrics using the BaseMetric class.
|
|
282
|
-
- Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
|
|
283
283
|
- Perturbation Testing: Robustness analysis with configurable perturbation types and probabilities.
|
|
284
|
+
- Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
|
|
284
285
|
- Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
|
|
285
|
-
-
|
|
286
|
+
- Docker Support: Pre-configured Dockerfiles for local and distributed setups.
|
|
287
|
+
|
|
288
|
+
## Quick Start
|
|
289
|
+
|
|
290
|
+
The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
|
|
291
|
+
You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](docs/installation.md).
|
|
292
|
+
|
|
293
|
+
The easiest way to get started is by installing the library via `pip` and using it as an external dependency.
|
|
294
|
+
```
|
|
295
|
+
pip install eval_framework
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
There are optional extras available to unlock specific features of the library:
|
|
299
|
+
- `api` for inference using the aleph-alpha client.
|
|
300
|
+
- `comet` for the COMET metric.
|
|
301
|
+
- `determined` for running jobs via determined.
|
|
302
|
+
- `mistral` for inference on Mistral models.
|
|
303
|
+
- `transformers` for inference using the transformers library.
|
|
304
|
+
- `vllm` for inference via VLLM.
|
|
305
|
+
|
|
306
|
+
As a shorthand, the `all` extra installs all of the above.
|
|
307
|
+
|
|
308
|
+
For development, you can instead install it directly from the repository. Please first install
|
|
309
|
+
[uv](https://docs.astral.sh/uv/getting-started/installation/)
|
|
286
310
|
|
|
287
|
-
|
|
311
|
+
To install the project with all optional extras use
|
|
312
|
+
```bash
|
|
313
|
+
uv sync --all-extras
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
We provide custom groups to control optional extras.
|
|
317
|
+
- `flash_attn`: Install `flash_attn` with correct handling of build isolation
|
|
318
|
+
|
|
319
|
+
Thus, the following will set up the project with `flash_attn`
|
|
320
|
+
```bash
|
|
321
|
+
uv sync --all-extras --group flash_attn
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
To evaluate a single benchmark locally, you can use the following command:
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
eval_framework \
|
|
328
|
+
--models src/eval_framework/llm/models.py \
|
|
329
|
+
--llm-name Smollm135MInstruct \
|
|
330
|
+
--task-name "GSM8K" \
|
|
331
|
+
--output-dir ./eval \
|
|
332
|
+
--num-fewshot 5 \
|
|
333
|
+
--num-samples 10
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
|
|
288
337
|
|
|
289
338
|
## Benchmark Coverage & Task Categories
|
|
290
339
|
|
|
@@ -336,51 +385,6 @@ Evaluation metrics include:
|
|
|
336
385
|
|
|
337
386
|
For the full list of tasks and metrics, see [Detailed Task Table](docs/benchmarks_and_metrics.md).
|
|
338
387
|
|
|
339
|
-
## Quick Start
|
|
340
|
-
|
|
341
|
-
The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
|
|
342
|
-
You will also need the appropriate CUDA dependencies and version installed on your system for GPU support.
|
|
343
|
-
|
|
344
|
-
The easiest way to get started is by installing the library via `pip` and using it as an external dependency.
|
|
345
|
-
```
|
|
346
|
-
pip install eval_framework
|
|
347
|
-
```
|
|
348
|
-
|
|
349
|
-
There are optional extras available to unlock specific features of the library:
|
|
350
|
-
- `mistral` for inference on Mistral models
|
|
351
|
-
- `transformers` for inference using the transformers library
|
|
352
|
-
- `api` for inference using the aleph-alpha client.
|
|
353
|
-
- `vllm` for inference via VLLM
|
|
354
|
-
- `determined` for running jobs via determined
|
|
355
|
-
- `comet` for the COMET metric
|
|
356
|
-
|
|
357
|
-
As a shorthand, the `all` extra installs all of the above.
|
|
358
|
-
|
|
359
|
-
For development, you can instead install it directly from the repository instead, please first install
|
|
360
|
-
[uv](https://docs.astral.sh/uv/getting-started/installation/)
|
|
361
|
-
|
|
362
|
-
To install the project with all optional extras use
|
|
363
|
-
```bash
|
|
364
|
-
uv sync --all-extras
|
|
365
|
-
```
|
|
366
|
-
|
|
367
|
-
We provide custom groups to control optional extras.
|
|
368
|
-
- `cpu`: Use the CPU backend for torch
|
|
369
|
-
- `cu124`: Use the CUDA 12.4 backend
|
|
370
|
-
- `flash_attn`: Install `flash_attn` with correct handling of build isolation
|
|
371
|
-
|
|
372
|
-
Thus, the following will set up the project with `flash_attn` and CUDA 12.4
|
|
373
|
-
```bash
|
|
374
|
-
uv sync --all-extras --group flash_attn --group cu124
|
|
375
|
-
```
|
|
376
|
-
|
|
377
|
-
There is also a pre-commit hook to help with development:
|
|
378
|
-
```
|
|
379
|
-
uv run pre-commit install
|
|
380
|
-
```
|
|
381
|
-
|
|
382
|
-
After installation, task documentation can be generated with `uv run python src/eval_framework/utils/generate_task_docs.py` (see [docs/installation.md](docs/installation.md) for more details).
|
|
383
|
-
|
|
384
388
|
## Getting Started
|
|
385
389
|
|
|
386
390
|
### Understanding the Evaluation Framework
|
|
@@ -449,22 +453,7 @@ pip install eval_framework[transformers]
|
|
|
449
453
|
- **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
|
|
450
454
|
- **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
|
|
451
455
|
- **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
|
|
452
|
-
|
|
453
|
-
### Example CLI Usage
|
|
454
|
-
|
|
455
|
-
To evaluate a single benchmark locally, you can use the following command:
|
|
456
|
-
|
|
457
|
-
```bash
|
|
458
|
-
eval_framework \
|
|
459
|
-
--models src/eval_framework/llm/models.py \
|
|
460
|
-
--llm-name Smollm135MInstruct \
|
|
461
|
-
--task-name "GSM8K" \
|
|
462
|
-
--output-dir ./eval \
|
|
463
|
-
--num-fewshot 5 \
|
|
464
|
-
--num-samples 10
|
|
465
|
-
```
|
|
466
|
-
|
|
467
|
-
For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
|
|
456
|
+
- **Log results in WandB**: See how [we integrate WandB](docs/wandb_integration.md) for metric and lineage tracking
|
|
468
457
|
|
|
469
458
|
## Documentation
|
|
470
459
|
|
|
@@ -485,6 +474,10 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa
|
|
|
485
474
|
- **[Using Determined](docs/using_determined.md)** - Guide for distributed evaluation using Determined AI
|
|
486
475
|
- **[Controlling Upload Results](docs/controlling_upload_results.md)** - How to manage and control the upload of evaluation results
|
|
487
476
|
|
|
477
|
+
### Contributing
|
|
478
|
+
|
|
479
|
+
- **[Contributing Guide](CONTRIBUTING.md)** - Guide for contributing to this project
|
|
480
|
+
|
|
488
481
|
### Citation
|
|
489
482
|
|
|
490
483
|
If you use `eval-framework` in your research, please cite:
|
|
@@ -509,6 +502,6 @@ This project has received funding from the European Union’s Digital Europe Pro
|
|
|
509
502
|
The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
|
|
510
503
|
|
|
511
504
|
<p align="center">
|
|
512
|
-
<img src="docs/OELLM_1.png" alt="
|
|
513
|
-
<img src="docs/OELLM_2.png" alt="
|
|
505
|
+
<img src="docs/OELLM_1.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
|
|
506
|
+
<img src="docs/OELLM_2.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_2.png" width="350"/>
|
|
514
507
|
</p>
|
|
@@ -1,21 +1,73 @@
|
|
|
1
1
|
# Aleph Alpha Eval-Framework
|
|
2
2
|
|
|
3
3
|
> **Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
|
|
4
|
+

|
|
4
5
|
|
|
5
|
-
##
|
|
6
|
+
## Why Choose This Framework?
|
|
7
|
+
|
|
8
|
+
- **Scalability**: Built for distributed evaluation. Currently providing an integration with Determined AI.
|
|
9
|
+
- **Extensibility**: Easily add custom models, benchmarks, and metrics with object-oriented base classes.
|
|
10
|
+
- **Comprehensive**: Comes pre-loaded with over 90 tasks covering a broad and diverse range, from reasoning and coding to safety and long-context. Also comes with a comprehensive set of metrics, including LLM-as-a-judge evaluations.
|
|
11
|
+
|
|
12
|
+
## Other features
|
|
6
13
|
|
|
7
|
-
- 90+ Benchmarks: Covers reasoning, knowledge, coding, long-context, and safety tasks.
|
|
8
|
-
- Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
|
|
9
|
-
- Distributed Evaluation: Integration with Determined AI for scalable distributed evaluation.
|
|
10
|
-
- Docker Support: Pre-configured Dockerfiles for local and distributed setups.
|
|
11
14
|
- Flexible Model Integration: Supports models loaded via HuggingFace Transformers or custom implementations using the BaseLLM class.
|
|
15
|
+
- Custom Benchmarks: Easily add new benchmarks with minimal code using the BaseTask class.
|
|
12
16
|
- Custom Metrics: Easily define new metrics using the BaseMetric class.
|
|
13
|
-
- Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
|
|
14
17
|
- Perturbation Testing: Robustness analysis with configurable perturbation types and probabilities.
|
|
18
|
+
- Rich Outputs: Generates JSON results, plots, and detailed analysis reports.
|
|
15
19
|
- Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
|
|
16
|
-
-
|
|
20
|
+
- Docker Support: Pre-configured Dockerfiles for local and distributed setups.
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
|
|
25
|
+
You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](docs/installation.md).
|
|
26
|
+
|
|
27
|
+
The easiest way to get started is by installing the library via `pip` and using it as an external dependency.
|
|
28
|
+
```
|
|
29
|
+
pip install eval_framework
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
There are optional extras available to unlock specific features of the library:
|
|
33
|
+
- `api` for inference using the aleph-alpha client.
|
|
34
|
+
- `comet` for the COMET metric.
|
|
35
|
+
- `determined` for running jobs via determined.
|
|
36
|
+
- `mistral` for inference on Mistral models.
|
|
37
|
+
- `transformers` for inference using the transformers library.
|
|
38
|
+
- `vllm` for inference via VLLM.
|
|
39
|
+
|
|
40
|
+
As a shorthand, the `all` extra installs all of the above.
|
|
41
|
+
|
|
42
|
+
For development, you can instead install it directly from the repository. Please first install
|
|
43
|
+
[uv](https://docs.astral.sh/uv/getting-started/installation/)
|
|
17
44
|
|
|
18
|
-
|
|
45
|
+
To install the project with all optional extras use
|
|
46
|
+
```bash
|
|
47
|
+
uv sync --all-extras
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
We provide custom groups to control optional extras.
|
|
51
|
+
- `flash_attn`: Install `flash_attn` with correct handling of build isolation
|
|
52
|
+
|
|
53
|
+
Thus, the following will set up the project with `flash_attn`
|
|
54
|
+
```bash
|
|
55
|
+
uv sync --all-extras --group flash_attn
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
To evaluate a single benchmark locally, you can use the following command:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
eval_framework \
|
|
62
|
+
--models src/eval_framework/llm/models.py \
|
|
63
|
+
--llm-name Smollm135MInstruct \
|
|
64
|
+
--task-name "GSM8K" \
|
|
65
|
+
--output-dir ./eval \
|
|
66
|
+
--num-fewshot 5 \
|
|
67
|
+
--num-samples 10
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
|
|
19
71
|
|
|
20
72
|
## Benchmark Coverage & Task Categories
|
|
21
73
|
|
|
@@ -67,51 +119,6 @@ Evaluation metrics include:
|
|
|
67
119
|
|
|
68
120
|
For the full list of tasks and metrics, see [Detailed Task Table](docs/benchmarks_and_metrics.md).
|
|
69
121
|
|
|
70
|
-
## Quick Start
|
|
71
|
-
|
|
72
|
-
The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
|
|
73
|
-
You will also need the appropriate CUDA dependencies and version installed on your system for GPU support.
|
|
74
|
-
|
|
75
|
-
The easiest way to get started is by installing the library via `pip` and use it as an external dependency.
|
|
76
|
-
```
|
|
77
|
-
pip install eval_framework
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
There are optional extras available to unlock specific features of the library:
|
|
81
|
-
- `mistral` for inference on Mistral models
|
|
82
|
-
- `transformers` for inference using the transformers library
|
|
83
|
-
- `api` for inference using the aleph-alpha client.
|
|
84
|
-
- `vllm` for inference via VLLM
|
|
85
|
-
- `determined` for running jobs via determined
|
|
86
|
-
- `comet` for the COMET metric
|
|
87
|
-
|
|
88
|
-
As a short hand, the `all` extra installs all of the above.
|
|
89
|
-
|
|
90
|
-
For development, you can instead install it directly from the repository instead, please first install
|
|
91
|
-
[uv](https://docs.astral.sh/uv/getting-started/installation/)
|
|
92
|
-
|
|
93
|
-
To install the project with all optional extras use
|
|
94
|
-
```bash
|
|
95
|
-
uv sync --all-extras
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
We provide custom groups to control optional extras.
|
|
99
|
-
- `cpu`: Use the CPU backend for torch
|
|
100
|
-
- `cu124`: Use the CUDA 12.4 backend
|
|
101
|
-
- `flash_attn`: Install `flash_attn` with correct handling of build isolation
|
|
102
|
-
|
|
103
|
-
Thus, the following will setup the project with `flash_attn` and CUDA 12.4
|
|
104
|
-
```bash
|
|
105
|
-
uv sync --all-extras --group flash_attn --group cu124
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
There is also a pre-commit hook to help with development:
|
|
109
|
-
```
|
|
110
|
-
uv run pre-commit install
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
After installation, task documentation can be generated with `uv run python src/eval_framework/utils/generate_task_docs.py` (see [docs/installation.md](docs/installation.md) for more details).
|
|
114
|
-
|
|
115
122
|
## Getting Started
|
|
116
123
|
|
|
117
124
|
### Understanding the Evaluation Framework
|
|
@@ -180,22 +187,7 @@ pip install eval_framework[transformers]
|
|
|
180
187
|
- **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
|
|
181
188
|
- **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
|
|
182
189
|
- **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
|
|
183
|
-
|
|
184
|
-
### Example CLI Usage
|
|
185
|
-
|
|
186
|
-
To evaluate a single benchmark locally, you can use the following command:
|
|
187
|
-
|
|
188
|
-
```bash
|
|
189
|
-
eval_framework \
|
|
190
|
-
--models src/eval_framework/llm/models.py \
|
|
191
|
-
--llm-name Smollm135MInstruct \
|
|
192
|
-
--task-name "GSM8K" \
|
|
193
|
-
--output-dir ./eval \
|
|
194
|
-
--num-fewshot 5 \
|
|
195
|
-
--num-samples 10
|
|
196
|
-
```
|
|
197
|
-
|
|
198
|
-
For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usage.md).
|
|
190
|
+
- **Log results in WandB**: See how [we integrate WandB](docs/wandb_integration.md) for metric and lineage tracking
|
|
199
191
|
|
|
200
192
|
## Documentation
|
|
201
193
|
|
|
@@ -216,6 +208,10 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa
|
|
|
216
208
|
- **[Using Determined](docs/using_determined.md)** - Guide for distributed evaluation using Determined AI
|
|
217
209
|
- **[Controlling Upload Results](docs/controlling_upload_results.md)** - How to manage and control the upload of evaluation results
|
|
218
210
|
|
|
211
|
+
### Contributing
|
|
212
|
+
|
|
213
|
+
- **[Contributing Guide](CONTRIBUTING.md)** - Guide for contributing to this project
|
|
214
|
+
|
|
219
215
|
### Citation
|
|
220
216
|
|
|
221
217
|
If you use `eval-framework` in your research, please cite:
|
|
@@ -240,6 +236,6 @@ This project has received funding from the European Union’s Digital Europe Pro
|
|
|
240
236
|
The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
|
|
241
237
|
|
|
242
238
|
<p align="center">
|
|
243
|
-
<img src="docs/OELLM_1.png" alt="
|
|
244
|
-
<img src="docs/OELLM_2.png" alt="
|
|
239
|
+
<img src="docs/OELLM_1.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
|
|
240
|
+
<img src="docs/OELLM_2.png" alt="https://github.com/Aleph-Alpha-Research/eval-framework/raw/main/docs/OELLM_2.png" width="350"/>
|
|
245
241
|
</p>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.2.0"
|
|
3
|
+
version = "0.2.2"
|
|
4
4
|
description = "Evalulation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -25,25 +25,21 @@ dependencies = [
|
|
|
25
25
|
"sacrebleu>=2.4.3,<3",
|
|
26
26
|
"pycountry>=24.6.1,<25",
|
|
27
27
|
"nltk>=3.9.1,<4",
|
|
28
|
-
"types-pyyaml>=6.0.12.20240917,<7",
|
|
29
|
-
"psutil>=6.1,<7",
|
|
30
28
|
"python-dotenv>=1.0.1,<2",
|
|
31
29
|
"lingua-language-detector>=2.0.2,<3",
|
|
32
30
|
"google-crc32c>=1.5.0,<2",
|
|
33
|
-
"kubernetes>=31.0.0,<32",
|
|
34
|
-
"langdetect>=1.0.9,<2",
|
|
31
|
+
"kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
|
|
32
|
+
"langdetect>=1.0.9,<2", # required by the original ifeval implementation
|
|
35
33
|
"spacy>=3.8.3,<4",
|
|
36
34
|
"jsonschema>=4.23.0,<5",
|
|
37
|
-
"mysql-connector-python>=9.0.0,<10",
|
|
38
|
-
"psycopg2-binary>=2.9.9,<3",
|
|
35
|
+
"mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
|
|
36
|
+
"psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
|
|
39
37
|
"sympy>=1.13.1,<2",
|
|
40
38
|
"llm-sandbox[docker]>=0.1.8,<0.2",
|
|
41
39
|
"jsonlines>=4,<5",
|
|
42
40
|
"lxml>=6,<7",
|
|
43
41
|
"python-iso639>=2025.2.18",
|
|
44
42
|
"wandb>=0.21.1,<1",
|
|
45
|
-
# Needed for uv bug: https://github.com/astral-sh/uv/issues/15661
|
|
46
|
-
"torch",
|
|
47
43
|
]
|
|
48
44
|
|
|
49
45
|
[project.optional-dependencies]
|
|
@@ -99,9 +95,10 @@ dev = [
|
|
|
99
95
|
"plotly>=5.24.1,<6",
|
|
100
96
|
"ruff>=0.12.8",
|
|
101
97
|
]
|
|
102
|
-
flash-attn = [
|
|
103
|
-
|
|
104
|
-
|
|
98
|
+
flash-attn = [
|
|
99
|
+
"flash-attn>=2.7.2.post1,<2.8",
|
|
100
|
+
"torch"
|
|
101
|
+
]
|
|
105
102
|
|
|
106
103
|
[build-system]
|
|
107
104
|
requires = ["uv_build>=0.8.10,<0.9.0"]
|
|
@@ -114,17 +111,11 @@ module-name = ["eval_framework", "template_formatting"]
|
|
|
114
111
|
override-dependencies = [
|
|
115
112
|
"requests>=2.32,<3", # fix for determined
|
|
116
113
|
]
|
|
117
|
-
conflicts = [
|
|
118
|
-
[
|
|
119
|
-
{ group = "cpu" },
|
|
120
|
-
{ group = "cu124" },
|
|
121
|
-
],
|
|
122
|
-
]
|
|
123
114
|
|
|
124
115
|
[tool.uv.sources]
|
|
125
116
|
torch = [
|
|
126
|
-
{ index = "pytorch-
|
|
127
|
-
{ index = "pytorch-
|
|
117
|
+
{ index = "pytorch-default", marker = "sys_platform != 'linux'" },
|
|
118
|
+
{ index = "pytorch-cu124", marker = "sys_platform == 'linux'" },
|
|
128
119
|
]
|
|
129
120
|
|
|
130
121
|
[[tool.uv.index]]
|
|
@@ -133,8 +124,8 @@ url = "https://download.pytorch.org/whl/cu124"
|
|
|
133
124
|
explicit = true
|
|
134
125
|
|
|
135
126
|
[[tool.uv.index]]
|
|
136
|
-
name = "pytorch-
|
|
137
|
-
url = "https://
|
|
127
|
+
name = "pytorch-default"
|
|
128
|
+
url = "https://pypi.org/simple"
|
|
138
129
|
explicit = true
|
|
139
130
|
|
|
140
131
|
[tool.uv.extra-build-dependencies]
|
|
@@ -152,6 +143,12 @@ select = [
|
|
|
152
143
|
"UP", # Auto-upgrading of new Python features
|
|
153
144
|
"I", # Sort imports
|
|
154
145
|
]
|
|
146
|
+
[tool.ruff.lint.isort]
|
|
147
|
+
# https://github.com/astral-sh/ruff-pre-commit/issues/121
|
|
148
|
+
# https://github.com/astral-sh/ruff/issues/10519
|
|
149
|
+
# wandb creates a folder called 'wandb' during local runs (not logged in)
|
|
150
|
+
# this needs to be added to prevent isort from incorrectly sorting
|
|
151
|
+
known-third-party = ["wandb"]
|
|
155
152
|
|
|
156
153
|
[tool.ruff.lint.extend-per-file-ignores]
|
|
157
154
|
"__init__.py" = ["F401"]
|
|
@@ -8,7 +8,8 @@ from determined.core._context import init as determined_core_init
|
|
|
8
8
|
from determined.core._distributed import DummyDistributedContext
|
|
9
9
|
from pydantic import AfterValidator, BaseModel, ConfigDict
|
|
10
10
|
|
|
11
|
-
from eval_framework.context.eval import EvalContext
|
|
11
|
+
from eval_framework.context.eval import EvalContext
|
|
12
|
+
from eval_framework.context.local import _load_model
|
|
12
13
|
from eval_framework.llm.base import BaseLLM
|
|
13
14
|
from eval_framework.tasks.eval_config import EvalConfig
|
|
14
15
|
from eval_framework.tasks.perturbation import PerturbationConfig
|
|
@@ -111,18 +112,16 @@ class DeterminedContext(EvalContext):
|
|
|
111
112
|
if val_cli and val_hparams and val_cli != val_hparams:
|
|
112
113
|
logger.info(f"CLI argument {name} ({val_cli}) is being overridden by hyperparameters: ({val_hparams}).")
|
|
113
114
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
raise ValueError(f"LLM '{self.hparams.llm_name}' not found.")
|
|
117
|
-
llm_class = models[self.hparams.llm_name]
|
|
118
|
-
|
|
119
|
-
llm_judge_class: type[BaseLLM] | None = None
|
|
115
|
+
# Hyperparameters take precedence over core context
|
|
116
|
+
llm_name = self.hparams.llm_name or self.llm_name
|
|
120
117
|
judge_model_name = self.hparams.task_args.judge_model_name or self.judge_model_name
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
118
|
+
|
|
119
|
+
llm_class = _load_model(llm_name, models_path=self.models_path)
|
|
120
|
+
llm_judge_class: type[BaseLLM] | None = (
|
|
121
|
+
_load_model(judge_model_name, models_path=self.judge_models_path, info="judge")
|
|
122
|
+
if judge_model_name
|
|
123
|
+
else None
|
|
124
|
+
)
|
|
126
125
|
|
|
127
126
|
# for all optional hyperparameters, resort to the respective CLI argument if the hyperparameter is not set
|
|
128
127
|
self.config = EvalConfig(
|
|
@@ -2,6 +2,7 @@ import importlib.util
|
|
|
2
2
|
import inspect
|
|
3
3
|
import sys
|
|
4
4
|
from contextlib import AbstractContextManager
|
|
5
|
+
from os import PathLike
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import Any
|
|
7
8
|
|
|
@@ -11,7 +12,7 @@ from eval_framework.tasks.eval_config import EvalConfig
|
|
|
11
12
|
from eval_framework.tasks.perturbation import PerturbationConfig
|
|
12
13
|
|
|
13
14
|
|
|
14
|
-
def import_models(models_file:
|
|
15
|
+
def import_models(models_file: PathLike | str) -> dict[str, type[BaseLLM]]:
|
|
15
16
|
models_file = Path(models_file).resolve()
|
|
16
17
|
library_path = Path(eval_framework.__path__[0]).resolve()
|
|
17
18
|
|
|
@@ -86,10 +87,10 @@ class EvalContext(AbstractContextManager):
|
|
|
86
87
|
self.wandb_run_id = wandb_run_id
|
|
87
88
|
self.hf_upload_dir = hf_upload_dir
|
|
88
89
|
self.hf_upload_repo = hf_upload_repo
|
|
89
|
-
self.llm_args = llm_args
|
|
90
|
+
self.llm_args = llm_args if llm_args is not None else {}
|
|
90
91
|
self.judge_models_path = judge_models_path
|
|
91
92
|
self.judge_model_name = judge_model_name
|
|
92
|
-
self.judge_model_args = judge_model_args
|
|
93
|
+
self.judge_model_args = judge_model_args if judge_model_args is not None else {}
|
|
93
94
|
self.batch_size = batch_size
|
|
94
95
|
self.description = description
|
|
95
96
|
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
from os import PathLike
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from eval_framework.context.eval import EvalContext, import_models
|
|
6
|
+
from eval_framework.llm.base import BaseLLM
|
|
7
|
+
from eval_framework.tasks.eval_config import EvalConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _load_model(llm_name: str, models_path: str | PathLike | None, *, info: str = "") -> type[BaseLLM]:
|
|
11
|
+
"""Load a model class either from a models file or as a fully qualified module path.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
llm_name: The name of the model class to load, or a fully qualified module path.
|
|
15
|
+
models_path: The path to a Python file containing model class definitions
|
|
16
|
+
info: Additional info to include in error messages.
|
|
17
|
+
Returns:
|
|
18
|
+
The model class.
|
|
19
|
+
"""
|
|
20
|
+
if models_path is None or "." in llm_name:
|
|
21
|
+
# The llm_name must be a fully qualified module path
|
|
22
|
+
if "." not in llm_name:
|
|
23
|
+
raise ValueError(f"LLM {info} '{llm_name}' is not a fully qualified module path.")
|
|
24
|
+
module_path, llm_class_name = llm_name.rsplit(".", 1)
|
|
25
|
+
module = importlib.import_module(module_path)
|
|
26
|
+
if not hasattr(module, llm_class_name):
|
|
27
|
+
raise ValueError(f"LLM '{llm_class_name}' not found in module '{module_path}'.")
|
|
28
|
+
return getattr(module, llm_class_name)
|
|
29
|
+
else:
|
|
30
|
+
models_dict = import_models(models_path)
|
|
31
|
+
if llm_name not in models_dict:
|
|
32
|
+
if info:
|
|
33
|
+
info = f"{info.strip()} "
|
|
34
|
+
raise ValueError(f"LLM {info} '{llm_name}' not found in {models_path}.")
|
|
35
|
+
return models_dict[llm_name]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LocalContext(EvalContext):
|
|
39
|
+
def __enter__(self) -> "LocalContext":
|
|
40
|
+
llm_class = _load_model(self.llm_name, models_path=self.models_path)
|
|
41
|
+
self.llm_judge_class: type[BaseLLM] | None = None
|
|
42
|
+
if self.judge_model_name is not None:
|
|
43
|
+
self.llm_judge_class = _load_model(self.judge_model_name, models_path=self.judge_models_path, info="judge")
|
|
44
|
+
|
|
45
|
+
self.config = EvalConfig(
|
|
46
|
+
llm_class=llm_class,
|
|
47
|
+
llm_args=self.llm_args,
|
|
48
|
+
num_samples=self.num_samples,
|
|
49
|
+
max_tokens=self.max_tokens,
|
|
50
|
+
num_fewshot=self.num_fewshot,
|
|
51
|
+
perturbation_config=self.perturbation_config,
|
|
52
|
+
task_name=self.task_name,
|
|
53
|
+
task_subjects=self.task_subjects,
|
|
54
|
+
hf_revision=self.hf_revision,
|
|
55
|
+
output_dir=self.output_dir,
|
|
56
|
+
hf_upload_dir=self.hf_upload_dir,
|
|
57
|
+
hf_upload_repo=self.hf_upload_repo,
|
|
58
|
+
wandb_entity=self.wandb_entity,
|
|
59
|
+
wandb_project=self.wandb_project,
|
|
60
|
+
wandb_run_id=self.wandb_run_id,
|
|
61
|
+
llm_judge_class=self.llm_judge_class,
|
|
62
|
+
judge_model_args=self.judge_model_args,
|
|
63
|
+
batch_size=self.batch_size,
|
|
64
|
+
description=self.description,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
return self
|
|
68
|
+
|
|
69
|
+
def __exit__(
|
|
70
|
+
self,
|
|
71
|
+
exc_type: type[BaseException] | None,
|
|
72
|
+
exc_value: BaseException | None,
|
|
73
|
+
traceback: Any | None,
|
|
74
|
+
) -> None:
|
|
75
|
+
pass
|