eval-framework 0.2.1__tar.gz → 0.2.3__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- {eval_framework-0.2.1 → eval_framework-0.2.3}/PKG-INFO +54 -35
- {eval_framework-0.2.1 → eval_framework-0.2.3}/README.md +51 -34
- {eval_framework-0.2.1 → eval_framework-0.2.3}/pyproject.toml +11 -2
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/determined.py +15 -5
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/eval.py +4 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/local.py +4 -2
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/aleph_alpha.py +13 -1
- eval_framework-0.2.3/src/eval_framework/llm/base.py +180 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/huggingface.py +99 -53
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/mistral.py +25 -10
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/openai.py +24 -3
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/vllm.py +94 -43
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/main.py +31 -62
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/json_format.py +9 -1
- eval_framework-0.2.3/src/eval_framework/metrics/llm/base.py +33 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +20 -21
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +20 -21
- eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/base.py +50 -0
- eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
- eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/dcs.py +43 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/loglikelihood/probability_mass.py +9 -12
- eval_framework-0.2.3/src/eval_framework/metrics/loglikelihood/ternary.py +42 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/response_generator.py +6 -2
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/result_processors/base.py +14 -0
- eval_framework-0.2.3/src/eval_framework/result_processors/hf_uploader.py +75 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/result_processors/result_processor.py +7 -7
- eval_framework-0.2.3/src/eval_framework/result_processors/wandb_uploader.py +141 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/run.py +26 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/gsm8k.py +7 -5
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/openbookqa.py +25 -3
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +3 -3
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/sciq.py +22 -1
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/sphyr.py +6 -2
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/truthfulqa.py +5 -5
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/eval_config.py +27 -4
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/task_names.py +3 -1
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/file_ops.py +61 -40
- eval_framework-0.2.1/src/eval_framework/llm/base.py +0 -97
- eval_framework-0.2.1/src/eval_framework/metrics/llm/base.py +0 -8
- eval_framework-0.2.1/src/eval_framework/result_processors/hf_processor.py +0 -87
- eval_framework-0.2.1/src/template_formatting/tests/test_formatter_eval.py +0 -408
- eval_framework-0.2.1/src/template_formatting/tests/test_formatter_scaling.py +0 -253
- eval_framework-0.2.1/src/template_formatting/tests/test_mistral_formatter.py +0 -136
- {eval_framework-0.2.1 → eval_framework-0.2.3}/LICENSE +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.1 → eval_framework-0.2.3}/src/template_formatting/py.typed +0 -0
````diff
--- eval_framework-0.2.1/PKG-INFO
+++ eval_framework-0.2.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.1
+Version: 0.2.3
 Summary: Evaluation Framework
 Author: Aleph Alpha Research
 License: Apache License
@@ -233,6 +233,7 @@ Requires-Dist: jsonlines>=4,<5
 Requires-Dist: lxml>=6,<7
 Requires-Dist: python-iso639>=2025.2.18
 Requires-Dist: wandb>=0.21.1,<1
+Requires-Dist: boto3>=1.40.54,<2
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -248,6 +249,7 @@ Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
 Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
 Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
+Requires-Dist: accelerate>=0.30.0,<1 ; extra == 'transformers'
 Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
 Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
 Requires-Python: >=3.12, <3.13
@@ -305,13 +307,25 @@ There are optional extras available to unlock specific features of the library:
 
 As a short hand, the `all` extra installs all of the above.
 
-
-
+We use `uv` to better resolve dependencies when downloading the extras. You can install uv with:
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+or by following the `uv` [installation docs](https://docs.astral.sh/uv/getting-started/installation/).
 
-
+Now, you can safely install the project with all optional extras:
 ```bash
 uv sync --all-extras
 ```
+or with pip:
+```bash
+uv pip install eval_framework[all]
+```
+
+Tip: ensure python is properly installed with uv:
+```
+uv python install 3.12 --reinstall
+```
 
 We provide custom groups to control optional extras.
 - `flash_attn`: Install `flash_attn` with correct handling of build isolation
@@ -327,8 +341,9 @@ To evaluate a single benchmark locally, you can use the following command:
 eval_framework \
   --models src/eval_framework/llm/models.py \
   --llm-name Smollm135MInstruct \
-  --task-name "
-  --
+  --task-name "MMLU" \
+  --task-subjects "abstract_algebra" \
+  --output-dir ./eval_results \
   --num-fewshot 5 \
   --num-samples 10
 ```
@@ -414,35 +429,37 @@ pip install eval_framework[transformers]
 
 2. **Create and run your first evaluation using HuggingFace model**:
 
-… (29 removed lines; the previous example is not captured in this rendering)
+```python
+from functools import partial
+from pathlib import Path
+
+from eval_framework.llm.huggingface import HFLLM
+from eval_framework.main import main
+from eval_framework.tasks.eval_config import EvalConfig
+from template_formatting.formatter import HFFormatter
+
+# Define your model
+class MyHuggingFaceModel(HFLLM):
+    LLM_NAME = "microsoft/DialoGPT-medium"
+    DEFAULT_FORMATTER = partial(HFFormatter, "microsoft/DialoGPT-medium")
+
+if __name__ == "__main__":
+    # Initialize your model
+    llm = MyHuggingFaceModel()
+
+    # Running evaluation on MMLU abstract algebra task using 5 few-shot examples and 10 samples
+    config = EvalConfig(
+        output_dir=Path("./eval_results"),
+        num_fewshot=5,
+        num_samples=10,
+        task_name="MMLU",
+        task_subjects=["abstract_algebra", "astronomy"],
+        llm_class=MyHuggingFaceModel,
+    )
+
+    # Run evaluation and get results
+    results = main(llm=llm, config=config)
+```
 
 3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](docs/understanding_results_guide.md) to interpret them
@@ -450,6 +467,7 @@ pip install eval_framework[transformers]
 
 - **Use CLI interface**: See [CLI usage guide](docs/cli_usage.md) for command-line evaluation options
 - **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](docs/evaluate_huggingface_model.md)
+- **Understand model arguments**: Read our [Model Arguments guide](docs/model_arguments.md)
 - **Create custom benchmarks**: Follow our [benchmark creation guide](docs/add_new_benchmark_guide.md)
 - **Scale your evaluations**: Use [Determined AI integration](docs/using_determined.md) for distributed evaluation
 - **Understand your results**: Read our [results interpretation guide](docs/understanding_results_guide.md)
@@ -465,6 +483,7 @@ pip install eval_framework[transformers]
 
 ### Advanced Usage
 
+- **[Understanding Model Arguments](docs/model_arguments.md)** - Thorough guide on each constructor argument for salient model classes
 - **[Adding New Benchmarks](docs/add_new_benchmark_guide.md)** - Complete guide with practical examples for adding new benchmarks
 - **[Benchmarks and Metrics](docs/benchmarks_and_metrics.md)** - Comprehensive overview of all available benchmarks and evaluation metrics
 - **[Overview of Dataloading](docs/overview_dataloading.md)** - Explanation of dataloading and task/sample/message structure
````
The diff to `README.md` (hunks @@ -39,13 +39,25 @@, @@ -61,8 +73,9 @@, @@ -148,35 +161,37 @@, @@ -184,6 +199,7 @@, and @@ -199,6 +215,7 @@) repeats the README text embedded in PKG-INFO above verbatim, at the file's own line offsets: the `uv` installation instructions, the updated CLI example with `--task-name "MMLU"` and `--task-subjects`, the new Python quickstart, and the two new Model Arguments links.
````diff
--- eval_framework-0.2.1/pyproject.toml
+++ eval_framework-0.2.3/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.1"
+version = "0.2.3"
 description = "Evaluation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -40,6 +40,7 @@ dependencies = [
     "lxml>=6,<7",
     "python-iso639>=2025.2.18",
     "wandb>=0.21.1,<1",
+    "boto3>=1.40.54,<2",
 ]
 
 [project.optional-dependencies]
@@ -53,7 +54,11 @@ openai = [
     "openai>=1.62,<2",
     "tiktoken>=0.9,<0.10"
 ]
-transformers = [
+transformers = [
+    "transformers>=4.45.2,<5",
+    "torch>=2.5,<3",
+    "accelerate>=0.30.0,<1",
+]
 accelerate = ["accelerate"]
 vllm = [
     "vllm>=0.8.5,<0.9",
@@ -87,6 +92,7 @@ eval_framework = "eval_framework.run:run"
 dev = [
     "mypy>=1.10,<2",
     "pytest>=8.3.3,<9",
+    "pytest-mock>=3.14.1",
     "pytest-xdist>=3.6.1,<4",
     "pytest-sugar>1.1,<2",
     "types-pyyaml>=6.0.12.20240917,<7",
@@ -172,3 +178,6 @@ markers = [
 filterwarnings = [
     "ignore::DeprecationWarning:datasets.utils._dill:",
 ]
+env = [
+    "WANDB_MODE = disabled",
+]
````
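The new `env` table at the end of the pytest configuration disables Weights & Biases logging for the test session. (Assumption: this table is read by the pytest-env plugin, which this hunk does not show being added to the dev dependencies.) Its effect is the same as setting the variable before `wandb` is imported, as in this minimal sketch:

```python
import os

# What the pytest `env` entry arranges for the whole test session (sketch):
os.environ["WANDB_MODE"] = "disabled"

import wandb  # noqa: E402  (imported after the env var on purpose)

# With WANDB_MODE=disabled, wandb.init() does no network I/O and returns a
# no-op run object, so tests never create real W&B runs.
run = wandb.init(project="eval-framework-tests")  # hypothetical project name
run.finish()
```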
````diff
--- eval_framework-0.2.1/src/eval_framework/context/determined.py
+++ eval_framework-0.2.3/src/eval_framework/context/determined.py
@@ -42,10 +42,12 @@ class Hyperparameters(BaseModel):
     wandb_project: str | None = None
     wandb_entity: str | None = None
     wandb_run_id: str | None = None
+    wandb_upload_results: bool | None = None
     description: str | None = None
     task_args: TaskArgs
     llm_args: dict[str, Any] | None = {}
     extra_task_modules: list[str] | None = None
+    delete_output_dir_after_upload: bool | None = None
 
 
 class DeterminedContext(EvalContext):
@@ -88,7 +90,9 @@ class DeterminedContext(EvalContext):
             "wandb_project",
             "wandb_entity",
             "wandb_run_id",
+            "wandb_upload_results",
             "description",
+            "delete_output_dir_after_upload",
         ]:
             val_cli = getattr(self, name, None)
             val_hparams = getattr(self.hparams, name, None)
@@ -112,13 +116,16 @@ class DeterminedContext(EvalContext):
             if val_cli and val_hparams and val_cli != val_hparams:
                 logger.info(f"CLI argument {name} ({val_cli}) is being overridden by hyperparameters: ({val_hparams}).")
 
-
-
+        # Hyperparameters take precedence over core context
+        llm_name = self.hparams.llm_name or self.llm_name
+        judge_model_name = self.hparams.task_args.judge_model_name or self.judge_model_name
 
         llm_class = _load_model(llm_name, models_path=self.models_path)
-        llm_judge_class: type[BaseLLM] | None =
-
-
+        llm_judge_class: type[BaseLLM] | None = (
+            _load_model(judge_model_name, models_path=self.judge_models_path, info="judge")
+            if judge_model_name
+            else None
+        )
 
         # for all optional hyperparameters, resort to the respective CLI argument if the hyperparameter is not set
         self.config = EvalConfig(
@@ -139,8 +146,11 @@ class DeterminedContext(EvalContext):
             wandb_project=self.hparams.wandb_project or self.wandb_project,
             wandb_entity=self.hparams.wandb_entity or self.wandb_entity,
             wandb_run_id=self.hparams.wandb_run_id or self.wandb_run_id,
+            wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
             batch_size=self.hparams.task_args.batch_size or self.batch_size,
             description=self.hparams.description or self.description,
+            delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
+            or self.delete_output_dir_after_upload,
         )
 
         return self
````
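The resolution above relies on Python's `or`, so a hyperparameter wins whenever it is truthy. A standalone sketch (a hypothetical helper, not framework code) of the rule, including the edge case this creates for the new boolean flags: an explicit `False` hyperparameter cannot override a truthy CLI value.

```python
def resolve(hparam_value, cli_value):
    # Mirrors the `self.hparams.X or self.X` pattern: hyperparameters take precedence.
    return hparam_value or cli_value

assert resolve("llm-from-hparams", "llm-from-cli") == "llm-from-hparams"
assert resolve(None, "llm-from-cli") == "llm-from-cli"
# Edge case for bool | None fields such as wandb_upload_results:
assert resolve(False, True) is True  # False is falsy, so the CLI value wins
```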
````diff
--- eval_framework-0.2.1/src/eval_framework/context/eval.py
+++ eval_framework-0.2.3/src/eval_framework/context/eval.py
@@ -61,6 +61,7 @@ class EvalContext(AbstractContextManager):
         wandb_project: str | None = None,
         wandb_entity: str | None = None,
         wandb_run_id: str | None = None,
+        wandb_upload_results: bool | None = None,
         hf_upload_dir: str | None = None,
         hf_upload_repo: str | None = None,
         llm_args: dict[str, Any] | None = None,
@@ -72,6 +73,7 @@ class EvalContext(AbstractContextManager):
         perturbation_type: str | None = None,
         perturbation_probability: float | None = None,
         perturbation_seed: int | None = None,
+        delete_output_dir_after_upload: bool | None = None,
     ) -> None:
         self.llm_name = llm_name
         self.models_path = models_path
@@ -85,6 +87,7 @@ class EvalContext(AbstractContextManager):
         self.wandb_project = wandb_project
         self.wandb_entity = wandb_entity
         self.wandb_run_id = wandb_run_id
+        self.wandb_upload_results = wandb_upload_results
         self.hf_upload_dir = hf_upload_dir
         self.hf_upload_repo = hf_upload_repo
         self.llm_args = llm_args if llm_args is not None else {}
@@ -93,6 +96,7 @@ class EvalContext(AbstractContextManager):
         self.judge_model_args = judge_model_args if judge_model_args is not None else {}
         self.batch_size = batch_size
         self.description = description
+        self.delete_output_dir_after_upload = delete_output_dir_after_upload
 
         if perturbation_type or perturbation_probability is not None:
             perturbation = {
````
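A hypothetical usage sketch of the widened constructor. Only keyword arguments visible in this diff are used; `EvalContext` takes many more, so this is illustrative rather than a complete call:

```python
from eval_framework.context.eval import EvalContext

# Sketch: route the two new 0.2.3 flags through the context
# (remaining constructor arguments left at their defaults).
ctx = EvalContext(
    llm_name="Smollm135MInstruct",
    wandb_project="my-evals",              # hypothetical project name
    wandb_upload_results=True,             # new: upload results to W&B
    delete_output_dir_after_upload=True,   # new: remove local outputs after upload
)
```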
````diff
--- eval_framework-0.2.1/src/eval_framework/context/local.py
+++ eval_framework-0.2.3/src/eval_framework/context/local.py
@@ -20,7 +20,7 @@ def _load_model(llm_name: str, models_path: str | PathLike | None, *, info: str
     if models_path is None or "." in llm_name:
         # The llm_name must be a fully qualified module path
         if "." not in llm_name:
-            raise ValueError(f"LLM {info}'{llm_name}' is not a fully qualified module path.")
+            raise ValueError(f"LLM {info} '{llm_name}' is not a fully qualified module path.")
         module_path, llm_class_name = llm_name.rsplit(".", 1)
         module = importlib.import_module(module_path)
         if not hasattr(module, llm_class_name):
@@ -31,7 +31,7 @@ def _load_model(llm_name: str, models_path: str | PathLike | None, *, info: str
     if llm_name not in models_dict:
         if info:
             info = f"{info.strip()} "
-        raise ValueError(f"LLM {info}'{llm_name}' not found in {models_path}.")
+        raise ValueError(f"LLM {info} '{llm_name}' not found in {models_path}.")
     return models_dict[llm_name]
 
 
@@ -58,10 +58,12 @@ class LocalContext(EvalContext):
             wandb_entity=self.wandb_entity,
             wandb_project=self.wandb_project,
             wandb_run_id=self.wandb_run_id,
+            wandb_upload_results=self.wandb_upload_results,
             llm_judge_class=self.llm_judge_class,
             judge_model_args=self.judge_model_args,
             batch_size=self.batch_size,
             description=self.description,
+            delete_output_dir_after_upload=self.delete_output_dir_after_upload,
         )
 
         return self
````
````diff
--- eval_framework-0.2.1/src/eval_framework/llm/aleph_alpha.py
+++ eval_framework-0.2.3/src/eval_framework/llm/aleph_alpha.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import logging
+import math
 import os
 import random
 import re
@@ -43,6 +44,7 @@ def safe_json_loads(s: str) -> dict:
 class AlephAlphaAPIModel(BaseLLM):
     LLM_NAME: str
     DEFAULT_FORMATTER: Callable[[], BaseFormatter] | None = None
+    BYTES_PER_TOKEN: float = 4.0  # rule of thumb according to https://platform.openai.com/tokenizer
 
     def __init__(
         self,
@@ -53,6 +55,7 @@ class AlephAlphaAPIModel(BaseLLM):
         max_async_concurrent_requests: int = 32,
         request_timeout_seconds: int = 30 * 60 + 5,
         queue_full_timeout_seconds: int = 30 * 60 + 5,
+        bytes_per_token: float | None = None,
     ) -> None:
         self._formatter: BaseFormatter
         if formatter is None:
@@ -67,6 +70,12 @@ class AlephAlphaAPIModel(BaseLLM):
         self.request_timeout_seconds = request_timeout_seconds
         self.queue_full_timeout_seconds = queue_full_timeout_seconds
         self._validate_model_availability()
+        # set bytes_per_token_scalar for non-standard models
+        if bytes_per_token is not None and bytes_per_token <= 0:
+            raise ValueError("bytes_per_token must be positive")
+        self.bytes_per_token_scalar = (
+            4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
+        )
 
     def _validate_model_availability(self) -> None:
         """
@@ -250,11 +259,14 @@ class AlephAlphaAPIModel(BaseLLM):
 
         requests = []
 
+        # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
+        scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
+
         for single_messages in messages:
             requests.append(
                 CompletionRequest(
                     prompt=Prompt.from_text(self._formatter.format(single_messages, output_mode="string")),
-                    maximum_tokens=
+                    maximum_tokens=scaled_max_tokens,
                     stop_sequences=stop_sequences,
                     temperature=effective_temperature,
                 )
````
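The arithmetic behind the new scaling, as a worked example: the class default `BYTES_PER_TOKEN = 4.0` encodes the roughly-4-bytes-per-token rule of thumb, and the scalar is `4.0 / bytes_per_token`, so a model whose tokenizer produces about 2 bytes per token gets twice the token budget for the same text (the values below are hypothetical):

```python
import math

DEFAULT_BYTES_PER_TOKEN = 4.0  # class default shown in the diff
bytes_per_token = 2.0          # hypothetical, less byte-efficient tokenizer

scalar = DEFAULT_BYTES_PER_TOKEN / bytes_per_token  # -> 2.0
max_tokens = 100                                    # budget injected by a task

# Same computation generate_from_messages now performs before building requests:
scaled_max_tokens = math.ceil(max_tokens * scalar)
assert scaled_max_tokens == 200
```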
````diff
--- /dev/null
+++ eval_framework-0.2.3/src/eval_framework/llm/base.py
@@ -0,0 +1,180 @@
+from abc import ABC, abstractmethod
+from collections.abc import Sequence
+from pathlib import Path
+from typing import Any
+
+from eval_framework.shared.types import RawCompletion, RawLoglikelihood
+from eval_framework.tasks.base import Sample
+from template_formatting.formatter import BaseFormatter, Message
+
+
+class BaseLLM(ABC):
+    @property
+    def name(self) -> str:
+        """
+        This property is used to name the results folder and identify the eval results.
+        Overwrite this property in the subclass with e.g. the checkpoint name/huggingface model name."""
+        return self.__class__.__name__
+
+    @abstractmethod
+    def generate_from_messages(
+        self,
+        messages: list[Sequence[Message]],
+        stop_sequences: list[str] | None = None,
+        max_tokens: int | None = None,
+        temperature: float | None = None,
+    ) -> list[RawCompletion]:
+        """
+        stop_sequences and max_tokens are injected by the task if they exist. They should be overwritten or
+        extended with the properties of the model. This includes but is not limited to the stop tokens
+        used by the evaluated checkpoint (e.g. <|eot_id|> for an instruction finetuned Llama3.1, <|endoftext|>
+        for a pretrained Llama3.1).
+
+        This function is expected to raise errors which are caught and reported when running the eval.
+        Please also make sure to raise an error in case of sequence length issues. We expect to always
+        raise an error if something impedes the expected completion of a task.
+
+        Important! The completion is expected to be detokenized and to NOT contain special tokens.
+
+        Returns: List[RawCompletion]
+        """
+        raise NotImplementedError
+
+    def generate_from_samples(
+        self,
+        samples: list[Sample],
+        stop_sequences: list[str] | None = None,
+        max_tokens: int | None = None,
+        temperature: float | None = None,
+    ) -> list[RawCompletion]:
+        """
+        stop_sequences and max_tokens are injected by the task if they exist. They should be overwritten or
+        extended with the properties of the model. This includes but is not limited to the stop tokens
+        used by the evaluated checkpoint (e.g. <|eot_id|> for an instruction finetuned Llama3.1, <|endoftext|>
+        for a pretrained Llama3.1).
+
+        This function is expected to raise errors which are caught and reported when running the eval.
+        Please also make sure to raise an error in case of sequence length issues. We expect to always
+        raise an error if something impedes the expected completion of a task.
+
+        Important! The completion is expected to be detokenized and to NOT contain special tokens.
+
+        Returns: List[RawCompletion]
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+        """
+        This function is expected to raise errors which are caught and reported when running the eval.
+        Please also make sure to raise an error in case of sequence length issues. We expect to always
+        raise an error if something prevents the expected completion of a task.
+        """
+        raise NotImplementedError
+
+    def generate(
+        self,
+        samples: list[Sample],
+        stop_sequences: list[str] | None = None,
+        max_tokens: int | None = None,
+        temperature: float | None = None,
+    ) -> list[RawCompletion]:
+        """Generates a model response for each sample.
+
+        Uses 'generate_from_samples' to generate responses if implemented,
+        otherwise falls back to 'generate_from_messages'.
+        """
+        try:
+            return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature)
+        except NotImplementedError:
+            messages: list[Sequence[Message]] = [sample.messages for sample in samples]
+            return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature)
+
+    def post_process_completion(self, completion: str, sample: Sample) -> str:
+        """
+        Model-specific post-processing of generated completions.
+
+        Override this method to apply model-specific cleanup or transformations
+        (e.g., removing specific artifacts such as reasoning traces, handling special tokens).
+
+        Args:
+            completion: The raw completion string from the model
+            sample: The sample that was used to generate the completion
+
+        Returns:
+            The post-processed completion string
+        """
+        return completion
+
+    def __del__(self) -> None:
+        """
+        Method for custom resource cleanup (particularly GPUs)
+        """
+        pass
+
+    def _get_final_checkpoint(
+        self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
+    ) -> tuple[str | Path | None, str | None]:
+        if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
+            if not getattr(self, "LLM_NAME", ""):
+                raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
+            return None, None  # no argument given, so will use the LLM_NAME of the class
+        elif num_provided > 1:
+            raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
+
+        elif checkpoint_path is not None:
+            return checkpoint_path, str(checkpoint_path)
+
+        elif model_name is not None:
+            return model_name, model_name
+
+        else:
+            from eval_framework.utils.file_ops import WandbFs
+
+            assert artifact_name is not None
+            artifact_base, version = artifact_name.split(":", 1) if ":" in artifact_name else (artifact_name, "latest")
+            with WandbFs() as wandb_fs:
+                self.artifact = wandb_fs.get_artifact(artifact_base, version)  # self.artifact being read in main()
+                wandb_fs.download_artifact(self.artifact)
+                file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
+                if file_root is None:
+                    raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
+                return file_root, artifact_name
+
+    def _get_final_formatter(
+        self,
+        formatter: BaseFormatter | None = None,
+        formatter_name: str | None = None,
+        formatter_kwargs: dict[str, Any] | None = None,
+    ) -> BaseFormatter | None:
+        if (num_provided := sum(x is not None for x in [formatter, formatter_name])) == 0:
+            return None  # none given, so will use the default of the class
+        elif num_provided > 1:
+            raise ValueError("At most one of `formatter` or `formatter_name` must be provided.")
+
+        if formatter:
+            if formatter_kwargs:
+                raise ValueError("Cannot provide `formatter_kwargs` when `formatter` is provided.")
+            return formatter
+        elif formatter_name:
+            kwargs = formatter_kwargs or {}
+            match formatter_name:
+                case "Llama3Formatter":
+                    from template_formatting.formatter import Llama3Formatter
+
+                    return Llama3Formatter()
+                case "MistralFormatter" | "MagistralFormatter":
+                    from eval_framework.llm.mistral import MagistralFormatter
+
+                    return MagistralFormatter(**kwargs)
+                case "ConcatFormatter":
+                    from template_formatting.formatter import ConcatFormatter
+
+                    return ConcatFormatter()
+                case "HFFormatter":
+                    from template_formatting.formatter import HFFormatter
+
+                    return HFFormatter(**kwargs)
+                case _:
+                    raise ValueError(f"Unsupported formatter: {formatter_name}.")
+        return None
````