eval-framework 0.2.6__tar.gz → 0.2.8__tar.gz
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
- {eval_framework-0.2.6 → eval_framework-0.2.8}/PKG-INFO +47 -29
- {eval_framework-0.2.6 → eval_framework-0.2.8}/README.md +45 -28
- {eval_framework-0.2.6 → eval_framework-0.2.8}/pyproject.toml +4 -1
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/determined.py +1 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/eval.py +2 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/local.py +1 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/evaluation_generator.py +4 -1
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/aleph_alpha.py +10 -6
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/math_reasoning_completion.py +10 -9
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/base.py +2 -1
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/comparison_grader.py +56 -4
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +110 -25
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +9 -0
- eval_framework-0.2.8/src/eval_framework/metrics/llm/utils.py +20 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/run.py +6 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/eval_config.py +1 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/generate_task_docs.py +24 -6
- {eval_framework-0.2.6 → eval_framework-0.2.8}/LICENSE +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.6 → eval_framework-0.2.8}/src/template_formatting/py.typed +0 -0
{eval_framework-0.2.6 → eval_framework-0.2.8}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.6
+Version: 0.2.8
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License
@@ -235,6 +235,7 @@ Requires-Dist: python-iso639>=2025.2.18
 Requires-Dist: wandb>=0.23.0,<1
 Requires-Dist: boto3>=1.40.54,<2
 Requires-Dist: numpy>=1.26.4
+Requires-Dist: antlr4-python3-runtime==4.11.0
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
@@ -268,10 +269,24 @@ Provides-Extra: transformers
 Provides-Extra: vllm
 Description-Content-Type: text/markdown
 
+<!-- Badges -->
+<div align="center">
+
 # Aleph Alpha Eval-Framework
 
-
-
+**Comprehensive LLM evaluation at scale** - A production-ready framework for evaluating large language models across 90+ benchmarks.
+
+[](https://github.com/Aleph-Alpha-Research/eval-framework/actions)
+[](https://github.com/Aleph-Alpha-Research/eval-framework/releases)
+[](https://pypi.org/project/eval-framework/)
+[](LICENSE)
+
+[](https://aleph-alpha-research.github.io/eval-framework/)
+[](https://github.com/Aleph-Alpha-Research/eval-framework/stargazers)
+
+![]()
+
+</div>
 
 ## Why Choose This Framework?
 
@@ -289,10 +304,12 @@ Description-Content-Type: text/markdown
 - Statistical Analysis: Includes confidence intervals and significance testing for reliable comparisons.
 - Docker Support: Pre-configured Dockerfiles for local and distributed setups.
 
+For full documentation, visit our [Docs Page](https://aleph-alpha-research.github.io/eval-framework/).
+
 ## Quick Start
 
 The codebase is tested and compatible with Python 3.12 and PyTorch 2.5.
-You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](
+You will also need the appropriate CUDA dependencies and version installed on your system for GPU support. Detailed installation instructions can be found [here](https://aleph-alpha-research.github.io/eval-framework/installation.html).
 
 The easiest way to get started is by installing the library via `pip` and use it as an external dependency.
 ```
@@ -350,7 +367,7 @@ eval_framework \
     --num-samples 10
 ```
 
-For more detailed CLI usage instructions, see the [CLI Usage Guide](
+For more detailed CLI usage instructions, see the [CLI Usage Guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html).
 
 ## Benchmark Coverage & Task Categories
 
@@ -403,7 +420,7 @@ Evaluation metrics include:
 - **LLM Metrics:** Chatbot Style Judge, Instruction Judge
 - **Efficiency Metrics:** Bytes per Sequence Position
 
-For the full list of tasks and metrics, see [Detailed Task Table](
+For the full list of tasks and metrics, see [Detailed Task Table](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html).
 
 ## Getting Started
 
@@ -419,9 +436,9 @@ Eval-Framework provides a unified interface for evaluating language models acros
 
 ### Core Components
 
-- **Models**: Defined via [`BaseLLM`](
-- **Tasks**: Inherit from [`BaseTask`](
-- **Metrics**: Automatic scoring via [`BaseMetric`](
+- **Models**: Defined via [`BaseLLM`](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html) interface (HuggingFace, OpenAI, custom APIs)
+- **Tasks**: Inherit from [`BaseTask`](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html) (completion, loglikelihood, or LLM-judge based)
+- **Metrics**: Automatic scoring via [`BaseMetric`](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html) classes
 - **Formatters**: Handle prompt construction and model-specific formatting
 - **Results**: Structured outputs with sample-level details and aggregated statistics
 
@@ -466,41 +483,42 @@ if __name__ == "__main__":
     results = main(llm=llm, config=config)
 ```
 
-3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](
+3. **Review results** - Check `./eval_results/` for detailed outputs and use our [results guide](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html) to interpret them
 
 ### Next Steps
 
-- **Use CLI interface**: See [CLI usage guide](
-- **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](
-- **Understand model arguments**: Read out [Model Arguments guide](
-- **Create custom benchmarks**: Follow our [benchmark creation guide](
-- **Scale your evaluations**: Use [Determined AI integration](
-- **Understand your results**: Read our [results interpretation guide](
-- **Log results in WandB**: See how [we integrate WandB](
+- **Use CLI interface**: See [CLI usage guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html) for command-line evaluation options
+- **Evaluate HuggingFace models**: Follow our [HuggingFace evaluation guide](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html)
+- **Understand model arguments**: Read out [Model Arguments guide](https://aleph-alpha-research.github.io/eval-framework/model_arguments.html)
+- **Create custom benchmarks**: Follow our [benchmark creation guide](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html)
+- **Scale your evaluations**: Use [Determined AI integration](https://aleph-alpha-research.github.io/eval-framework/using_determined.html) for distributed evaluation
+- **Understand your results**: Read our [results interpretation guide](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html)
+- **Log results in WandB**: See how [we integrate WandB](https://aleph-alpha-research.github.io/eval-framework/wandb_integration.html) for metric and lineage tracking
 
 ## Documentation
 
 ### Getting Started
 
-- **[CLI Usage Guide](
-- **[Evaluating HuggingFace Models](
-- **[Understanding Results](
+- **[CLI Usage Guide](https://aleph-alpha-research.github.io/eval-framework/cli_usage.html)** - Detailed instructions for using the command-line interface
+- **[Evaluating HuggingFace Models](https://aleph-alpha-research.github.io/eval-framework/evaluate_huggingface_model.html)** - Complete guide for evaluating HuggingFace models
+- **[Understanding Results](https://aleph-alpha-research.github.io/eval-framework/understanding_results_guide.html)** - How to read and interpret evaluation results
 
 ### Advanced Usage
 
-- **[Understanding Model Arguments](
-- **[Adding New Benchmarks](
-- **[Benchmarks and Metrics](
-- **[Overview of Dataloading](
+- **[Understanding Model Arguments](https://aleph-alpha-research.github.io/eval-framework/model_arguments.html)** - Thorough guide on each constructor argument for salient model classes
+- **[Adding New Benchmarks](https://aleph-alpha-research.github.io/eval-framework/add_new_benchmark_guide.html)** - Complete guide with practical examples for adding new benchmarks
+- **[Benchmarks and Metrics](https://aleph-alpha-research.github.io/eval-framework/benchmarks_and_metrics.html)** - Comprehensive overview of all available benchmarks and evaluation metrics
+- **[Overview of Dataloading](https://aleph-alpha-research.github.io/eval-framework/overview_dataloading.html)** - Explanation of dataloading and task/sample/message structure
 
 ### Scaling & Production
 
-- **[Using Determined](
-- **[Controlling Upload Results](
+- **[Using Determined](https://aleph-alpha-research.github.io/eval-framework/using_determined.html)** - Guide for distributed evaluation using Determined AI
+- **[Controlling Upload Results](https://aleph-alpha-research.github.io/eval-framework/controlling_upload_results.html)** - How to manage and control the upload of evaluation results
 
 ### Contributing
 
-- **[Contributing Guide](CONTRIBUTING.
+- **[Contributing Guide](https://aleph-alpha-research.github.io/eval-framework/CONTRIBUTING.html)** - Guide for contributing to this project
+- **[Testing](https://aleph-alpha-research.github.io/eval-framework/testing.html)** - Guide for running tests comparable to the CI pipelines
 
 ### Citation
 
@@ -526,6 +544,6 @@ This project has received funding from the European Union’s Digital Europe Pro
 The contents of this publication are the sole responsibility of the OpenEuroLLM consortium and do not necessarily reflect the opinion of the European Union.
 
 <p align="center">
-<img src="
-<img src="
+<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_1.png" width="100" style="margin-right: 50px;"/>
+<img src="https://raw.githubusercontent.com/Aleph-Alpha-Research/eval-framework/main/docs/OELLM_2.png" width="350"/>
 </p>
````
{eval_framework-0.2.6 → eval_framework-0.2.8}/README.md

The seven README.md hunks (`@@ -1,7 +1,21 @@`, `@@ -19,10 +33,12 @@`, `@@ -80,7 +96,7 @@`, `@@ -133,7 +149,7 @@`, `@@ -149,9 +165,9 @@`, `@@ -196,41 +212,42 @@`, `@@ -256,6 +273,6 @@`) repeat, line for line at README-local offsets, the same changes shown in the README portion of PKG-INFO above: the centered badge block added under the title, the documentation links filled in with `https://aleph-alpha-research.github.io/eval-framework/` URLs, the new Testing entry under Contributing, and the OpenEuroLLM `<img>` tags pointed at `raw.githubusercontent.com`.
{eval_framework-0.2.6 → eval_framework-0.2.8}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.6"
+version = "0.2.8"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -42,6 +42,9 @@ dependencies = [
     "wandb>=0.23.0,<1",
     "boto3>=1.40.54,<2",
     "numpy>=1.26.4",
+    # is a dependency of sympy, but not explicitly listed in the requirements.txt
+    # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
+    "antlr4-python3-runtime==4.11.0",
 ]
 
 [project.optional-dependencies]
```
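The comment in the hunk explains the new pin: sympy's LaTeX parser is backed by a generated ANTLR grammar, and `antlr4-python3-runtime` is only imported when the parser is actually called, so a missing or mismatched runtime surfaces as a runtime error rather than at install time. A minimal sanity check of the call this pin protects (assumes `sympy` is installed; this is the same `parse_latex` used by the math-reasoning metric further down):

```python
# Quick check that the pinned ANTLR runtime satisfies sympy's LaTeX parser.
# parse_latex loads antlr4-python3-runtime lazily and raises at call time
# when the runtime is absent or its version mismatches the generated parser.
from sympy.parsing.latex import parse_latex

expr = parse_latex(r"\frac{1}{2} x + 1")
print(expr)  # x/2 + 1
```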
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/determined.py

```diff
@@ -149,6 +149,7 @@ class DeterminedContext(EvalContext):
             wandb_upload_results=self.hparams.wandb_upload_results or self.wandb_upload_results,
             batch_size=self.hparams.task_args.batch_size or self.batch_size,
             description=self.hparams.description or self.description,
+            randomize_judge_order=self.randomize_judge_order,
             delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
             or self.delete_output_dir_after_upload,
         )
```
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/eval.py

```diff
@@ -73,6 +73,7 @@ class EvalContext(AbstractContextManager):
         perturbation_type: str | None = None,
         perturbation_probability: float | None = None,
         perturbation_seed: int | None = None,
+        randomize_judge_order: bool = False,
         delete_output_dir_after_upload: bool | None = None,
     ) -> None:
         self.llm_name = llm_name
@@ -96,6 +97,7 @@ class EvalContext(AbstractContextManager):
         self.judge_model_args = judge_model_args if judge_model_args is not None else {}
         self.batch_size = batch_size
         self.description = description
+        self.randomize_judge_order = randomize_judge_order
         self.delete_output_dir_after_upload = delete_output_dir_after_upload
 
         if perturbation_type or perturbation_probability is not None:
```
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/context/local.py

```diff
@@ -63,6 +63,7 @@ class LocalContext(EvalContext):
             judge_model_args=self.judge_model_args,
             batch_size=self.batch_size,
             description=self.description,
+            randomize_judge_order=self.randomize_judge_order,
             delete_output_dir_after_upload=self.delete_output_dir_after_upload,
         )
 
```
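Taken together, the three context hunks above thread a single new flag from the entry point down to the metric layer: `EvalContext.__init__` grows a `randomize_judge_order: bool = False` parameter, and `LocalContext`/`DeterminedContext` forward it into the config they build (the one-line change to `tasks/eval_config.py` in the file list adds the matching field). A hedged usage sketch; the `LocalContext` signature is abridged to the arguments visible in the hunks:

```python
from eval_framework.context.local import LocalContext

# Hypothetical caller enabling the new flag; all other constructor
# arguments are unchanged by this release and omitted here.
context = LocalContext(
    llm_name="my-model",
    randomize_judge_order=True,  # new in 0.2.8, defaults to False
)
# Both LocalContext and DeterminedContext copy the flag into the config
# they build, where EvaluationGenerator (next hunk) reads it as
# config.randomize_judge_order.
```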
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/evaluation_generator.py

```diff
@@ -67,7 +67,10 @@ class EvaluationGenerator:
             if llm_judge is None:
                 assert self.config.llm_judge_class is not None, "The llm_judge_class must be defined in the config."
                 llm_judge = self.config.llm_judge_class(**self.config.judge_model_args)
-            metric = metric_class(
+            metric = metric_class(
+                llm_judge=llm_judge,
+                randomize_order=self.config.randomize_judge_order,
+            )
         else:
             metric = metric_class()
 
```
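One consequence of this call-site change: any judge-backed metric handed to `EvaluationGenerator` must now accept a `randomize_order` keyword. With the `BaseLLMJudgeMetric` change further down, a conforming subclass needs nothing beyond passing the flag through; a sketch (the subclass itself is hypothetical):

```python
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric

# Hypothetical judge metric compatible with the new construction call.
# BaseLLMJudgeMetric (see the metrics/llm/base.py hunk below) stores the
# flag as self._randomize_order for subclasses to consult.
class MyComparisonJudgeMetric(BaseLLMJudgeMetric):
    def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None:
        super().__init__(llm_judge, randomize_order=randomize_order)
```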
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/llm/aleph_alpha.py

```diff
@@ -55,6 +55,8 @@ class AlephAlphaAPIModel(BaseLLM):
         request_timeout_seconds: int = 30 * 60 + 5,
         queue_full_timeout_seconds: int = 30 * 60 + 5,
         bytes_per_token: float | None = None,
+        token: str = os.getenv("AA_TOKEN", "dummy"),
+        base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
     ) -> None:
         self._formatter: BaseFormatter
         if formatter is None:
@@ -69,7 +71,9 @@ class AlephAlphaAPIModel(BaseLLM):
         self.max_retries = max_retries
         self.request_timeout_seconds = request_timeout_seconds
         self.queue_full_timeout_seconds = queue_full_timeout_seconds
-        self.
+        self.token = token
+        self.base_url = base_url
+        self._validate_model_availability(base_url, token)
         # set bytes_per_token_scalar for non-standard models
         if bytes_per_token is not None and bytes_per_token <= 0:
             raise ValueError("bytes_per_token must be positive")
@@ -77,15 +81,15 @@ class AlephAlphaAPIModel(BaseLLM):
             4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
         )
 
-    def _validate_model_availability(self) -> None:
+    def _validate_model_availability(self, base_url: str, token: str) -> None:
         """
         Validate that the model name is available by making a test request.
         """
         try:
             # 'Client' object does not support the context manager protocol
             client = Client(
-                host=
-                token=
+                host=base_url,
+                token=token,
             )
 
             request = CompletionRequest(
@@ -190,10 +194,10 @@ class AlephAlphaAPIModel(BaseLLM):
         """Process multiple requests concurrently, returning request/response pairs."""
         semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
         async with AsyncClient(
-            host=
+            host=self.base_url,
             nice=True,
             request_timeout_seconds=self.request_timeout_seconds,
-            token=
+            token=self.token,
             total_retries=0,  # we have a custom retry policy in _request_with_backoff()
         ) as client:
             tasks = (
```
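The constructor now takes the token and endpoint as parameters whose defaults read `AA_TOKEN` and `AA_INFERENCE_ENDPOINT`, stores them on the instance, and passes them through to both the validation `Client` and the `AsyncClient`. Note the usual Python caveat: default expressions like `os.getenv(...)` are evaluated once, when the module is imported, so changing the environment afterwards does not affect the defaults. A usage sketch, shown on the base class for brevity (the endpoint URL is a placeholder; other constructor arguments are unchanged and omitted):

```python
import os

from eval_framework.llm.aleph_alpha import AlephAlphaAPIModel

# Explicit arguments override the import-time env defaults ("dummy" /
# "dummy_endpoint" when AA_TOKEN / AA_INFERENCE_ENDPOINT are unset).
model = AlephAlphaAPIModel(
    token=os.environ["AA_TOKEN"],
    base_url="https://inference.example.com",  # hypothetical endpoint
)
```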
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/completion/math_reasoning_completion.py

```diff
@@ -204,10 +204,15 @@ class MathReasoningCompletion(BaseMetric[Completion]):
         timeout = 10
         # latex parse all ingested ground truth values for math reasoning
         for gt in response.ground_truth_list:
+            if gt is None:
+                continue
             signal.signal(signal.SIGALRM, timeout_handler)  # Set timeout signal
             signal.alarm(timeout)  # Set timeout duration
             try:
-
+                gt_normalized = self.normalize_expression(gt)
+                gt_parsed = parse_latex(
+                    gt_normalized
+                )  # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
                 ground_truths.append(gt_parsed)
             except Exception:
                 ground_truths.append(gt)
@@ -229,15 +234,11 @@ class MathReasoningCompletion(BaseMetric[Completion]):
             )
         ]
         else:
-
-
-            assert isinstance(response.ground_truth, str)
-            str_is_correct = self._is_str_correct(normalized_response, response.ground_truth)
-            return [
-                MetricResult(
-                    metric_name=self.NAME, value=float(str_is_correct), higher_is_better=True, error=response.error
-                )
+            normalized_ground_truths = [
+                self.normalize_expression(gt) for gt in response.ground_truth_list if gt is not None
             ]
+            res = self._any_str_correct([normalized_response], normalized_ground_truths)
+            return [MetricResult(metric_name=self.NAME, value=float(res), higher_is_better=True, error=response.error)]
 
     def _any_str_correct(self, response_list: list, ground_truths: list) -> bool:
         """
```
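Two behavioral changes here: `None` entries in `ground_truth_list` are now skipped rather than reaching the parser, and the string-comparison fallback checks the response against every non-`None` ground truth instead of the single `response.ground_truth`. The SIGALRM guard around `parse_latex` (which can hang on pathological input) is worth seeing in isolation; a self-contained sketch of the pattern, Unix-only since `signal.SIGALRM` does not exist on Windows (the metric additionally runs its own `normalize_expression` first, omitted here):

```python
import signal

from sympy.parsing.latex import parse_latex

def timeout_handler(signum, frame):
    raise TimeoutError("parse_latex timed out")

def parse_ground_truth(gt: str, timeout: int = 10):
    """Parse a LaTeX ground truth, falling back to the raw string on failure."""
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout)  # deliver SIGALRM after `timeout` seconds
    try:
        return parse_latex(gt)  # e.g. r"f(x) = 0" -> Eq(f(x), 0)
    except Exception:  # parse errors and the timeout alike fall back to the string
        return gt
    finally:
        signal.alarm(0)  # cancel any pending alarm

print(parse_ground_truth(r"f(x) = 0"))
```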
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/base.py

```diff
@@ -6,8 +6,9 @@ from eval_framework.shared.types import Completion, Error
 
 
 class BaseLLMJudgeMetric(BaseMetric[Completion]):
-    def __init__(self, llm_judge: BaseLLM) -> None:
+    def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None:
         self._llm_judge = llm_judge
+        self._randomize_order = randomize_order
 
     def _create_metric_result(
         self,
```
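The flag is stored but not used in this base class; it is consumed by the pairwise judges (`llm_judge_mtbench_pair.py` gains 110 lines in this release). A hedged sketch of the hand-off a subclass would make when calling the comparison grader below; deriving the seed from a per-sample identifier, for reproducible swaps, is an assumption not shown in this diff:

```python
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric

# Hypothetical consumer: the flag stored on the metric becomes the grader's
# randomize_order argument (signature per the comparison_grader hunk below).
class MyPairwiseJudgeMetric(BaseLLMJudgeMetric):
    def _judge_pair(self, grader, instruction, candidate, reference, language, sample_id):
        return grader.grade(
            instruction=instruction,
            completion_1=candidate,
            completion_2=reference,
            language=language,
            randomize_order=self._randomize_order,
            seed=sample_id,  # assumption: a stable per-sample seed
        )
```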
{eval_framework-0.2.6 → eval_framework-0.2.8}/src/eval_framework/metrics/llm/graders/comparison_grader.py

```diff
@@ -1,3 +1,4 @@
+import random
 from collections.abc import Mapping
 from enum import Enum
 
@@ -8,6 +9,7 @@ from eval_framework.metrics.llm.graders.models import (
     PromptTemplateWithParseMap,
     parse_json_output,
 )
+from eval_framework.metrics.llm.utils import order_answers_for_comparison
 
 
 class MatchOutcome(str, Enum):
@@ -23,6 +25,14 @@ class MatchOutcome(str, Enum):
             return (0.5, 0.5)
         return (0, 1)
 
+    def flip(self) -> "MatchOutcome":
+        """Flip the outcome (A_WINS <-> B_WINS, DRAW stays DRAW)."""
+        if self == self.A_WINS:
+            return MatchOutcome.B_WINS
+        if self == self.B_WINS:
+            return MatchOutcome.A_WINS
+        return self  # DRAW stays DRAW
+
     @staticmethod
     def from_rank_literal(rank: int) -> "MatchOutcome":
         match rank:
@@ -122,25 +132,67 @@ Answer 2:
         self._prompt_templates = prompt_templates
 
     def grade(
-        self,
+        self,
+        instruction: str,
+        completion_1: str,
+        completion_2: str,
+        language: Language,
+        randomize_order: bool = False,
+        seed: int | None = None,
     ) -> ComparisonGradingOutput:
+        """Grade two completions by comparing them.
+
+        Args:
+            instruction: The instruction/task that was given.
+            completion_1: The first completion (typically the candidate).
+            completion_2: The second completion (typically the reference).
+            language: The language for the grading prompts.
+            randomize_order: If True, randomly swap the order of completions to eliminate
+                position bias.
+            seed: Optional random seed for reproducibility. If None and randomize_order
+                is True, uses a random swap decision.
+
+        Returns:
+            ComparisonGradingOutput with the outcome corrected for any position swap,
+            so outcome always reflects completion_1 vs completion_2 regardless of
+            presentation order to the judge.
+        """
         prompt_template = language.language_config(self._prompt_templates)
+
+        # Determine whether to swap the order
+        if randomize_order:
+            rng = random.Random(seed)
+            swap_order = rng.choice([True, False])
+        else:
+            swap_order = False
+
+        # Apply the swap if needed
+        actual_answer_1, actual_answer_2 = order_answers_for_comparison(completion_1, completion_2, swap_order)
+
         messages = prompt_template.to_messages(
             [],
             [
                 (self.INSTRUCTION_KEY, instruction),
-                (self.ANSWER_1_KEY,
-                (self.ANSWER_2_KEY,
+                (self.ANSWER_1_KEY, actual_answer_1),
+                (self.ANSWER_2_KEY, actual_answer_2),
             ],
         )
 
         raw_completion = self._grading_model.generate_from_messages([messages])[0]
         loaded_json = parse_json_output(raw_completion.completion)
+
+        # Get the raw outcome from the judge
+        raw_outcome: MatchOutcome | None = prompt_template.parse_map.get(
+            str(loaded_json.get(self.BETTER_ANSWER_KEY, None)), None
+        )
+
+        # Correct the outcome if we swapped the order
+        # If swapped: "Answer 1 is better" means completion_2 is better (B_WINS from completion_1's perspective)
+        final_outcome = raw_outcome.flip() if swap_order and raw_outcome is not None else raw_outcome
+
         return ComparisonGradingOutput(
             reasoning=loaded_json.get(self.REASONING_KEY, None),
-            outcome=
+            outcome=final_outcome,
             judge_prompt=raw_completion.prompt,
             judge_response=raw_completion.completion,
         )
```
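`order_answers_for_comparison` comes from the new `metrics/llm/utils.py` (+20 lines), whose body is not part of the hunks shown; judging from its call site it is plausibly a simple conditional swap. A sketch under that assumption, plus the invariant `flip()` restores (member names as in the class above): the returned outcome is always expressed as completion_1 vs completion_2, whatever order the judge actually saw.

```python
from eval_framework.metrics.llm.graders.comparison_grader import MatchOutcome

# Plausible shape of the unshown helper: swap presentation order on demand.
def order_answers_for_comparison(answer_1: str, answer_2: str, swap_order: bool) -> tuple[str, str]:
    return (answer_2, answer_1) if swap_order else (answer_1, answer_2)

# If the judge saw swapped answers and picked "Answer 1", the real winner
# is completion_2, which is exactly what flip() encodes:
assert MatchOutcome.A_WINS.flip() is MatchOutcome.B_WINS
assert MatchOutcome.B_WINS.flip() is MatchOutcome.A_WINS
assert MatchOutcome.DRAW.flip() is MatchOutcome.DRAW
```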