eval-framework 0.2.12__tar.gz → 0.2.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.12 → eval_framework-0.2.14}/PKG-INFO +2 -2
- {eval_framework-0.2.12 → eval_framework-0.2.14}/README.md +1 -1
- {eval_framework-0.2.12 → eval_framework-0.2.14}/pyproject.toml +3 -1
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/evaluation_generator.py +32 -6
- eval_framework-0.2.14/src/eval_framework/external/drop_process_results.py +250 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/aleph_alpha.py +7 -2
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/base.py +5 -2
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/huggingface.py +26 -3
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/openai.py +15 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/vllm.py +14 -4
- eval_framework-0.2.14/src/eval_framework/metrics/completion/drop_completion.py +47 -0
- eval_framework-0.2.14/src/eval_framework/metrics/completion/math_minerva_completion.py +103 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/math_reasoning_completion.py +2 -8
- eval_framework-0.2.14/src/eval_framework/metrics/completion/minerva_math_utils.py +394 -0
- eval_framework-0.2.14/src/eval_framework/metrics/loglikelihood/bits_per_byte.py +64 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/base.py +4 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc.py +28 -2
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/balancedcopa.py +56 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/copa.py +54 -6
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/csqa.py +90 -0
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/drop.py +207 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores200.py +23 -33
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/global_mmlu.py +532 -0
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/goldenswag.py +42 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gpqa.py +27 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag.py +2 -1
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/humaneval.py +44 -0
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/lab_bench.py +89 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/math_reasoning.py +149 -9
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mbpp.py +123 -6
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/medqa.py +83 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu.py +15 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +13 -0
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/naturalqs_open.py +100 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/openbookqa.py +45 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/piqa.py +29 -1
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sciq.py +50 -8
- eval_framework-0.2.14/src/eval_framework/tasks/benchmarks/social_iqa.py +231 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/squad.py +26 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/truthfulqa.py +31 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogrande.py +29 -1
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -2
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/task_names.py +50 -1
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/generate_task_docs.py +8 -4
- {eval_framework-0.2.12 → eval_framework-0.2.14}/LICENSE +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/run.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.12 → eval_framework-0.2.14}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.2.12
|
|
3
|
+
Version: 0.2.14
|
|
4
4
|
Summary: Evalulation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -377,7 +377,7 @@ Subset of core capabilities benchmarks covered by `eval-framework`:
|
|
|
377
377
|
|
|
378
378
|
| **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
|
|
379
379
|
|---------------|---------------|----------|------------|------------------------|------------------|
|
|
380
|
-
| COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
|
|
380
|
+
| COPA, BalancedCOPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
|
|
381
381
|
| Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
|
|
382
382
|
| Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
|
|
383
383
|
|
|
@@ -106,7 +106,7 @@ Subset of core capabilities benchmarks covered by `eval-framework`:
|
|
|
106
106
|
|
|
107
107
|
| **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
|
|
108
108
|
|---------------|---------------|----------|------------|------------------------|------------------|
|
|
109
|
-
| COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
|
|
109
|
+
| COPA, BalancedCOPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
|
|
110
110
|
| Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
|
|
111
111
|
| Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
|
|
112
112
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.2.12"
|
|
3
|
+
version = "0.2.14"
|
|
4
4
|
description = "Evalulation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -105,6 +105,7 @@ dev = [
|
|
|
105
105
|
"types-requests>=2.32.0.20250328,<3",
|
|
106
106
|
"plotly>=5.24.1,<6",
|
|
107
107
|
"ruff>=0.12.8",
|
|
108
|
+
"scipy>=1.14.0,<2", # for tests comparing our Hungarian implementation to scipy
|
|
108
109
|
]
|
|
109
110
|
flash-attn = [
|
|
110
111
|
"flash-attn>=2.7.2.post1,<2.8",
|
|
@@ -178,6 +179,7 @@ addopts = "-p 'no:legacypath' --doctest-modules"
|
|
|
178
179
|
markers = [
|
|
179
180
|
"gpu: needs a GPU runner, otherwise test can not be run",
|
|
180
181
|
"cpu_slow: runs for a long time (on CPU)",
|
|
182
|
+
"slow_download: smoke tests that download large datasets (>15s); excluded from CI, run manually with -m slow_download",
|
|
181
183
|
"external_api: needs external services for execution",
|
|
182
184
|
"vllm: tests that specifically require vLLM functionality",
|
|
183
185
|
"formatter_hash: formatter consistency tests using hash comparisons",
|
|
@@ -134,16 +134,27 @@ class EvaluationGenerator:
|
|
|
134
134
|
# filter and count errors
|
|
135
135
|
total_count = len(data_subset)
|
|
136
136
|
|
|
137
|
-
mask =
|
|
137
|
+
mask = data_subset["error"].isnull()
|
|
138
138
|
data_subset_error_free = data_subset.loc[mask, ["subject", "key", "value"]]
|
|
139
|
-
# data_subset_error_free = data_subset[data_subset["error"].isnull()][["subject", "key", "value"]]
|
|
140
139
|
|
|
141
|
-
|
|
140
|
+
error_free_ratio = float(len(data_subset_error_free) / total_count)
|
|
141
|
+
aggregated_results[f"ErrorFreeRatio {metric}"] = error_free_ratio
|
|
142
142
|
|
|
143
143
|
# aggregate by key and subject first to have equal weights for all key / subject combinations
|
|
144
144
|
key_subject_mean = data_subset_error_free.groupby(["key", "subject"]).mean()
|
|
145
145
|
aggregated_results[f"Average {metric}"] = float(key_subject_mean[["value"]].mean()["value"])
|
|
146
146
|
|
|
147
|
+
if error_free_ratio < 1.0:
|
|
148
|
+
# Treat error samples (with value=None) as 0 for the "including errors" average
|
|
149
|
+
data_subset_with_errors = data_subset[["key", "subject", "value", "error"]].copy()
|
|
150
|
+
# Only fill value with 0 where there's an error (not for all None values)
|
|
151
|
+
error_mask = data_subset_with_errors["error"].notna()
|
|
152
|
+
data_subset_with_errors.loc[error_mask, "value"] = data_subset_with_errors.loc[
|
|
153
|
+
error_mask, "value"
|
|
154
|
+
].fillna(0.0)
|
|
155
|
+
key_subject_mean_with_errors = data_subset_with_errors.groupby(["key", "subject"])["value"].mean()
|
|
156
|
+
aggregated_results[f"Average {metric} (including Errors)"] = float(key_subject_mean_with_errors.mean())
|
|
157
|
+
|
|
147
158
|
std_err_mean_sum_of_squares = 0.0
|
|
148
159
|
std_err_mean_total_num_samples = 0.0
|
|
149
160
|
std_err_mean_num_subjects = 0
|
|
@@ -156,14 +167,29 @@ class EvaluationGenerator:
|
|
|
156
167
|
# group = data_subset[data[column] == name][["subject", "key", "value", "error"]]
|
|
157
168
|
group_total_count = len(group)
|
|
158
169
|
group_error_free = group[group["error"].isnull()][["subject", "key", "value"]]
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
)
|
|
170
|
+
group_error_free_ratio = float(len(group_error_free) / group_total_count)
|
|
171
|
+
aggregated_results[f"ErrorFreeRatio {metric} - {name[0]}"] = group_error_free_ratio
|
|
162
172
|
|
|
163
173
|
group_key_subject_mean = group_error_free.groupby(["key", "subject"]).mean()
|
|
164
174
|
value = float(group_key_subject_mean[["value"]].mean()["value"])
|
|
165
175
|
aggregated_results[f"Average {metric} - {name[0]}"] = value if not math.isnan(value) else None
|
|
166
176
|
|
|
177
|
+
if group_error_free_ratio < 1.0:
|
|
178
|
+
# Treat error samples (with value=None) as 0 for the "including errors" average
|
|
179
|
+
group_with_errors = group[["key", "subject", "value", "error"]].copy()
|
|
180
|
+
# Only fill value with 0 where there's an error (not for all None values)
|
|
181
|
+
error_mask = group_with_errors["error"].notna()
|
|
182
|
+
group_with_errors.loc[error_mask, "value"] = group_with_errors.loc[
|
|
183
|
+
error_mask, "value"
|
|
184
|
+
].fillna(0.0)
|
|
185
|
+
group_key_subject_mean_with_errors = group_with_errors.groupby(["key", "subject"])[
|
|
186
|
+
"value"
|
|
187
|
+
].mean()
|
|
188
|
+
value_with_errors = float(group_key_subject_mean_with_errors.mean())
|
|
189
|
+
aggregated_results[f"Average {metric} (including Errors) - {name[0]}"] = (
|
|
190
|
+
value_with_errors if not math.isnan(value_with_errors) else None
|
|
191
|
+
)
|
|
192
|
+
|
|
167
193
|
if not ("SequencePositions" in metric or "Bytes" in metric):
|
|
168
194
|
# calculate standard error for selected metrics
|
|
169
195
|
group_key_subject_std = group_error_free.groupby(["key", "subject"]).std()
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""DROP F1 and exact match evaluation.
|
|
2
|
+
|
|
3
|
+
Logic adapted from AllenNLP DROP evaluation:
|
|
4
|
+
https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import string
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
# Whole-word pattern for the English articles "a", "an" and "the"; no IGNORECASE flag is set,
# so it only matches these exact lowercase spellings.
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _linear_sum_assignment(cost_matrix: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
|
|
16
|
+
"""Solve the linear sum assignment problem (minimize cost) using the Hungarian algorithm.
|
|
17
|
+
|
|
18
|
+
Pure NumPy implementation to avoid scipy dependency. Returns (row_ind, col_ind) with row_ind sorted,
|
|
19
|
+
matching scipy.optimize.linear_sum_assignment API for rectangular matrices.
|
|
20
|
+
"""
|
|
21
|
+
cost = np.atleast_2d(np.asarray(cost_matrix, dtype=np.float64))
|
|
22
|
+
n_rows, n_cols = cost.shape
|
|
23
|
+
n = max(n_rows, n_cols)
|
|
24
|
+
# Pad to square with zeros so unassigned rows/cols contribute zero cost
|
|
25
|
+
C = np.zeros((n, n))
|
|
26
|
+
C[:n_rows, :n_cols] = cost
|
|
27
|
+
|
|
28
|
+
# Row and column reductions
|
|
29
|
+
u = np.min(C, axis=1)
|
|
30
|
+
u[u == np.inf] = 0
|
|
31
|
+
C = C - u[:, np.newaxis]
|
|
32
|
+
v = np.min(C, axis=0)
|
|
33
|
+
v[v == np.inf] = 0
|
|
34
|
+
C = C - v[np.newaxis, :]
|
|
35
|
+
|
|
36
|
+
# Starred zeros: assignment (1 = starred). Start with no stars.
|
|
37
|
+
star = np.zeros((n, n), dtype=np.intp)
|
|
38
|
+
row_covered = np.zeros(n, dtype=bool)
|
|
39
|
+
col_covered = np.zeros(n, dtype=bool)
|
|
40
|
+
prime = np.zeros((n, n), dtype=np.intp)
|
|
41
|
+
|
|
42
|
+
def find_zero() -> tuple[int, int] | None:
|
|
43
|
+
for i in range(n):
|
|
44
|
+
if row_covered[i]:
|
|
45
|
+
continue
|
|
46
|
+
for j in range(n):
|
|
47
|
+
if not col_covered[j] and C[i, j] == 0:
|
|
48
|
+
return (i, j)
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
def star_in_row(i: int) -> int | None:
|
|
52
|
+
for j in range(n):
|
|
53
|
+
if star[i, j]:
|
|
54
|
+
return j
|
|
55
|
+
return None
|
|
56
|
+
|
|
57
|
+
def star_in_col(j: int) -> int | None:
|
|
58
|
+
for i in range(n):
|
|
59
|
+
if star[i, j]:
|
|
60
|
+
return i
|
|
61
|
+
return None
|
|
62
|
+
|
|
63
|
+
def prime_in_row(i: int) -> int | None:
|
|
64
|
+
for j in range(n):
|
|
65
|
+
if prime[i, j]:
|
|
66
|
+
return j
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
while True:
|
|
70
|
+
# Cover columns containing a starred zero
|
|
71
|
+
col_covered[:] = False
|
|
72
|
+
for j in range(n):
|
|
73
|
+
for i in range(n):
|
|
74
|
+
if star[i, j]:
|
|
75
|
+
col_covered[j] = True
|
|
76
|
+
break
|
|
77
|
+
if np.all(col_covered):
|
|
78
|
+
break
|
|
79
|
+
prime[:] = 0
|
|
80
|
+
row_covered[:] = False
|
|
81
|
+
while True:
|
|
82
|
+
z = find_zero()
|
|
83
|
+
if z is None:
|
|
84
|
+
# No uncovered zero: find minimum uncovered value and adjust
|
|
85
|
+
min_val = np.inf
|
|
86
|
+
for i in range(n):
|
|
87
|
+
if not row_covered[i]:
|
|
88
|
+
for j in range(n):
|
|
89
|
+
if not col_covered[j] and C[i, j] < min_val:
|
|
90
|
+
min_val = C[i, j]
|
|
91
|
+
if min_val == np.inf or min_val <= 0:
|
|
92
|
+
min_val = 1e-10
|
|
93
|
+
for i in range(n):
|
|
94
|
+
if row_covered[i]:
|
|
95
|
+
C[i, :] += min_val
|
|
96
|
+
for j in range(n):
|
|
97
|
+
if not col_covered[j]:
|
|
98
|
+
C[:, j] -= min_val
|
|
99
|
+
continue
|
|
100
|
+
i, j = z
|
|
101
|
+
prime[i, j] = 1
|
|
102
|
+
cj = star_in_row(i)
|
|
103
|
+
if cj is None:
|
|
104
|
+
# Augmenting path: unstar starred, star primed along path
|
|
105
|
+
path = [(i, j)]
|
|
106
|
+
while True:
|
|
107
|
+
ji = star_in_col(path[-1][1])
|
|
108
|
+
if ji is None:
|
|
109
|
+
break
|
|
110
|
+
path.append((ji, path[-1][1]))
|
|
111
|
+
pj = prime_in_row(ji)
|
|
112
|
+
if pj is None:
|
|
113
|
+
break
|
|
114
|
+
path.append((ji, pj))
|
|
115
|
+
for pi, pj in path:
|
|
116
|
+
star[pi, pj] = 1 - star[pi, pj]
|
|
117
|
+
prime[:] = 0
|
|
118
|
+
row_covered[:] = False
|
|
119
|
+
col_covered[:] = False
|
|
120
|
+
break
|
|
121
|
+
row_covered[i] = True
|
|
122
|
+
col_covered[cj] = False
|
|
123
|
+
|
|
124
|
+
# Extract assignment: (row_ind, col_ind) for starred zeros, row_ind sorted
|
|
125
|
+
row_ind = np.array([i for i in range(n) for j in range(n) if star[i, j]], dtype=np.intp)
|
|
126
|
+
col_ind = np.array([j for i in range(n) for j in range(n) if star[i, j]], dtype=np.intp)
|
|
127
|
+
# Keep only assignments within original matrix
|
|
128
|
+
mask = (row_ind < n_rows) & (col_ind < n_cols)
|
|
129
|
+
row_ind = row_ind[mask]
|
|
130
|
+
col_ind = col_ind[mask]
|
|
131
|
+
# Sort by row index (scipy API)
|
|
132
|
+
perm = np.argsort(row_ind)
|
|
133
|
+
return row_ind[perm], col_ind[perm]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def process_results(doc: dict, results: list) -> dict[str, float]:
    """Compute DROP exact_match and F1 between predictions and gold answers.

    Args:
        doc: Must contain ``"answers"``, a list of gold answers (each a tuple or list of strings).
        results: List of predicted answers (one per prediction; for one completion use ``[pred]``).

    Returns:
        ``{"exact_match": ..., "f1": ...}`` holding the best score over all usable gold answers.
        Both values are 0.0 when no gold answer is usable.
    """
    preds, golds = results, doc["answers"]
    max_em = 0.0
    max_f1 = 0.0
    for gold_answer in golds:
        # Skip unusable gold answers before scoring, so no work is spent on them:
        # an empty answer, or one whose first span is a blank string.
        if not gold_answer:
            continue
        first_span = gold_answer[0]
        if isinstance(first_span, str) and not first_span.strip():
            continue
        exact_match, f1_score = get_metrics(preds, gold_answer)
        max_em = max(max_em, exact_match)
        max_f1 = max(max_f1, f1_score)
    return {"exact_match": max_em, "f1": max_f1}
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def get_metrics(predicted: list | str, gold: tuple | list) -> tuple[float, float]:
    """Score a prediction against a single gold answer.

    ``predicted`` and ``gold`` may each be one string or a list/tuple of strings.

    Returns:
        ``(exact_match, f1)``. ``exact_match`` is 1.0 when both sides have the same number of
        spans and the same set of normalized spans, otherwise 0.0. ``f1`` is the mean of the
        aligned per-bag F1 scores, rounded to two decimals.
    """
    pred_spans, pred_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    same_spans = len(pred_spans) == len(gold_spans) and set(pred_spans) == set(gold_spans)
    exact_match = 1.0 if same_spans else 0.0

    per_bag_f1 = _align_bags(pred_token_bags, gold_token_bags)
    mean_f1 = float(np.mean(per_bag_f1))
    return exact_match, round(mean_f1, 2)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _answer_to_bags(answer: list | tuple | str) -> tuple[list[str], list[set]]:
    """Turn an answer into normalized spans and their token sets.

    A list or tuple is treated as multiple spans; anything else is wrapped as
    a single span. Each span is stringified and passed through ``_normalize``.

    Returns:
        A pair ``(normalized_spans, token_bags)`` where ``token_bags[i]`` is the
        set of whitespace-separated tokens of ``normalized_spans[i]``.
    """
    spans = list(answer) if isinstance(answer, list | tuple) else [answer]
    normalized_spans = [_normalize(str(span)) for span in spans]
    token_bags = [set(span.split()) for span in normalized_spans]
    return normalized_spans, token_bags
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _align_bags(predicted: list[set], gold: list[set]) -> np.ndarray:
    """Compute the best one-to-one F1 alignment between gold and predicted bags.

    Builds a ``len(gold) x len(predicted)`` score matrix in which a cell holds
    the bag F1 only when the number constraint
    (``_match_numbers_if_present``) is satisfied, and stays 0 otherwise. The
    assignment is then solved on the negated scores.

    Returns:
        A 1-D array of length ``max(len(gold), len(predicted))``; entry ``i``
        is the score of the pair assigned to gold row ``i`` (0 if unassigned).
    """
    n_gold, n_pred = len(gold), len(predicted)
    scores = np.zeros([n_gold, n_pred])
    for g_idx, gold_bag in enumerate(gold):
        for p_idx, pred_bag in enumerate(predicted):
            if not _match_numbers_if_present(gold_bag, pred_bag):
                continue
            scores[g_idx, p_idx] = _compute_f1(pred_bag, gold_bag)

    # Scores are negated before solving; presumably the solver minimizes cost
    # (scipy-style API), which makes this a maximum-score assignment.
    assigned_rows, assigned_cols = _linear_sum_assignment(-scores)

    best = np.zeros([max(n_gold, n_pred)])
    for g_idx, p_idx in zip(assigned_rows, assigned_cols):
        best[g_idx] = max(best[g_idx], scores[g_idx, p_idx])
    return best
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _compute_f1(predicted_bag: set, gold_bag: set) -> float:
|
|
198
|
+
intersection = len(gold_bag.intersection(predicted_bag))
|
|
199
|
+
if not predicted_bag:
|
|
200
|
+
precision = 1.0
|
|
201
|
+
else:
|
|
202
|
+
precision = intersection / float(len(predicted_bag))
|
|
203
|
+
if not gold_bag:
|
|
204
|
+
recall = 1.0
|
|
205
|
+
else:
|
|
206
|
+
recall = intersection / float(len(gold_bag))
|
|
207
|
+
return (2 * precision * recall) / (precision + recall) if (precision or recall) else 0.0
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _match_numbers_if_present(gold_bag: set, predicted_bag: set) -> bool:
|
|
211
|
+
gold_numbers = {w for w in gold_bag if _is_number(w)}
|
|
212
|
+
predicted_numbers = {w for w in predicted_bag if _is_number(w)}
|
|
213
|
+
return (not gold_numbers) or bool(gold_numbers.intersection(predicted_numbers))
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _is_number(text: str) -> bool:
|
|
217
|
+
try:
|
|
218
|
+
float(text)
|
|
219
|
+
return True
|
|
220
|
+
except ValueError:
|
|
221
|
+
return False
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _remove_articles(text: str) -> str:
    """Replace every match of the module-level ``_ARTICLES`` pattern with a single space."""
    without_articles = _ARTICLES.sub(" ", text)
    return without_articles
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _white_space_fix(text: str) -> str:
|
|
229
|
+
return " ".join(text.split())
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _remove_punc(text: str) -> str:
    """Strip ASCII punctuation from ``text`` unless the text is a number.

    Numeric text is returned untouched so that characters such as the decimal
    point or a leading sign are preserved.
    """
    if _is_number(text):
        return text
    punctuation = set(string.punctuation)
    kept_chars = [ch for ch in text if ch not in punctuation]
    return "".join(kept_chars)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _fix_number(text: str) -> str:
    """Canonicalize numeric text via ``str(float(text))``; return other text unchanged."""
    if not _is_number(text):
        return text
    return str(float(text))
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _tokenize(text: str) -> list[str]:
|
|
242
|
+
return re.split(" |-", text)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _normalize(answer: str) -> str:
    """Normalize an answer string token by token.

    The answer is tokenized, and each token is lower-cased, stripped of
    punctuation, number-canonicalized, stripped of articles and
    whitespace-fixed, in that order. Tokens that end up blank are dropped and
    the rest are joined with single spaces.
    """
    kept: list[str] = []
    for raw_token in _tokenize(answer):
        token = _remove_punc(raw_token.lower())
        token = _fix_number(token)
        token = _remove_articles(token)
        token = _white_space_fix(token)
        if token.strip():
            kept.append(token)
    return " ".join(kept).strip()
|
|
@@ -200,11 +200,16 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
200
200
|
stop_sequences: list[str] | None = None,
|
|
201
201
|
max_tokens: int | None = None,
|
|
202
202
|
temperature: float | None = None,
|
|
203
|
+
top_p: float | None = None,
|
|
203
204
|
) -> list[RawCompletion]:
|
|
204
205
|
effective_temperature = temperature if temperature is not None else self._temperature
|
|
206
|
+
if effective_temperature is not None and not (0 <= effective_temperature <= 2):
|
|
207
|
+
raise ValueError(f"temperature must be between 0 and 2, got {effective_temperature}")
|
|
208
|
+
effective_top_p = top_p if top_p is not None else self._top_p
|
|
209
|
+
if effective_top_p is not None and not (0 < effective_top_p <= 1):
|
|
210
|
+
raise ValueError(f"top_p must be between 0 and 1 (exclusive), got {effective_top_p}")
|
|
205
211
|
|
|
206
212
|
requests: list[CompletionRequest] = []
|
|
207
|
-
|
|
208
213
|
# Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
|
|
209
214
|
scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
|
|
210
215
|
|
|
@@ -215,7 +220,7 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
215
220
|
maximum_tokens=scaled_max_tokens,
|
|
216
221
|
stop_sequences=stop_sequences,
|
|
217
222
|
temperature=effective_temperature,
|
|
218
|
-
top_p=
|
|
223
|
+
top_p=effective_top_p,
|
|
219
224
|
)
|
|
220
225
|
)
|
|
221
226
|
|
|
@@ -24,6 +24,7 @@ class BaseLLM(ABC):
|
|
|
24
24
|
stop_sequences: list[str] | None = None,
|
|
25
25
|
max_tokens: int | None = None,
|
|
26
26
|
temperature: float | None = None,
|
|
27
|
+
top_p: float | None = None,
|
|
27
28
|
) -> list[RawCompletion]:
|
|
28
29
|
"""
|
|
29
30
|
stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
|
|
@@ -47,6 +48,7 @@ class BaseLLM(ABC):
|
|
|
47
48
|
stop_sequences: list[str] | None = None,
|
|
48
49
|
max_tokens: int | None = None,
|
|
49
50
|
temperature: float | None = None,
|
|
51
|
+
top_p: float | None = None,
|
|
50
52
|
) -> list[RawCompletion]:
|
|
51
53
|
"""
|
|
52
54
|
stop_sequences and max_tokens are injected by the task if exist. They should be overwritten or
|
|
@@ -79,6 +81,7 @@ class BaseLLM(ABC):
|
|
|
79
81
|
stop_sequences: list[str] | None = None,
|
|
80
82
|
max_tokens: int | None = None,
|
|
81
83
|
temperature: float | None = None,
|
|
84
|
+
top_p: float | None = None,
|
|
82
85
|
) -> list[RawCompletion]:
|
|
83
86
|
"""Generates a model response for each sample.
|
|
84
87
|
|
|
@@ -86,10 +89,10 @@ class BaseLLM(ABC):
|
|
|
86
89
|
otherwise falls back to 'generate_from_messages'.
|
|
87
90
|
"""
|
|
88
91
|
try:
|
|
89
|
-
return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature)
|
|
92
|
+
return self.generate_from_samples(samples, stop_sequences, max_tokens, temperature, top_p)
|
|
90
93
|
except NotImplementedError:
|
|
91
94
|
messages: list[Sequence[Message]] = [sample.messages for sample in samples]
|
|
92
|
-
return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature)
|
|
95
|
+
return self.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)
|
|
93
96
|
|
|
94
97
|
def post_process_completion(self, completion: str, sample: Sample) -> str:
|
|
95
98
|
"""
|
|
@@ -10,7 +10,14 @@ from typing import Any
|
|
|
10
10
|
|
|
11
11
|
import torch
|
|
12
12
|
from tokenizers import Tokenizer
|
|
13
|
-
from transformers import
|
|
13
|
+
from transformers import (
|
|
14
|
+
AutoModelForCausalLM,
|
|
15
|
+
AutoTokenizer,
|
|
16
|
+
StoppingCriteria,
|
|
17
|
+
StoppingCriteriaList,
|
|
18
|
+
)
|
|
19
|
+
from transformers.models.gpt2 import GPT2Tokenizer
|
|
20
|
+
from transformers.tokenization_utils import PreTrainedTokenizerBase
|
|
14
21
|
|
|
15
22
|
from eval_framework.llm.base import BaseLLM
|
|
16
23
|
from eval_framework.shared.types import (
|
|
@@ -83,9 +90,13 @@ class BaseHFLLM(BaseLLM):
|
|
|
83
90
|
SEQ_LENGTH: int | None = None
|
|
84
91
|
BYTES_PER_TOKEN: float = 4.0 # rule of thumb according to https://platform.openai.com/tokenizer
|
|
85
92
|
|
|
93
|
+
def _load_tokenizer(self) -> PreTrainedTokenizerBase:
|
|
94
|
+
"""Load the tokenizer. Override in subclasses to use a specific tokenizer class."""
|
|
95
|
+
return AutoTokenizer.from_pretrained(self.LLM_NAME)
|
|
96
|
+
|
|
86
97
|
def __init__(self, formatter: BaseFormatter | None = None, bytes_per_token: float | None = None) -> None:
|
|
87
98
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
88
|
-
self.tokenizer =
|
|
99
|
+
self.tokenizer = self._load_tokenizer()
|
|
89
100
|
self.model = AutoModelForCausalLM.from_pretrained(self.LLM_NAME, device_map="auto")
|
|
90
101
|
logger.info(f"{RED}[ Model initialized --------------------- {RESET}{self.LLM_NAME} {RED}]{RESET}")
|
|
91
102
|
self._set_formatter(formatter)
|
|
@@ -135,6 +146,7 @@ class BaseHFLLM(BaseLLM):
|
|
|
135
146
|
stop_sequences: list[str] | None = None,
|
|
136
147
|
max_tokens: int | None = None,
|
|
137
148
|
temperature: float | None = None,
|
|
149
|
+
top_p: float | None = None,
|
|
138
150
|
) -> list[RawCompletion]:
|
|
139
151
|
if temperature is None:
|
|
140
152
|
effective_temperature = 0.0 # Current default, TODO: refactor to use model's default
|
|
@@ -143,7 +155,8 @@ class BaseHFLLM(BaseLLM):
|
|
|
143
155
|
)
|
|
144
156
|
else:
|
|
145
157
|
effective_temperature = temperature
|
|
146
|
-
|
|
158
|
+
if top_p is not None:
|
|
159
|
+
logger.warning("Huggingface LLM does not support top_p. Ignoring top_p value.")
|
|
147
160
|
raw_completions = []
|
|
148
161
|
for single_messages in messages:
|
|
149
162
|
# format
|
|
@@ -403,14 +416,24 @@ class Pythia410m(HFLLM):
|
|
|
403
416
|
|
|
404
417
|
|
|
405
418
|
class SmolLM135M(HFLLM):
|
|
419
|
+
"""SmolLM-135M uses a GPT2-style tokenizer; AutoTokenizer can incorrectly select LlamaTokenizer."""
|
|
420
|
+
|
|
406
421
|
LLM_NAME = "HuggingFaceTB/SmolLM-135M"
|
|
407
422
|
DEFAULT_FORMATTER = ConcatFormatter
|
|
408
423
|
|
|
424
|
+
def _load_tokenizer(self) -> PreTrainedTokenizerBase:
|
|
425
|
+
return GPT2Tokenizer.from_pretrained(self.LLM_NAME)
|
|
426
|
+
|
|
409
427
|
|
|
410
428
|
class Smollm135MInstruct(HFLLM):
|
|
429
|
+
"""SmolLM-135M-Instruct uses a GPT2-style tokenizer; AutoTokenizer can incorrectly select LlamaTokenizer."""
|
|
430
|
+
|
|
411
431
|
LLM_NAME = "HuggingFaceTB/SmolLM-135M-Instruct"
|
|
412
432
|
DEFAULT_FORMATTER = partial(HFFormatter, LLM_NAME)
|
|
413
433
|
|
|
434
|
+
def _load_tokenizer(self) -> PreTrainedTokenizerBase:
|
|
435
|
+
return GPT2Tokenizer.from_pretrained(self.LLM_NAME)
|
|
436
|
+
|
|
414
437
|
|
|
415
438
|
class Qwen3_0_6B(HFLLM):
|
|
416
439
|
LLM_NAME = "Qwen/Qwen3-0.6B"
|
|
@@ -34,6 +34,7 @@ class OpenAIModel(BaseLLM):
|
|
|
34
34
|
model_name: str | None = None,
|
|
35
35
|
formatter: BaseFormatter | None = None,
|
|
36
36
|
temperature: float | None = None,
|
|
37
|
+
top_p: float | None = None,
|
|
37
38
|
api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
|
|
38
39
|
organization: str | None = None,
|
|
39
40
|
base_url: str | None = None,
|
|
@@ -46,6 +47,7 @@ class OpenAIModel(BaseLLM):
|
|
|
46
47
|
model_name: OpenAI model name (e.g., "gpt-4o", "gpt-3.5-turbo"). If None, uses LLM_NAME class attribute.
|
|
47
48
|
formatter: Optional message formatter.
|
|
48
49
|
temperature: Sampling temperature used when not passed to generate methods (from 0.0 to 2.0).
|
|
50
|
+
top_p: Nucleus sampling probability mass (from 0.0 to 1.0). If None, the API default is used.
|
|
49
51
|
api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable).
|
|
50
52
|
organization: Optional OpenAI organization ID.
|
|
51
53
|
base_url: Optional API base URL for Azure or alternate endpoints.
|
|
@@ -59,6 +61,10 @@ class OpenAIModel(BaseLLM):
|
|
|
59
61
|
self._temperature = temperature if temperature is not None else 0.0
|
|
60
62
|
assert 0.0 <= self._temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
|
|
61
63
|
|
|
64
|
+
if top_p is not None:
|
|
65
|
+
assert 0.0 <= top_p <= 1.0, "top_p must be between 0.0 and 1.0"
|
|
66
|
+
self._top_p = top_p
|
|
67
|
+
|
|
62
68
|
self._client = OpenAI(
|
|
63
69
|
api_key=api_key,
|
|
64
70
|
organization=organization,
|
|
@@ -97,6 +103,7 @@ class OpenAIModel(BaseLLM):
|
|
|
97
103
|
stop_sequences: list[str] | None = None,
|
|
98
104
|
max_tokens: int | None = None,
|
|
99
105
|
temperature: float | None = None,
|
|
106
|
+
top_p: float | None = None,
|
|
100
107
|
) -> list[RawCompletion]:
|
|
101
108
|
"""
|
|
102
109
|
Generate completions for a list of message sequences concurrently.
|
|
@@ -108,6 +115,7 @@ class OpenAIModel(BaseLLM):
|
|
|
108
115
|
stop_sequences: Optional list of stop sequences.
|
|
109
116
|
max_tokens: Optional maximum number of tokens to generate.
|
|
110
117
|
temperature: Sampling temperature.
|
|
118
|
+
top_p: Nucleus sampling probability mass (0.0 to 1.0). Overrides instance default if provided.
|
|
111
119
|
|
|
112
120
|
Returns:
|
|
113
121
|
List of RawCompletion objects containing prompts and completions.
|
|
@@ -116,6 +124,10 @@ class OpenAIModel(BaseLLM):
|
|
|
116
124
|
effective_temperature = temperature if temperature is not None else self._temperature
|
|
117
125
|
assert 0.0 <= effective_temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
|
|
118
126
|
|
|
127
|
+
effective_top_p = top_p if top_p is not None else self._top_p
|
|
128
|
+
if effective_top_p is not None:
|
|
129
|
+
assert 0.0 <= effective_top_p <= 1.0, "top_p must be between 0.0 and 1.0"
|
|
130
|
+
|
|
119
131
|
def _process_one(single_messages: Sequence[Message]) -> RawCompletion:
|
|
120
132
|
# Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
|
|
121
133
|
scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
|
|
@@ -129,6 +141,7 @@ class OpenAIModel(BaseLLM):
|
|
|
129
141
|
model=self._model_name,
|
|
130
142
|
prompt=prompt,
|
|
131
143
|
temperature=effective_temperature,
|
|
144
|
+
top_p=effective_top_p,
|
|
132
145
|
max_tokens=scaled_max_tokens,
|
|
133
146
|
stop=stop_sequences,
|
|
134
147
|
)
|
|
@@ -158,6 +171,7 @@ class OpenAIModel(BaseLLM):
|
|
|
158
171
|
model=self._model_name,
|
|
159
172
|
messages=chat_messages,
|
|
160
173
|
temperature=effective_temperature,
|
|
174
|
+
top_p=effective_top_p,
|
|
161
175
|
max_tokens=scaled_max_tokens,
|
|
162
176
|
stop=stop_sequences,
|
|
163
177
|
)
|
|
@@ -300,6 +314,7 @@ class OpenAIEmbeddingModel(BaseLLM):
|
|
|
300
314
|
stop_sequences: list[str] | None = None,
|
|
301
315
|
max_tokens: int | None = None,
|
|
302
316
|
temperature: float | None = None,
|
|
317
|
+
top_p: float | None = None,
|
|
303
318
|
) -> list[RawCompletion]:
|
|
304
319
|
raise NotImplementedError(
|
|
305
320
|
"Embedding model does not support generate_from_messages. Use generate_embeddings instead."
|
|
@@ -134,11 +134,12 @@ class BaseVLLMModel(BaseLLM):
|
|
|
134
134
|
**kwargs,
|
|
135
135
|
}
|
|
136
136
|
|
|
137
|
-
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
138
|
-
|
|
139
137
|
self.batch_size = batch_size
|
|
140
138
|
|
|
141
|
-
|
|
139
|
+
if "VLLM_TARGET_DEVICE" not in os.environ and not torch.cuda.is_available():
|
|
140
|
+
os.environ["VLLM_TARGET_DEVICE"] = "cpu"
|
|
141
|
+
|
|
142
|
+
self.model = LLM(**model_args)
|
|
142
143
|
|
|
143
144
|
self._tokenizer: None | VLLMTokenizerAPI = None
|
|
144
145
|
_ = self.tokenizer # make sure tokenizer is initialized
|
|
@@ -225,6 +226,7 @@ class BaseVLLMModel(BaseLLM):
|
|
|
225
226
|
stop_sequences: list[str] | None = None,
|
|
226
227
|
max_tokens: int | None = None,
|
|
227
228
|
temperature: float | None = None,
|
|
229
|
+
top_p: float | None = None,
|
|
228
230
|
) -> list[RawCompletion]:
|
|
229
231
|
raw_completions: list[RawCompletion | None] = [None] * len(messages)
|
|
230
232
|
prompt_objs = []
|
|
@@ -234,7 +236,7 @@ class BaseVLLMModel(BaseLLM):
|
|
|
234
236
|
scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
|
|
235
237
|
|
|
236
238
|
sampling_params = self._resolve_sampling_params(
|
|
237
|
-
self.sampling_params, scaled_max_tokens, stop_sequences, temperature
|
|
239
|
+
self.sampling_params, scaled_max_tokens, stop_sequences, temperature, top_p
|
|
238
240
|
)
|
|
239
241
|
|
|
240
242
|
for i, single_messages in enumerate(messages):
|
|
@@ -294,6 +296,7 @@ class BaseVLLMModel(BaseLLM):
|
|
|
294
296
|
max_tokens: int | None,
|
|
295
297
|
stop_sequences: list[str] | None,
|
|
296
298
|
temperature: float | None,
|
|
299
|
+
top_p: float | None = None,
|
|
297
300
|
) -> SamplingParams:
|
|
298
301
|
sampling_params.max_tokens = max_tokens
|
|
299
302
|
sampling_params.stop = stop_sequences
|
|
@@ -307,6 +310,13 @@ class BaseVLLMModel(BaseLLM):
|
|
|
307
310
|
f"Using sampling params temperature value: {sampling_params.temperature} "
|
|
308
311
|
f"as no custom temperature value was provided"
|
|
309
312
|
)
|
|
313
|
+
if top_p is not None:
|
|
314
|
+
logger.warning(f"Overriding sampling params top_p {sampling_params.top_p} with custom value {top_p}")
|
|
315
|
+
sampling_params.top_p = top_p
|
|
316
|
+
else:
|
|
317
|
+
logger.info(
|
|
318
|
+
f"Using sampling params top_p value: {sampling_params.top_p} as no custom top_p value was provided"
|
|
319
|
+
)
|
|
310
320
|
return sampling_params
|
|
311
321
|
|
|
312
322
|
def _model_generate(
|