eval-framework 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.3 → eval_framework-0.2.4}/PKG-INFO +20 -15
- {eval_framework-0.2.3 → eval_framework-0.2.4}/README.md +15 -12
- {eval_framework-0.2.3 → eval_framework-0.2.4}/pyproject.toml +8 -5
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/evaluation_generator.py +4 -4
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/aleph_alpha.py +3 -7
- eval_framework-0.2.4/src/eval_framework/llm/openai.py +400 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/main.py +3 -2
- eval_framework-0.2.4/src/eval_framework/metrics/completion/aidanbench.py +28 -0
- eval_framework-0.2.4/src/eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
- eval_framework-0.2.4/src/eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/response_generator.py +11 -83
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/wandb_uploader.py +2 -6
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/run.py +14 -3
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/base.py +79 -1
- eval_framework-0.2.4/src/eval_framework/tasks/benchmarks/aidanbench.py +211 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/math_reasoning.py +14 -3
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/task_names.py +2 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/generate_task_docs.py +1 -4
- eval_framework-0.2.4/src/eval_framework/utils/helpers.py +32 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/logging.py +17 -5
- eval_framework-0.2.4/src/eval_framework/utils/tqdm_handler.py +14 -0
- eval_framework-0.2.3/src/eval_framework/llm/openai.py +0 -226
- eval_framework-0.2.3/src/eval_framework/utils/helpers.py +0 -3
- {eval_framework-0.2.3 → eval_framework-0.2.4}/LICENSE +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/gsm8k.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.3 → eval_framework-0.2.4}/src/template_formatting/py.typed +0 -0
{eval_framework-0.2.3 → eval_framework-0.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.3
+Version: 0.2.4
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License: Apache License

@@ -232,8 +232,9 @@ Requires-Dist: llm-sandbox[docker]>=0.1.8,<0.2
 Requires-Dist: jsonlines>=4,<5
 Requires-Dist: lxml>=6,<7
 Requires-Dist: python-iso639>=2025.2.18
-Requires-Dist: wandb>=0.
+Requires-Dist: wandb>=0.23.0,<1
 Requires-Dist: boto3>=1.40.54,<2
+Requires-Dist: numpy>=1.26.4
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'

@@ -243,8 +244,9 @@ Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
 Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
 Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
 Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
-Requires-Dist: openai>=1.62,<2 ; extra == 'openai'
+Requires-Dist: openai>=1.62,<2.8 ; extra == 'openai'
 Requires-Dist: tiktoken>=0.9,<0.10 ; extra == 'openai'
+Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
 Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'

@@ -354,22 +356,25 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa

 ### Core Capabilities

-
-
-
-
-| ARC |
-
-| Winogrande |
+Subset of core capabilities benchmarks coverd by `eval-framework`:
+
+| **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
+|---------------|---------------|----------|------------|------------------------|------------------|
+| COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
+| Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
+| Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
+

 ### Languages & Domains

-
+Subset of language-specific and domain-specific benchmarks coverd by `eval-framework`:
+
+| **Multilingual** | **Specialized** | **Safety & Bias** | **Efficiency Metrics** |
 |------------------|-----------------|-------------------|----------------|
-| WMT Translation |
-| FLORES-200 | Winogender |
-| Multilingual MMLU |
-| German/Finnish tasks |
+| WMT Translation | MMLU | TruthfulQA | Compression ratios |
+| FLORES-200 | Legal (CaseHold) | Winogender | Runtime |
+| Multilingual MMLU | Scientific (SciQ) | | |
+| German/Finnish tasks | | | |

 ### Completion

{eval_framework-0.2.3 → eval_framework-0.2.4}/README.md

@@ -86,22 +86,25 @@ For more detailed CLI usage instructions, see the [CLI Usage Guide](docs/cli_usa

 ### Core Capabilities

-
-
-
-
-| ARC |
-
-| Winogrande |
+Subset of core capabilities benchmarks coverd by `eval-framework`:
+
+| **Reasoning** | **Knowledge** | **Math** | **Coding** | **Structured outputs** | **Long Context** |
+|---------------|---------------|----------|------------|------------------------|------------------|
+| COPA | ARC | AIME | BigCodeBench | IFEval | InfiniteBench |
+| Hellaswag | MMLU | GSM8K | HumanEval | StructEval | QUALITY |
+| Winogrande | Openbook QA| MATH-500 | MBPP | | ZeroSCROLLS |
+

 ### Languages & Domains

-
+Subset of language-specific and domain-specific benchmarks coverd by `eval-framework`:
+
+| **Multilingual** | **Specialized** | **Safety & Bias** | **Efficiency Metrics** |
 |------------------|-----------------|-------------------|----------------|
-| WMT Translation |
-| FLORES-200 | Winogender |
-| Multilingual MMLU |
-| German/Finnish tasks |
+| WMT Translation | MMLU | TruthfulQA | Compression ratios |
+| FLORES-200 | Legal (CaseHold) | Winogender | Runtime |
+| Multilingual MMLU | Scientific (SciQ) | | |
+| German/Finnish tasks | | | |

 ### Completion

{eval_framework-0.2.3 → eval_framework-0.2.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.3"
+version = "0.2.4"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }

@@ -39,8 +39,9 @@ dependencies = [
     "jsonlines>=4,<5",
     "lxml>=6,<7",
     "python-iso639>=2025.2.18",
-    "wandb>=0.
+    "wandb>=0.23.0,<1",
     "boto3>=1.40.54,<2",
+    "numpy>=1.26.4",
 ]

 [project.optional-dependencies]

@@ -51,8 +52,9 @@ determined = [
 ]
 api = ["aleph-alpha-client>=10,<11"]
 openai = [
-    "openai>=1.62,<2",
-    "tiktoken>=0.9,<0.10"
+    "openai>=1.62,<2.8",
+    "tiktoken>=0.9,<0.10",
+    "transformers>=4.45.2,<5",
 ]
 transformers = [
     "transformers>=4.45.2,<5",

@@ -149,6 +151,7 @@ select = [
     "UP", # Auto-upgrading of new Python features
     "I", # Sort imports
 ]
+
 [tool.ruff.lint.isort]
 # https://github.com/astral-sh/ruff-pre-commit/issues/121
 # https://github.com/astral-sh/ruff/issues/10519

@@ -163,7 +166,7 @@ known-third-party = ["wandb"]
 plugins = "pydantic.mypy"
 disallow_untyped_defs = true
 ignore_missing_imports = true
-files = ["src", "
+files = ["src", "utils"]

 [tool.pytest.ini_options]
 testpaths = ["./tests"]
{eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/evaluation_generator.py

@@ -20,6 +20,7 @@ from eval_framework.tasks.base import ResponseType
 from eval_framework.tasks.eval_config import EvalConfig
 from eval_framework.tasks.registry import get_task
 from eval_framework.utils.constants import RED, RESET
+from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write

 logger = logging.getLogger(__name__)

@@ -71,8 +72,8 @@ class EvaluationGenerator:
         metric = metric_class()

         logger.info(f"Starting calculation of {metric.NAME}")
-
-        for response in tqdm(responses, desc=f"Calculating {metric.NAME}"):
+        safe_tqdm_write(f"INFO: Calculating {metric.NAME}")
+        for response in tqdm(responses, desc=f"Calculating {metric.NAME}", disable=get_disable_bar_flag()):
             if f"{response.subject}_{response.id}_{metric.__class__.__name__}" in subject_result_id_existing:
                 continue

@@ -109,7 +110,7 @@ class EvaluationGenerator:
             self.result_processor.save_metrics_result(result)

         logger.info(f"Completed calculation of {metric.NAME}")
-
+        safe_tqdm_write(f"INFO: Completed {metric.NAME}")

         if not self.save_intermediate_results:
             self.result_processor.save_metrics_results(results)

@@ -224,7 +225,6 @@ class EvaluationGenerator:
         aggregated_results = self._aggregate_results(metrics_results)

         wandb.log(aggregated_results)
-
         self.result_processor.save_aggregated_results(aggregated_results)
         logger.info(aggregated_results)
         logger.info(f"{RED}[ Evaluation completed and results saved! ]{RESET}")
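The `eval_framework/utils/tqdm_handler.py` module these hunks import from is new in 0.2.4 (+14 lines) but its body is not included in this extract. As a rough, hypothetical sketch of what such helpers could look like — the environment-variable name and the fallback behaviour are assumptions, not the package's actual implementation:

```python
# Hypothetical sketch only; the real tqdm_handler.py is not shown in this diff.
import os

from tqdm import tqdm


def get_disable_bar_flag() -> bool:
    # Assumption: progress bars are switched off via an environment variable.
    return os.getenv("EVAL_FRAMEWORK_DISABLE_TQDM", "0") == "1"


def safe_tqdm_write(message: str) -> None:
    # tqdm.write prints above any active progress bar instead of corrupting it;
    # fall back to a plain print if that fails for any reason.
    try:
        tqdm.write(message)
    except Exception:
        print(message)
```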
{eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/llm/aleph_alpha.py

@@ -50,6 +50,7 @@ class AlephAlphaAPIModel(BaseLLM):
         self,
         formatter: BaseFormatter | None = None,
         checkpoint_name: str | None = None,
+        temperature: float | None = None,
         # Please see README.md for tips if adapting the following parameters.
         max_retries: int = 100,
         max_async_concurrent_requests: int = 32,

@@ -65,6 +66,7 @@ class AlephAlphaAPIModel(BaseLLM):
         else:
             self._formatter = formatter
         self._llm_name = checkpoint_name or self.LLM_NAME
+        self._temperature = temperature if temperature is not None else 0.0
         self.max_async_concurrent_requests = max_async_concurrent_requests
         self.max_retries = max_retries
         self.request_timeout_seconds = request_timeout_seconds

@@ -249,13 +251,7 @@ class AlephAlphaAPIModel(BaseLLM):
         max_tokens: int | None = None,
         temperature: float | None = None,
     ) -> list[RawCompletion]:
-        if temperature is None:
-            effective_temperature = 0.0  # Current default, TODO: refactor to use model's default
-            logger.info(
-                f"Using default temperature value: {effective_temperature} as no custom temperature value was provided"
-            )
-        else:
-            effective_temperature = temperature
+        effective_temperature = temperature if temperature is not None else self._temperature

         requests = []

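The change above moves the default-temperature decision out of the generate call and into the constructor: an explicit per-call `temperature` still wins, and the instance value (defaulting to 0.0) is used otherwise. A minimal sketch of the pattern, with a made-up class name; only the `temperature` handling mirrors the diff:

```python
# Illustrative sketch of the constructor-level temperature fallback; the class
# itself is hypothetical and stands in for the API model wrappers above.
class ExampleClient:
    def __init__(self, temperature: float | None = None) -> None:
        self._temperature = temperature if temperature is not None else 0.0

    def generate(self, temperature: float | None = None) -> float:
        # An explicit per-call temperature overrides the instance default.
        return temperature if temperature is not None else self._temperature


client = ExampleClient(temperature=0.7)
assert client.generate() == 0.7                 # falls back to the constructor value
assert client.generate(temperature=0.0) == 0.0  # per-call override still wins
```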
eval_framework-0.2.4/src/eval_framework/llm/openai.py

@@ -0,0 +1,400 @@
+import concurrent.futures
+import logging
+import math
+import os
+import traceback
+from collections.abc import Callable, Sequence
+from functools import partial
+
+import tiktoken
+from openai import OpenAI
+from openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam
+from tokenizers import Tokenizer
+from transformers import AutoTokenizer
+
+from eval_framework.llm.base import BaseLLM
+from eval_framework.shared.types import ConcatCompression, Error, RawCompletion, RawLoglikelihood
+from eval_framework.tasks.base import Sample
+from template_formatting.formatter import BaseFormatter, ConcatFormatter, HFFormatter, Message
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIModel(BaseLLM):
+    """
+    LLM wrapper for OpenAI API providing text/chat completions and log-probability evaluation output.
+    """
+
+    LLM_NAME: str | None = None
+    DEFAULT_FORMATTER: Callable[[], BaseFormatter] | None = None
+    BYTES_PER_TOKEN: float = 4.0  # rule of thumb according to https://platform.openai.com/tokenizer
+
+    def __init__(
+        self,
+        model_name: str | None = None,
+        formatter: BaseFormatter | None = None,
+        temperature: float | None = None,
+        api_key: str | None = os.getenv("OPENAI_API_KEY", ""),
+        organization: str | None = None,
+        base_url: str | None = None,
+        bytes_per_token: float | None = None,
+    ) -> None:
+        """
+        Initialize the OpenAIModel.
+
+        Args:
+            model_name: OpenAI model name (e.g., "gpt-4o", "gpt-3.5-turbo"). If None, uses LLM_NAME class attribute.
+            formatter: Optional message formatter.
+            temperature: Sampling temperature used when not passed to generate methods (from 0.0 to 2.0).
+            api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable).
+            organization: Optional OpenAI organization ID.
+            base_url: Optional API base URL for Azure or alternate endpoints.
+            bytes_per_token: Optional custom bytes per token scalar for non-standard models.
+        """
+        assert model_name is not None or self.LLM_NAME is not None, "A model name must be specified."
+        self._model_name = model_name if model_name else self.LLM_NAME
+        logger.info(f"Instantiating OpenAIModel with name: {self._model_name}")
+
+        self._formatter = formatter or (self.DEFAULT_FORMATTER() if self.DEFAULT_FORMATTER is not None else None)
+        self._temperature = temperature if temperature is not None else 0.0
+        assert 0.0 <= self._temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
+
+        self._client = OpenAI(
+            api_key=api_key,
+            organization=organization,
+            base_url=base_url,
+        )
+
+        # Initialize tokenizer for the model
+        self._encoder = self._get_encoder()
+
+        # set bytes_per_token_scalar for non-standard models
+        if bytes_per_token is not None and bytes_per_token <= 0:
+            raise ValueError("bytes_per_token must be positive")
+        self.bytes_per_token_scalar = (
+            4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
+        )
+
+    def _get_encoder(self) -> tiktoken.Encoding:
+        assert self._model_name is not None
+        return tiktoken.encoding_for_model(self._model_name)
+
+    def _count_tokens(self, text: str) -> int:
+        """
+        Count tokens for the given text using the encoder.
+
+        Args:
+            text: Input string.
+
+        Returns:
+            Number of tokens.
+        """
+        return len(self._encoder.encode(text))
+
+    def generate_from_messages(
+        self,
+        messages: list[Sequence[Message]],
+        stop_sequences: list[str] | None = None,
+        max_tokens: int | None = None,
+        temperature: float | None = None,
+    ) -> list[RawCompletion]:
+        """
+        Generate completions for a list of message sequences concurrently.
+
+        Uses text completion API when a formatter is configured, otherwise uses chat completion API.
+
+        Args:
+            messages: Sequence of messages.
+            stop_sequences: Optional list of stop sequences.
+            max_tokens: Optional maximum number of tokens to generate.
+            temperature: Sampling temperature.
+
+        Returns:
+            List of RawCompletion objects containing prompts and completions.
+        """
+
+        effective_temperature = temperature if temperature is not None else self._temperature
+        assert 0.0 <= effective_temperature <= 2.0, "Temperature must be between 0.0 and 2.0"
+
+        def _process_one(single_messages: Sequence[Message]) -> RawCompletion:
+            # Adjust max tokens based on bytes_per_token_scalar so that non-standard models generate full responses
+            scaled_max_tokens = math.ceil(max_tokens * self.bytes_per_token_scalar) if max_tokens is not None else None
+
+            if self._formatter is not None:
+                # Use formatter and text completion API
+                prompt = self._formatter.format(single_messages, output_mode="string")
+                # documentation: https://platform.openai.com/docs/api-reference/completions/create
+                assert self._model_name is not None
+                response = self._client.completions.create(
+                    model=self._model_name,
+                    prompt=prompt,
+                    temperature=effective_temperature,
+                    max_tokens=scaled_max_tokens,
+                    stop=stop_sequences,
+                )
+                completion = response.choices[0].text
+                return RawCompletion(
+                    prompt=prompt,
+                    prompt_sequence_positions=self._count_tokens(prompt),
+                    concat_compression=ConcatCompression.calculate(
+                        single_messages, count_tokens=self._count_tokens, completion=completion
+                    ),
+                    completion=completion,
+                    completion_sequence_positions=self._count_tokens(completion),
+                )
+
+            else:
+                # Use chat completion API
+                chat_messages = [
+                    (
+                        ChatCompletionUserMessageParam(role="user", content=m.content)
+                        if m.role is not None and m.role.value.lower() == "user"
+                        else ChatCompletionAssistantMessageParam(role="assistant", content=m.content)
+                    )
+                    for m in single_messages
+                ]
+                assert self._model_name is not None
+                chat_response = self._client.chat.completions.create(
+                    model=self._model_name,
+                    messages=chat_messages,
+                    temperature=effective_temperature,
+                    max_tokens=scaled_max_tokens,
+                    stop=stop_sequences,
+                )
+                prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
+                prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
+                completion = chat_response.choices[0].message.content or ""
+                return RawCompletion(
+                    prompt=prompt,
+                    prompt_sequence_positions=prompt_tokens,
+                    concat_compression=ConcatCompression.calculate(
+                        single_messages, count_tokens=self._count_tokens, completion=completion
+                    ),
+                    completion=completion,
+                    completion_sequence_positions=self._count_tokens(completion),
+                )
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            results = list(executor.map(_process_one, messages))
+        return results
+
+    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+        """
+        Compute total log-probabilities for possible completions given each sample's prompt.
+
+        Args:
+            samples: List of Sample objects, each with prompt messages and possible completions.
+
+        Returns:
+            List of RawLoglikelihood objects mapping each prompt and completion to its log-probability.
+
+        Note:
+            Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
+        """
+        assert self._model_name in ["babbage-002", "davinci-002"], (
+            "Log-probs for prompt tokens are only supported for a limited set of models."
+        )
+        # apparently OpenAI stopped providing logprobs of prompt tokens, see discussion in:
+        # https://github.com/EleutherAI/lm-evaluation-harness/issues/1196
+
+        assert self._formatter is not None, "Log-probs require a formatter to create text prompts."
+        results: list[RawLoglikelihood] = []
+        for sample in samples:
+            prompt = self._formatter.format(sample.messages, output_mode="string") if sample.messages else ""
+            choices_log_probs: dict[str, float] = {}
+            choices_sequence_positions: dict[str, int] = {}
+            prompt_sequence_positions: int | None = self._count_tokens(prompt)
+            error: Error | None = None
+
+            for choice in sample.possible_completions or []:
+                if error is not None:
+                    continue
+
+                # Tokenize prompt and completion
+                prompt_tokens = self._encoder.encode(prompt)
+                completion_tokens = self._encoder.encode(choice)
+                full_text = prompt + choice
+
+                try:
+                    response = self._client.completions.create(
+                        model=self._model_name,
+                        prompt=full_text,
+                        echo=True,
+                        max_tokens=0,
+                        logprobs=1,
+                        temperature=0,
+                    )
+
+                    choice_obj = response.choices[0]
+                    if not hasattr(choice_obj, "logprobs") or choice_obj.logprobs is None:
+                        raise ValueError("Logprobs not returned in response.")
+
+                    all_tokens = getattr(choice_obj.logprobs, "tokens", None)
+                    all_logprobs = getattr(choice_obj.logprobs, "token_logprobs", None)
+
+                    if all_tokens is None or all_logprobs is None:
+                        raise ValueError("Logprobs response missing expected 'tokens' or 'token_logprobs' fields.")
+
+                    if len(all_tokens) != len(prompt_tokens) + len(completion_tokens):
+                        raise ValueError(
+                            f"Token count mismatch: tokens in response ({len(all_tokens)}) != prompt+completion "
+                            f"({len(prompt_tokens) + len(completion_tokens)})"
+                        )
+
+                    # Sum logprobs for the completion portion
+                    choices_log_probs[choice] = sum(all_logprobs[len(prompt_tokens) :])
+                    choices_sequence_positions[choice] = len(completion_tokens)
+
+                except Exception as e:
+                    error = Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc())
+                    prompt_sequence_positions = None
+                    choices_log_probs = {}
+                    choices_sequence_positions = {}
+
+            results.append(
+                RawLoglikelihood(
+                    prompt=prompt,
+                    prompt_sequence_positions=prompt_sequence_positions,
+                    loglikelihoods=choices_log_probs,
+                    loglikelihoods_sequence_positions=choices_sequence_positions,
+                    raw_loglikelihood_error=error,
+                )
+            )
+        return results
+
+    def __del__(self) -> None:
+        if hasattr(self, "_client"):
+            self._client.close()
+
+
+class OpenAIEmbeddingModel(BaseLLM):
+    def __init__(
+        self,
+        model_name: str = "text-embedding-3-large",
+        formatter: BaseFormatter | None = None,
+        api_key: str | None = None,
+        organization: str | None = None,
+        base_url: str | None = None,
+    ) -> None:
+        """Initialize OpenAI API client.
+        Args:
+            model_name: Name of the OpenAI model to use (e.g., "text-embedding-3-large")
+            formatter: Optional message formatter
+            api_key: OpenAI API key (defaults to OPENAI_API_KEY env variable)
+            organization: Optional organization ID
+            base_url: Optional API base URL for Azure or other endpoints
+        """
+        if formatter is not None:
+            raise ValueError("Formatter is not supported for embedding model.")
+        self._model_name = model_name
+        logger.info(f"Using {model_name} as embedding model")
+        self._client = OpenAI(
+            api_key=api_key or os.getenv("OPENAI_API_KEY", ""),
+            organization=organization,
+            base_url=base_url,
+        )
+
+    def generate_from_messages(
+        self,
+        messages: list[Sequence[Message]],
+        stop_sequences: list[str] | None = None,
+        max_tokens: int | None = None,
+        temperature: float | None = None,
+    ) -> list[RawCompletion]:
+        raise NotImplementedError(
+            "Embedding model does not support generate_from_messages. Use generate_embeddings instead."
+        )
+
+    def generate_embeddings(
+        self,
+        messages: list[Sequence[Message]],
+    ) -> list[list[float]]:
+        embeddings = []
+        for single_messages in messages:
+            prompt = "".join([m.content for m in single_messages])
+            response = self._client.embeddings.create(model=self._model_name, input=[prompt])
+            embedding = response.data[0].embedding
+            embeddings.append(embedding)
+        return embeddings
+
+    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+        raise NotImplementedError("Embedding model cannot return logprobs.")
+
+    def __del__(self) -> None:
+        if hasattr(self, "_client"):
+            self._client.close()
+            try:
+                self._client.close()
+            except Exception:
+                pass
+
+
+class DeepseekModel(OpenAIModel):
+    """
+    General Deepseek model wrapper using OpenAI-compatible API for deepseek-chat and deepseek-reasoner models.
+
+    Using the deepseek API: https://api-docs.deepseek.com/quick_start/pricing
+    """
+
+    def __init__(
+        self,
+        model_name: str | None = None,
+        formatter: BaseFormatter | None = None,
+        temperature: float | None = None,
+        api_key: str | None = None,
+        organization: str | None = None,
+        base_url: str | None = None,
+        tokenizer_name: str | None = None,
+    ) -> None:
+        super().__init__(
+            model_name=model_name,
+            formatter=formatter,
+            temperature=temperature,
+            api_key=os.getenv("DEEPSEEK_API_KEY", ""),
+            organization=organization,
+            base_url="https://api.deepseek.com/beta",
+        )
+        self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
+
+    def _get_encoder(self) -> Tokenizer:
+        return AutoTokenizer.from_pretrained(self._tokenizer_name)
+
+    def _count_tokens(self, text: str) -> int:
+        return len(self._encoder.encode(text))
+
+
+### Model Aliases ###
+
+
+class OpenAI_gpt_4o_mini(OpenAIModel):
+    LLM_NAME = "gpt-4o-mini-2024-07-18"
+
+
+class OpenAI_gpt_4o_mini_with_ConcatFormatter(OpenAIModel):
+    LLM_NAME = "gpt-4o-mini-2024-07-18"
+    DEFAULT_FORMATTER = ConcatFormatter
+
+
+class OpenAI_davinci_002(OpenAIModel):
+    LLM_NAME = "davinci-002"
+    DEFAULT_FORMATTER = ConcatFormatter
+
+
+class Deepseek_reasoner(DeepseekModel):
+    LLM_NAME = "deepseek-reasoner"  # DeepSeek-V3.2-Exp (Thinking Mode)
+    # multi-round conversations for reasoning model documented here:
+    # https://api-docs.deepseek.com/guides/reasoning_model#api-example
+    # does not support completion API
+
+
+class Deepseek_chat(DeepseekModel):
+    LLM_NAME = "deepseek-chat"  # DeepSeek-V3.2-Exp (Non-thinking Mode)
+
+
+class Deepseek_chat_with_formatter(DeepseekModel):
+    LLM_NAME = "deepseek-chat"  # DeepSeek-V3.2-Exp (Non-thinking Mode)
+    DEFAULT_FORMATTER = partial(HFFormatter, "deepseek-ai/DeepSeek-V3.2-Exp")
+    """
+    <|begin▁of▁sentence|><|User|>Question: What color is the night sky?
+    <|Assistant|></think>Answer:
+    """
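A hedged usage sketch of the new wrapper: the alias names, constructor arguments, and `generate_from_messages` signature come from the file above, and it assumes `OPENAI_API_KEY` is set. The `Role` import and its `USER` member are assumptions inferred from how roles are checked in the wrapper; the formatter module itself is unchanged and not shown here.

```python
# Sketch only: class and method names are taken from the diff above; the Role
# enum and Role.USER are assumed, not confirmed by this diff.
from eval_framework.llm.openai import OpenAIModel
from template_formatting.formatter import Message, Role  # Role is an assumption

model = OpenAIModel(model_name="gpt-4o-mini-2024-07-18", temperature=0.0)
completions = model.generate_from_messages(
    [[Message(role=Role.USER, content="Name three prime numbers.")]],
    max_tokens=64,
)
print(completions[0].completion)
```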
{eval_framework-0.2.3 → eval_framework-0.2.4}/src/eval_framework/main.py

@@ -29,12 +29,13 @@ def main(
     trial_id: int | None = None,
     *args: Any,
     resource_cleanup: bool = False,
+    verbosity: int = 1,
 ) -> list[Result]:
     """Runs the entire evaluation process: responses generation and evaluation."""
     # Set up centralized logging early
     output_dir = generate_output_dir(llm.name, config)
-
-
+    setup_logging(output_dir=output_dir, log_level=verbosity, log_filename="evaluation.log")
+    logger.info(f"Output directory for evaluation: {output_dir}")

     logger.info(f"{RED}[ Running full evaluation process ------- ]{RESET}")
     logger.info(f"Evaluating {llm.name} on {config.task_name}")
eval_framework-0.2.4/src/eval_framework/metrics/completion/aidanbench.py

@@ -0,0 +1,28 @@
+import logging
+
+from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.shared.types import Completion
+
+logger = logging.getLogger(__name__)
+
+
+class AidanBenchMetric(BaseMetric[Completion]):
+    NAME = "AidanBench"
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        # subtract 2 to not count 1) initial instruction and 2) the latest model response, which caused the stop
+        # i.e. was not (unique && coherent)
+        num_unique_responses = len(response.messages) - 2 if response.messages is not None else 0
+        if num_unique_responses < 0:
+            logger.warning(
+                "Number of unique responses calculated as negative, setting to 0."
+                "Likely something went wrong during answer generation."
+            )
+            num_unique_responses = 0
+        return [
+            MetricResult(
+                metric_name=f"{self.NAME}/num_responses",
+                value=num_unique_responses,
+                higher_is_better=True,
+            )
+        ]
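The reported value is simply the length of the accumulated conversation minus two: the initial instruction and the final answer that failed the uniqueness/coherence check are excluded. For a run with the instruction plus three model answers, the metric is therefore 2:

```python
# Mirrors the counting rule in AidanBenchMetric.calculate above; the strings
# stand in for the real message objects.
messages = ["initial instruction", "answer 1", "answer 2", "rejected answer"]
num_unique_responses = max(len(messages) - 2, 0)
assert num_unique_responses == 2
```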