eval-framework 0.2.10__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.10 → eval_framework-0.2.12}/PKG-INFO +5 -5
- {eval_framework-0.2.10 → eval_framework-0.2.12}/pyproject.toml +5 -5
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/aleph_alpha.py +14 -70
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/base.py +8 -6
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/huggingface.py +12 -13
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/vllm.py +24 -23
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/base.py +0 -2
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/duc.py +11 -7
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py +1 -2
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py +1 -4
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_de.py +1 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +10 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/piqa.py +1 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/squad.py +0 -1
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py +32 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/LICENSE +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/README.md +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/run.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.10 → eval_framework-0.2.12}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.2.10
|
|
3
|
+
Version: 0.2.12
|
|
4
4
|
Summary: Evalulation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -214,7 +214,7 @@ Classifier: Typing :: Typed
|
|
|
214
214
|
Requires-Dist: pyyaml>=6.0.1,<7
|
|
215
215
|
Requires-Dist: xmltodict>=0.13.0,<0.16
|
|
216
216
|
Requires-Dist: pydantic>=2.7,<3
|
|
217
|
-
Requires-Dist: datasets>=
|
|
217
|
+
Requires-Dist: datasets>=4.0.0,<5
|
|
218
218
|
Requires-Dist: sacrebleu>=2.4.3,<3
|
|
219
219
|
Requires-Dist: pycountry>=24.6.1,<25
|
|
220
220
|
Requires-Dist: nltk>=3.9.1,<4
|
|
@@ -238,15 +238,15 @@ Requires-Dist: numpy>=1.26.4
|
|
|
238
238
|
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
239
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
240
240
|
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
|
|
241
|
-
Requires-Dist: aleph-alpha-client>=
|
|
241
|
+
Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
|
|
242
242
|
Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
|
|
243
243
|
Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
|
|
244
244
|
Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
|
|
245
245
|
Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
|
|
246
246
|
Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
|
|
247
247
|
Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
|
|
248
|
-
Requires-Dist: openai>=1.62,<
|
|
249
|
-
Requires-Dist: tiktoken>=0.9,<
|
|
248
|
+
Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
|
|
249
|
+
Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
|
|
250
250
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
|
|
251
251
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
|
|
252
252
|
Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.2.10"
|
|
3
|
+
version = "0.2.12"
|
|
4
4
|
description = "Evalulation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -21,7 +21,7 @@ dependencies = [
|
|
|
21
21
|
"pyyaml>=6.0.1,<7",
|
|
22
22
|
"xmltodict>=0.13.0,<0.16",
|
|
23
23
|
"pydantic>=2.7,<3",
|
|
24
|
-
"datasets>=
|
|
24
|
+
"datasets>=4.0.0,<5",
|
|
25
25
|
"sacrebleu>=2.4.3,<3",
|
|
26
26
|
"pycountry>=24.6.1,<25",
|
|
27
27
|
"nltk>=3.9.1,<4",
|
|
@@ -53,10 +53,10 @@ determined = [
|
|
|
53
53
|
"determined>=0.38,<0.39",
|
|
54
54
|
"tensorboard==2.19.0"
|
|
55
55
|
]
|
|
56
|
-
api = ["aleph-alpha-client>=
|
|
56
|
+
api = ["aleph-alpha-client>=11.5.1"]
|
|
57
57
|
openai = [
|
|
58
|
-
"openai>=1.62,<
|
|
59
|
-
"tiktoken>=0.9,<
|
|
58
|
+
"openai>=1.62,<3",
|
|
59
|
+
"tiktoken>=0.9,<1",
|
|
60
60
|
"transformers>=4.45.2,<5",
|
|
61
61
|
]
|
|
62
62
|
transformers = [
|
|
@@ -3,16 +3,12 @@ import json
|
|
|
3
3
|
import logging
|
|
4
4
|
import math
|
|
5
5
|
import os
|
|
6
|
-
import random
|
|
7
6
|
import re
|
|
8
|
-
import time
|
|
9
7
|
import traceback
|
|
10
8
|
from collections.abc import Callable, Sequence
|
|
11
9
|
|
|
12
|
-
import aiohttp
|
|
13
10
|
from aleph_alpha_client import (
|
|
14
11
|
AsyncClient,
|
|
15
|
-
BusyError,
|
|
16
12
|
Client,
|
|
17
13
|
CompletionRequest,
|
|
18
14
|
CompletionResponse,
|
|
@@ -49,11 +45,11 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
49
45
|
formatter: BaseFormatter | None = None,
|
|
50
46
|
checkpoint_name: str | None = None,
|
|
51
47
|
temperature: float | None = None,
|
|
48
|
+
top_p: float | None = None,
|
|
52
49
|
# Please see README.md for tips if adapting the following parameters.
|
|
53
50
|
max_retries: int = 100,
|
|
54
51
|
max_async_concurrent_requests: int = 32,
|
|
55
52
|
request_timeout_seconds: int = 30 * 60 + 5,
|
|
56
|
-
queue_full_timeout_seconds: int = 30 * 60 + 5,
|
|
57
53
|
bytes_per_token: float | None = None,
|
|
58
54
|
token: str = os.getenv("AA_TOKEN", "dummy"),
|
|
59
55
|
base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
|
|
@@ -67,10 +63,10 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
67
63
|
self._formatter = formatter
|
|
68
64
|
self._llm_name = checkpoint_name or self.LLM_NAME
|
|
69
65
|
self._temperature = temperature if temperature is not None else 0.0
|
|
66
|
+
self._top_p = top_p if top_p is not None else 0.0
|
|
70
67
|
self.max_async_concurrent_requests = max_async_concurrent_requests
|
|
71
68
|
self.max_retries = max_retries
|
|
72
69
|
self.request_timeout_seconds = request_timeout_seconds
|
|
73
|
-
self.queue_full_timeout_seconds = queue_full_timeout_seconds
|
|
74
70
|
self.token = token
|
|
75
71
|
self.base_url = base_url
|
|
76
72
|
self._validate_model_availability(base_url, token)
|
|
@@ -101,56 +97,6 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
101
97
|
except Exception as e:
|
|
102
98
|
raise RuntimeError(f"Model '{self._llm_name}' is not available: {e}")
|
|
103
99
|
|
|
104
|
-
async def _request_with_backoff(
|
|
105
|
-
self, client: AsyncClient, request: CompletionRequest, id: int
|
|
106
|
-
) -> CompletionResponse:
|
|
107
|
-
"""
|
|
108
|
-
Query Aleph-Alpha API with complete. Retry with back-off until it responds.
|
|
109
|
-
"""
|
|
110
|
-
num_attempts = 0
|
|
111
|
-
start_time: float | None = None
|
|
112
|
-
|
|
113
|
-
while True:
|
|
114
|
-
try:
|
|
115
|
-
return await client.complete(request, model=self._llm_name)
|
|
116
|
-
|
|
117
|
-
except (TimeoutError, BusyError, RuntimeError, aiohttp.ClientError) as e:
|
|
118
|
-
status_code: str = safe_json_loads(e.args[1]).get("code", "") if len(e.args) >= 2 else ""
|
|
119
|
-
str_e = str(e)
|
|
120
|
-
if status_code == "QUEUE_FULL":
|
|
121
|
-
# Worker not available or missed a heartbeat (inference longer than scheduler's
|
|
122
|
-
# API_MODEL_AVAILABLE_TIMEOUT_DURATION_MILLIS) or the scheduler is overloaded.
|
|
123
|
-
if start_time is None:
|
|
124
|
-
start_time = time.time()
|
|
125
|
-
elapsed = time.time() - start_time
|
|
126
|
-
if elapsed <= self.queue_full_timeout_seconds:
|
|
127
|
-
logger.info(
|
|
128
|
-
f"Request {id}: {status_code or str_e[:256]} - retrying: attempt"
|
|
129
|
-
f" {num_attempts}/{self.max_retries}, elapsed {elapsed:.1f} sec"
|
|
130
|
-
)
|
|
131
|
-
# don't count as retry (request returns immediately, so just wait a bit not to DoS the server)
|
|
132
|
-
await asyncio.sleep(random.randint(5, 30))
|
|
133
|
-
continue
|
|
134
|
-
|
|
135
|
-
elif (
|
|
136
|
-
status_code == "TIMEOUT_TASK"
|
|
137
|
-
or isinstance(e, TimeoutError)
|
|
138
|
-
or "502 Bad Gateway" in str_e
|
|
139
|
-
or "504 Gateway Time-out" in str_e
|
|
140
|
-
or isinstance(e, aiohttp.ClientError)
|
|
141
|
-
):
|
|
142
|
-
# client timeout, either because task too long in a queue or inference too long
|
|
143
|
-
# (scheduler's API_CLIENT_TIMEOUT_DURATION_MILLIS). Retrying for the "inference too long"
|
|
144
|
-
# case makes no sense but we unfortunately don't know which case has happened.
|
|
145
|
-
num_attempts += 1
|
|
146
|
-
start_time = None
|
|
147
|
-
if num_attempts < self.max_retries:
|
|
148
|
-
logger.info(f"Request {id}: TIMEOUT_TASK - retrying: attempt {num_attempts}/{self.max_retries}")
|
|
149
|
-
await asyncio.sleep(random.randint(5, 30))
|
|
150
|
-
continue
|
|
151
|
-
|
|
152
|
-
raise e
|
|
153
|
-
|
|
154
100
|
def _error_from_exception(self, e: Exception) -> Error:
|
|
155
101
|
"""Convert an exception to an Error object."""
|
|
156
102
|
if len(e.args) >= 2:
|
|
@@ -171,39 +117,36 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
171
117
|
async def _process_request_with_client(
|
|
172
118
|
self,
|
|
173
119
|
client: AsyncClient,
|
|
174
|
-
semaphore: asyncio.Semaphore,
|
|
175
120
|
request: CompletionRequest,
|
|
176
121
|
id: int,
|
|
177
122
|
) -> tuple[CompletionRequest, CompletionResponse | Error]:
|
|
178
123
|
"""Process a single request, returning the request and either a response or error."""
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
return (request, self._error_from_exception(e))
|
|
124
|
+
try:
|
|
125
|
+
response = await client.complete(request, model=self._llm_name)
|
|
126
|
+
logger.info(f"Request {id}: Success")
|
|
127
|
+
return (request, response)
|
|
128
|
+
except Exception as e:
|
|
129
|
+
if raise_errors():
|
|
130
|
+
raise e
|
|
131
|
+
logger.info(f"Request {id}: Failure: {str(e)[:256]}")
|
|
132
|
+
return (request, self._error_from_exception(e))
|
|
189
133
|
|
|
190
134
|
async def _process_requests(
|
|
191
135
|
self,
|
|
192
136
|
requests: list[CompletionRequest],
|
|
193
137
|
) -> list[tuple[CompletionRequest, CompletionResponse | Error]]:
|
|
194
138
|
"""Process multiple requests concurrently, returning request/response pairs."""
|
|
195
|
-
semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
|
|
196
139
|
async with AsyncClient(
|
|
197
140
|
host=self.base_url,
|
|
198
141
|
nice=True,
|
|
199
142
|
request_timeout_seconds=self.request_timeout_seconds,
|
|
200
143
|
token=self.token,
|
|
201
|
-
total_retries=
|
|
144
|
+
total_retries=self.max_retries,
|
|
145
|
+
limit=self.max_async_concurrent_requests,
|
|
202
146
|
) as client:
|
|
203
147
|
tasks = (
|
|
204
148
|
self._process_request_with_client(
|
|
205
149
|
client,
|
|
206
|
-
semaphore,
|
|
207
150
|
request,
|
|
208
151
|
i,
|
|
209
152
|
)
|
|
@@ -272,6 +215,7 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
272
215
|
maximum_tokens=scaled_max_tokens,
|
|
273
216
|
stop_sequences=stop_sequences,
|
|
274
217
|
temperature=effective_temperature,
|
|
218
|
+
top_p=self._top_p,
|
|
275
219
|
)
|
|
276
220
|
)
|
|
277
221
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from collections.abc import Sequence
|
|
2
|
+
from collections.abc import Generator, Sequence
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
@@ -112,21 +113,22 @@ class BaseLLM(ABC):
|
|
|
112
113
|
"""
|
|
113
114
|
pass
|
|
114
115
|
|
|
116
|
+
@contextmanager
|
|
115
117
|
def _get_final_checkpoint(
|
|
116
118
|
self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
|
|
117
|
-
) -> tuple[str | Path | None, str | None]:
|
|
119
|
+
) -> Generator[tuple[str | Path | None, str | None], None, None]:
|
|
118
120
|
if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
|
|
119
121
|
if not getattr(self, "LLM_NAME", ""):
|
|
120
122
|
raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
|
|
121
|
-
|
|
123
|
+
yield None, None # no argument given, so will use the LLM_NAME of the class
|
|
122
124
|
elif num_provided > 1:
|
|
123
125
|
raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
|
|
124
126
|
|
|
125
127
|
elif checkpoint_path is not None:
|
|
126
|
-
|
|
128
|
+
yield checkpoint_path, str(checkpoint_path)
|
|
127
129
|
|
|
128
130
|
elif model_name is not None:
|
|
129
|
-
|
|
131
|
+
yield model_name, model_name
|
|
130
132
|
|
|
131
133
|
else:
|
|
132
134
|
from eval_framework.utils.file_ops import WandbFs
|
|
@@ -139,7 +141,7 @@ class BaseLLM(ABC):
|
|
|
139
141
|
file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
|
|
140
142
|
if file_root is None:
|
|
141
143
|
raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
|
|
142
|
-
|
|
144
|
+
yield file_root, artifact_name
|
|
143
145
|
|
|
144
146
|
def _get_final_formatter(
|
|
145
147
|
self,
|
|
@@ -322,22 +322,21 @@ class HFLLM(BaseHFLLM):
|
|
|
322
322
|
bytes_per_token: float | None = None,
|
|
323
323
|
**kwargs: Any,
|
|
324
324
|
) -> None:
|
|
325
|
-
|
|
325
|
+
with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
|
|
326
|
+
self.checkpoint_name = checkpoint_name
|
|
327
|
+
if self.checkpoint_name is None and possible_name is not None:
|
|
328
|
+
self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
|
|
326
329
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
|
|
330
|
+
if final_path:
|
|
331
|
+
self.LLM_NAME = str(final_path)
|
|
330
332
|
|
|
331
|
-
|
|
332
|
-
self.LLM_NAME = str(final_path)
|
|
333
|
+
final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
|
|
333
334
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
**kwargs,
|
|
340
|
-
)
|
|
335
|
+
super().__init__(
|
|
336
|
+
formatter=final_formatter,
|
|
337
|
+
bytes_per_token=bytes_per_token,
|
|
338
|
+
**kwargs,
|
|
339
|
+
)
|
|
341
340
|
|
|
342
341
|
@property
|
|
343
342
|
def name(self) -> str:
|
|
@@ -137,10 +137,12 @@ class BaseVLLMModel(BaseLLM):
|
|
|
137
137
|
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
138
138
|
|
|
139
139
|
self.batch_size = batch_size
|
|
140
|
-
self._tokenizer: None | VLLMTokenizerAPI = None
|
|
141
140
|
|
|
142
141
|
self.model = LLM(**model_args, device=device)
|
|
143
142
|
|
|
143
|
+
self._tokenizer: None | VLLMTokenizerAPI = None
|
|
144
|
+
_ = self.tokenizer # make sure tokenizer is initialized
|
|
145
|
+
|
|
144
146
|
self.sampling_params: SamplingParams = self._process_sampling_params(sampling_params)
|
|
145
147
|
|
|
146
148
|
logger.info(
|
|
@@ -481,28 +483,27 @@ class VLLMModel(BaseVLLMModel):
|
|
|
481
483
|
sampling_params: SamplingParams | dict[str, Any] | None = None,
|
|
482
484
|
**kwargs: Any,
|
|
483
485
|
) -> None:
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
)
|
|
486
|
+
with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
|
|
487
|
+
if final_path:
|
|
488
|
+
self.LLM_NAME = str(final_path)
|
|
489
|
+
|
|
490
|
+
final_name = checkpoint_name
|
|
491
|
+
if final_name is None and possible_name is not None:
|
|
492
|
+
final_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
|
|
493
|
+
|
|
494
|
+
final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
|
|
495
|
+
|
|
496
|
+
super().__init__(
|
|
497
|
+
formatter=final_formatter,
|
|
498
|
+
checkpoint_path=final_path,
|
|
499
|
+
checkpoint_name=final_name,
|
|
500
|
+
max_model_len=max_model_len,
|
|
501
|
+
tensor_parallel_size=tensor_parallel_size,
|
|
502
|
+
gpu_memory_utilization=gpu_memory_utilization,
|
|
503
|
+
batch_size=batch_size,
|
|
504
|
+
sampling_params=sampling_params,
|
|
505
|
+
**kwargs,
|
|
506
|
+
)
|
|
506
507
|
|
|
507
508
|
|
|
508
509
|
class VLLMRegistryModel(VLLMModel): # deprecated
|
|
@@ -171,7 +171,6 @@ class BaseTask[SubjectType](ABC):
|
|
|
171
171
|
return load_dataset(
|
|
172
172
|
**kwargs,
|
|
173
173
|
revision=self.HF_REVISION,
|
|
174
|
-
trust_remote_code=True,
|
|
175
174
|
cache_dir=cache_dir,
|
|
176
175
|
download_config=download_config,
|
|
177
176
|
)
|
|
@@ -179,7 +178,6 @@ class BaseTask[SubjectType](ABC):
|
|
|
179
178
|
return load_dataset(
|
|
180
179
|
**kwargs,
|
|
181
180
|
revision=self.HF_REVISION,
|
|
182
|
-
trust_remote_code=True,
|
|
183
181
|
cache_dir=f"{Path.home()}/.cache/eval-framework",
|
|
184
182
|
)
|
|
185
183
|
|
|
@@ -12,11 +12,12 @@ class DUC(BaseTask[str], ABC):
|
|
|
12
12
|
"""https://huggingface.co/datasets/midas/duc2001"""
|
|
13
13
|
|
|
14
14
|
DATASET_PATH: str = "midas/duc2001"
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
HF_REVISION: str = "77d6dedcbce421695a12f24c8802e8847a129d92"
|
|
16
|
+
SAMPLE_SPLIT: str = "train"
|
|
17
|
+
FEWSHOT_SPLIT: str = "train"
|
|
17
18
|
RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
|
|
18
19
|
METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
|
|
19
|
-
SUBJECTS: list[str] = ["
|
|
20
|
+
SUBJECTS: list[str] = ["default"]
|
|
20
21
|
PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
|
|
21
22
|
LANGUAGE = Language.ENG
|
|
22
23
|
|
|
@@ -33,6 +34,10 @@ class DUC(BaseTask[str], ABC):
|
|
|
33
34
|
completion_text = completion_text.strip()
|
|
34
35
|
return completion_text
|
|
35
36
|
|
|
37
|
+
def _load_dataset(self, subject: str) -> None:
|
|
38
|
+
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
|
|
39
|
+
self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)
|
|
40
|
+
|
|
36
41
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
37
42
|
instruction_text = " ".join(item["document"])
|
|
38
43
|
instruction_text = re.sub(r"\s+([.,!?;:])", r"\1", instruction_text)
|
|
@@ -47,7 +52,7 @@ class DUC(BaseTask[str], ABC):
|
|
|
47
52
|
|
|
48
53
|
class DUC_EXTRACTIVE(DUC):
|
|
49
54
|
NAME = "DUC Extractive"
|
|
50
|
-
SUBJECTS: list[str] = ["
|
|
55
|
+
SUBJECTS: list[str] = ["default"]
|
|
51
56
|
|
|
52
57
|
def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
|
|
53
58
|
return item["extractive_keyphrases"]
|
|
@@ -61,14 +66,13 @@ class DUC_EXTRACTIVE(DUC):
|
|
|
61
66
|
|
|
62
67
|
class DUC_ABSTRACTIVE(DUC):
|
|
63
68
|
NAME = "DUC Abstractive"
|
|
64
|
-
SUBJECTS: list[str] = ["
|
|
69
|
+
SUBJECTS: list[str] = ["default"]
|
|
65
70
|
|
|
66
71
|
def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
|
|
67
72
|
return item["abstractive_keyphrases"]
|
|
68
73
|
|
|
69
74
|
def _load_dataset(self, subject: str) -> None:
|
|
70
|
-
|
|
71
|
-
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
|
|
75
|
+
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
|
|
72
76
|
self.dataset = {}
|
|
73
77
|
|
|
74
78
|
for split, data in hf_dataset.items():
|
{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
@@ -25,6 +25,7 @@ class Flores200(BaseTask[str]):
|
|
|
25
25
|
|
|
26
26
|
NAME = "FLoRes-200"
|
|
27
27
|
DATASET_PATH = "facebook/flores"
|
|
28
|
+
HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
|
|
28
29
|
SAMPLE_SPLIT = "devtest"
|
|
29
30
|
FEWSHOT_SPLIT = "dev"
|
|
30
31
|
RESPONSE_TYPE = ResponseType.COMPLETION
|
|
@@ -66,7 +67,6 @@ class Flores200(BaseTask[str]):
|
|
|
66
67
|
split=kwargs.get("split"),
|
|
67
68
|
data_files=None, # Let it auto-discover parquet files
|
|
68
69
|
revision=self.HF_REVISION,
|
|
69
|
-
trust_remote_code=False, # Disable the loading script!
|
|
70
70
|
cache_dir=cache_dir,
|
|
71
71
|
download_config=download_config,
|
|
72
72
|
)
|
|
@@ -79,7 +79,6 @@ class Flores200(BaseTask[str]):
|
|
|
79
79
|
dataset = load_dataset(
|
|
80
80
|
**kwargs,
|
|
81
81
|
revision=self.HF_REVISION,
|
|
82
|
-
trust_remote_code=True,
|
|
83
82
|
cache_dir=cache_dir,
|
|
84
83
|
download_config=download_config,
|
|
85
84
|
)
|
{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
@@ -39,13 +39,10 @@ class InfiniteBench(BaseTask[str], ABC):
|
|
|
39
39
|
}
|
|
40
40
|
)
|
|
41
41
|
try:
|
|
42
|
-
return load_dataset(
|
|
43
|
-
**kwargs, trust_remote_code=True, cache_dir=cache_dir, download_config=download_config, features=ft
|
|
44
|
-
)
|
|
42
|
+
return load_dataset(**kwargs, cache_dir=cache_dir, download_config=download_config, features=ft)
|
|
45
43
|
except Exception:
|
|
46
44
|
return load_dataset(
|
|
47
45
|
**kwargs,
|
|
48
|
-
trust_remote_code=True,
|
|
49
46
|
cache_dir=f"{Path.home()}/.cache/eval-framework",
|
|
50
47
|
features=ft,
|
|
51
48
|
)
|
{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
@@ -27,6 +27,7 @@ class ARC_EU20_DE(ARC):
|
|
|
27
27
|
|
|
28
28
|
NAME = "ARC_EU20_DE"
|
|
29
29
|
DATASET_PATH = "openGPT-X/arcx"
|
|
30
|
+
HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
|
|
30
31
|
SAMPLE_SPLIT = "test"
|
|
31
32
|
FEWSHOT_SPLIT = "train"
|
|
32
33
|
SUBJECTS = ["challenge_DE", "easy_DE"]
|
|
@@ -36,6 +37,7 @@ class ARC_EU20_DE(ARC):
|
|
|
36
37
|
class ARC_EU20_FR(ARC):
|
|
37
38
|
NAME = "ARC_EU20_FR"
|
|
38
39
|
DATASET_PATH = "openGPT-X/arcx"
|
|
40
|
+
HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
|
|
39
41
|
SAMPLE_SPLIT = "test"
|
|
40
42
|
FEWSHOT_SPLIT = "train"
|
|
41
43
|
SUBJECTS = ["challenge_FR", "easy_FR"]
|
|
@@ -51,6 +53,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
|
|
|
51
53
|
""" # noqa: E501
|
|
52
54
|
|
|
53
55
|
NAME = "GSM8K_EU20_DE"
|
|
56
|
+
HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
|
|
54
57
|
DATASET_PATH = "openGPT-X/gsm8kx"
|
|
55
58
|
SAMPLE_SPLIT = "test"
|
|
56
59
|
FEWSHOT_SPLIT = "train"
|
|
@@ -60,6 +63,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
|
|
|
60
63
|
|
|
61
64
|
class GSM8K_EU20_FR(GSM8KEvalHarness):
|
|
62
65
|
NAME = "GSM8K_EU20_FR"
|
|
66
|
+
HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
|
|
63
67
|
DATASET_PATH = "openGPT-X/gsm8kx"
|
|
64
68
|
SAMPLE_SPLIT = "test"
|
|
65
69
|
FEWSHOT_SPLIT = "train"
|
|
@@ -77,6 +81,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
|
|
|
77
81
|
|
|
78
82
|
NAME = "HellaSwag_EU20_DE"
|
|
79
83
|
DATASET_PATH = "openGPT-X/hellaswagx"
|
|
84
|
+
HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
|
|
80
85
|
SAMPLE_SPLIT = "train"
|
|
81
86
|
FEWSHOT_SPLIT = "validation"
|
|
82
87
|
SUBJECTS = ["DE"]
|
|
@@ -86,6 +91,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
|
|
|
86
91
|
class HELLASWAG_EU20_FR(HELLASWAG):
|
|
87
92
|
NAME = "HellaSwag_EU20_FR"
|
|
88
93
|
DATASET_PATH = "openGPT-X/hellaswagx"
|
|
94
|
+
HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
|
|
89
95
|
SAMPLE_SPLIT = "train"
|
|
90
96
|
FEWSHOT_SPLIT = "validation"
|
|
91
97
|
SUBJECTS = ["FR"]
|
|
@@ -128,6 +134,7 @@ class TRUTHFULQA_EU20_DE(TRUTHFULQA):
|
|
|
128
134
|
|
|
129
135
|
NAME = "TruthfulQA_EU20_DE"
|
|
130
136
|
DATASET_PATH = "openGPT-X/truthfulqax"
|
|
137
|
+
HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
|
|
131
138
|
LANGUAGE = Language.DEU
|
|
132
139
|
|
|
133
140
|
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
|
|
@@ -182,6 +189,7 @@ TRUTHFULQA_EU20_FR_FEWSHOT_ITEMS = [
|
|
|
182
189
|
class TRUTHFULQA_EU20_FR(TRUTHFULQA):
|
|
183
190
|
NAME = "TruthfulQA_EU20_FR"
|
|
184
191
|
DATASET_PATH = "openGPT-X/truthfulqax"
|
|
192
|
+
HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
|
|
185
193
|
LANGUAGE = Language.FRA
|
|
186
194
|
|
|
187
195
|
def _load_dataset(self, subject: SubjectType) -> None:
|
|
@@ -214,6 +222,7 @@ class MMLU_EU20_DE(MMLU):
|
|
|
214
222
|
|
|
215
223
|
NAME = "MMLU_EU20_DE"
|
|
216
224
|
DATASET_PATH = "openGPT-X/mmlux"
|
|
225
|
+
HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
|
|
217
226
|
SAMPLE_SPLIT = "test"
|
|
218
227
|
FEWSHOT_SPLIT = "dev" # one could merge dev and validation to have a larger pool of fewshot examples
|
|
219
228
|
SUBJECTS = [i + "_DE" for i in MMLU_SUBJECTS]
|
|
@@ -321,6 +330,7 @@ MMLU_SUBJECTS_TRANSLATION_FR = {
|
|
|
321
330
|
class MMLU_EU20_FR(MMLU):
|
|
322
331
|
NAME = "MMLU_EU20_FR"
|
|
323
332
|
DATASET_PATH = "openGPT-X/mmlux"
|
|
333
|
+
HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
|
|
324
334
|
SAMPLE_SPLIT = "test"
|
|
325
335
|
FEWSHOT_SPLIT = "dev"
|
|
326
336
|
SUBJECTS = [i + "_FR" for i in MMLU_SUBJECTS]
|
|
@@ -15,6 +15,7 @@ class PIQA(BaseTask[str]):
|
|
|
15
15
|
|
|
16
16
|
NAME = "PIQA"
|
|
17
17
|
DATASET_PATH = "ybisk/piqa"
|
|
18
|
+
HF_REVISION = "6b3aceb3276e5ab7e51895d73151a718690af38c"
|
|
18
19
|
SAMPLE_SPLIT = "validation" # 1838 examples (same split as lm-eval)
|
|
19
20
|
FEWSHOT_SPLIT = "test" # 3084 examples
|
|
20
21
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py
RENAMED
|
@@ -1,5 +1,11 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
1
3
|
from typing import Any
|
|
2
4
|
|
|
5
|
+
from datasets import DownloadConfig, load_dataset
|
|
6
|
+
from huggingface_hub import HfApi
|
|
7
|
+
from huggingface_hub.errors import RevisionNotFoundError
|
|
8
|
+
|
|
3
9
|
from eval_framework.tasks.base import Language
|
|
4
10
|
from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
|
|
5
11
|
|
|
@@ -19,6 +25,7 @@ class WINOX(WINOGRANDE):
|
|
|
19
25
|
"""
|
|
20
26
|
|
|
21
27
|
DATASET_PATH = "demelin/wino_x"
|
|
28
|
+
HF_REVISION = "7d82697fd52ac8b03e62aadfddc61077320f21e7"
|
|
22
29
|
SAMPLE_SPLIT = "test"
|
|
23
30
|
FEWSHOT_SPLIT = "test"
|
|
24
31
|
LANGUAGE_SHORT_CODE = ""
|
|
@@ -42,6 +49,31 @@ class WINOX(WINOGRANDE):
|
|
|
42
49
|
]
|
|
43
50
|
return choices
|
|
44
51
|
|
|
52
|
+
def _load_hf_dataset(self, **kwargs: Any) -> Any:
|
|
53
|
+
"""Override to validate HF_REVISION and load the Wino-X dataset from auto-discovered parquet files."""
|
|
54
|
+
# Check if the HF_REVISION is valid before loading the dataset
|
|
55
|
+
if self.HF_REVISION:
|
|
56
|
+
try:
|
|
57
|
+
_ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
|
|
58
|
+
except Exception as e:
|
|
59
|
+
if isinstance(e, RevisionNotFoundError):
|
|
60
|
+
raise e
|
|
61
|
+
|
|
62
|
+
cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
|
|
63
|
+
download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
|
|
64
|
+
|
|
65
|
+
dataset = load_dataset(
|
|
66
|
+
kwargs.get("path", self.DATASET_PATH),
|
|
67
|
+
name=kwargs.get("name"),
|
|
68
|
+
split=kwargs.get("split"),
|
|
69
|
+
data_files=None, # Let it auto-discover parquet files
|
|
70
|
+
revision=self.HF_REVISION,
|
|
71
|
+
cache_dir=cache_dir,
|
|
72
|
+
download_config=download_config,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
return dataset
|
|
76
|
+
|
|
45
77
|
|
|
46
78
|
class WINOX_DE(WINOX):
|
|
47
79
|
NAME = "WINOX_DE"
|
{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
@@ -16,6 +16,7 @@ class ZERO_SCROLLS_QUALITY(BaseTask[str]):
|
|
|
16
16
|
|
|
17
17
|
NAME = "ZeroSCROLLS QuALITY"
|
|
18
18
|
DATASET_PATH = "tau/zero_scrolls"
|
|
19
|
+
HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
|
|
19
20
|
SAMPLE_SPLIT = "validation"
|
|
20
21
|
FEWSHOT_SPLIT = "validation"
|
|
21
22
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
|
@@ -48,6 +49,7 @@ class ZERO_SCROLLS_COMPLETION(BaseTask[str]):
|
|
|
48
49
|
"""ZeroSCROLLS dataset: https://huggingface.co/datasets/tau/zero_scrolls"""
|
|
49
50
|
|
|
50
51
|
DATASET_PATH = "tau/zero_scrolls"
|
|
52
|
+
HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
|
|
51
53
|
SAMPLE_SPLIT = "validation"
|
|
52
54
|
FEWSHOT_SPLIT = "validation"
|
|
53
55
|
RESPONSE_TYPE = ResponseType.COMPLETION
|