eval-framework 0.2.11__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.11 → eval_framework-0.2.12}/PKG-INFO +4 -4
- {eval_framework-0.2.11 → eval_framework-0.2.12}/pyproject.toml +4 -4
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/aleph_alpha.py +3 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/base.py +0 -2
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/duc.py +11 -7
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py +1 -2
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py +1 -4
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_de.py +1 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +10 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/piqa.py +1 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/squad.py +0 -1
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py +32 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +2 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/LICENSE +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/README.md +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/run.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/casehold.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/math_reasoning.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/truthfulqa.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogrande.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.2.11
|
|
3
|
+
Version: 0.2.12
|
|
4
4
|
Summary: Evalulation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -214,7 +214,7 @@ Classifier: Typing :: Typed
|
|
|
214
214
|
Requires-Dist: pyyaml>=6.0.1,<7
|
|
215
215
|
Requires-Dist: xmltodict>=0.13.0,<0.16
|
|
216
216
|
Requires-Dist: pydantic>=2.7,<3
|
|
217
|
-
Requires-Dist: datasets>=
|
|
217
|
+
Requires-Dist: datasets>=4.0.0,<5
|
|
218
218
|
Requires-Dist: sacrebleu>=2.4.3,<3
|
|
219
219
|
Requires-Dist: pycountry>=24.6.1,<25
|
|
220
220
|
Requires-Dist: nltk>=3.9.1,<4
|
|
@@ -245,8 +245,8 @@ Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
|
|
|
245
245
|
Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
|
|
246
246
|
Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
|
|
247
247
|
Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
|
|
248
|
-
Requires-Dist: openai>=1.62,<
|
|
249
|
-
Requires-Dist: tiktoken>=0.9,<
|
|
248
|
+
Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
|
|
249
|
+
Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
|
|
250
250
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
|
|
251
251
|
Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
|
|
252
252
|
Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.2.11"
|
|
3
|
+
version = "0.2.12"
|
|
4
4
|
description = "Evalulation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -21,7 +21,7 @@ dependencies = [
|
|
|
21
21
|
"pyyaml>=6.0.1,<7",
|
|
22
22
|
"xmltodict>=0.13.0,<0.16",
|
|
23
23
|
"pydantic>=2.7,<3",
|
|
24
|
-
"datasets>=
|
|
24
|
+
"datasets>=4.0.0,<5",
|
|
25
25
|
"sacrebleu>=2.4.3,<3",
|
|
26
26
|
"pycountry>=24.6.1,<25",
|
|
27
27
|
"nltk>=3.9.1,<4",
|
|
@@ -55,8 +55,8 @@ determined = [
|
|
|
55
55
|
]
|
|
56
56
|
api = ["aleph-alpha-client>=11.5.1"]
|
|
57
57
|
openai = [
|
|
58
|
-
"openai>=1.62,<
|
|
59
|
-
"tiktoken>=0.9,<
|
|
58
|
+
"openai>=1.62,<3",
|
|
59
|
+
"tiktoken>=0.9,<1",
|
|
60
60
|
"transformers>=4.45.2,<5",
|
|
61
61
|
]
|
|
62
62
|
transformers = [
|
|
@@ -45,6 +45,7 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
45
45
|
formatter: BaseFormatter | None = None,
|
|
46
46
|
checkpoint_name: str | None = None,
|
|
47
47
|
temperature: float | None = None,
|
|
48
|
+
top_p: float | None = None,
|
|
48
49
|
# Please see README.md for tips if adapting the following parameters.
|
|
49
50
|
max_retries: int = 100,
|
|
50
51
|
max_async_concurrent_requests: int = 32,
|
|
@@ -62,6 +63,7 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
62
63
|
self._formatter = formatter
|
|
63
64
|
self._llm_name = checkpoint_name or self.LLM_NAME
|
|
64
65
|
self._temperature = temperature if temperature is not None else 0.0
|
|
66
|
+
self._top_p = top_p if top_p is not None else 0.0
|
|
65
67
|
self.max_async_concurrent_requests = max_async_concurrent_requests
|
|
66
68
|
self.max_retries = max_retries
|
|
67
69
|
self.request_timeout_seconds = request_timeout_seconds
|
|
@@ -213,6 +215,7 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
213
215
|
maximum_tokens=scaled_max_tokens,
|
|
214
216
|
stop_sequences=stop_sequences,
|
|
215
217
|
temperature=effective_temperature,
|
|
218
|
+
top_p=self._top_p,
|
|
216
219
|
)
|
|
217
220
|
)
|
|
218
221
|
|
|
@@ -171,7 +171,6 @@ class BaseTask[SubjectType](ABC):
|
|
|
171
171
|
return load_dataset(
|
|
172
172
|
**kwargs,
|
|
173
173
|
revision=self.HF_REVISION,
|
|
174
|
-
trust_remote_code=True,
|
|
175
174
|
cache_dir=cache_dir,
|
|
176
175
|
download_config=download_config,
|
|
177
176
|
)
|
|
@@ -179,7 +178,6 @@ class BaseTask[SubjectType](ABC):
|
|
|
179
178
|
return load_dataset(
|
|
180
179
|
**kwargs,
|
|
181
180
|
revision=self.HF_REVISION,
|
|
182
|
-
trust_remote_code=True,
|
|
183
181
|
cache_dir=f"{Path.home()}/.cache/eval-framework",
|
|
184
182
|
)
|
|
185
183
|
|
|
@@ -12,11 +12,12 @@ class DUC(BaseTask[str], ABC):
|
|
|
12
12
|
"""https://huggingface.co/datasets/midas/duc2001"""
|
|
13
13
|
|
|
14
14
|
DATASET_PATH: str = "midas/duc2001"
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
HF_REVISION: str = "77d6dedcbce421695a12f24c8802e8847a129d92"
|
|
16
|
+
SAMPLE_SPLIT: str = "train"
|
|
17
|
+
FEWSHOT_SPLIT: str = "train"
|
|
17
18
|
RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
|
|
18
19
|
METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
|
|
19
|
-
SUBJECTS: list[str] = ["
|
|
20
|
+
SUBJECTS: list[str] = ["default"]
|
|
20
21
|
PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
|
|
21
22
|
LANGUAGE = Language.ENG
|
|
22
23
|
|
|
@@ -33,6 +34,10 @@ class DUC(BaseTask[str], ABC):
|
|
|
33
34
|
completion_text = completion_text.strip()
|
|
34
35
|
return completion_text
|
|
35
36
|
|
|
37
|
+
def _load_dataset(self, subject: str) -> None:
|
|
38
|
+
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
|
|
39
|
+
self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)
|
|
40
|
+
|
|
36
41
|
def _get_instruction_text(self, item: dict[str, Any]) -> str:
|
|
37
42
|
instruction_text = " ".join(item["document"])
|
|
38
43
|
instruction_text = re.sub(r"\s+([.,!?;:])", r"\1", instruction_text)
|
|
@@ -47,7 +52,7 @@ class DUC(BaseTask[str], ABC):
|
|
|
47
52
|
|
|
48
53
|
class DUC_EXTRACTIVE(DUC):
|
|
49
54
|
NAME = "DUC Extractive"
|
|
50
|
-
SUBJECTS: list[str] = ["
|
|
55
|
+
SUBJECTS: list[str] = ["default"]
|
|
51
56
|
|
|
52
57
|
def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
|
|
53
58
|
return item["extractive_keyphrases"]
|
|
@@ -61,14 +66,13 @@ class DUC_EXTRACTIVE(DUC):
|
|
|
61
66
|
|
|
62
67
|
class DUC_ABSTRACTIVE(DUC):
|
|
63
68
|
NAME = "DUC Abstractive"
|
|
64
|
-
SUBJECTS: list[str] = ["
|
|
69
|
+
SUBJECTS: list[str] = ["default"]
|
|
65
70
|
|
|
66
71
|
def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
|
|
67
72
|
return item["abstractive_keyphrases"]
|
|
68
73
|
|
|
69
74
|
def _load_dataset(self, subject: str) -> None:
|
|
70
|
-
|
|
71
|
-
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
|
|
75
|
+
hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
|
|
72
76
|
self.dataset = {}
|
|
73
77
|
|
|
74
78
|
for split, data in hf_dataset.items():
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
@@ -25,6 +25,7 @@ class Flores200(BaseTask[str]):
|
|
|
25
25
|
|
|
26
26
|
NAME = "FLoRes-200"
|
|
27
27
|
DATASET_PATH = "facebook/flores"
|
|
28
|
+
HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
|
|
28
29
|
SAMPLE_SPLIT = "devtest"
|
|
29
30
|
FEWSHOT_SPLIT = "dev"
|
|
30
31
|
RESPONSE_TYPE = ResponseType.COMPLETION
|
|
@@ -66,7 +67,6 @@ class Flores200(BaseTask[str]):
|
|
|
66
67
|
split=kwargs.get("split"),
|
|
67
68
|
data_files=None, # Let it auto-discover parquet files
|
|
68
69
|
revision=self.HF_REVISION,
|
|
69
|
-
trust_remote_code=False, # Disable the loading script!
|
|
70
70
|
cache_dir=cache_dir,
|
|
71
71
|
download_config=download_config,
|
|
72
72
|
)
|
|
@@ -79,7 +79,6 @@ class Flores200(BaseTask[str]):
|
|
|
79
79
|
dataset = load_dataset(
|
|
80
80
|
**kwargs,
|
|
81
81
|
revision=self.HF_REVISION,
|
|
82
|
-
trust_remote_code=True,
|
|
83
82
|
cache_dir=cache_dir,
|
|
84
83
|
download_config=download_config,
|
|
85
84
|
)
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
@@ -39,13 +39,10 @@ class InfiniteBench(BaseTask[str], ABC):
|
|
|
39
39
|
}
|
|
40
40
|
)
|
|
41
41
|
try:
|
|
42
|
-
return load_dataset(
|
|
43
|
-
**kwargs, trust_remote_code=True, cache_dir=cache_dir, download_config=download_config, features=ft
|
|
44
|
-
)
|
|
42
|
+
return load_dataset(**kwargs, cache_dir=cache_dir, download_config=download_config, features=ft)
|
|
45
43
|
except Exception:
|
|
46
44
|
return load_dataset(
|
|
47
45
|
**kwargs,
|
|
48
|
-
trust_remote_code=True,
|
|
49
46
|
cache_dir=f"{Path.home()}/.cache/eval-framework",
|
|
50
47
|
features=ft,
|
|
51
48
|
)
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
@@ -27,6 +27,7 @@ class ARC_EU20_DE(ARC):
|
|
|
27
27
|
|
|
28
28
|
NAME = "ARC_EU20_DE"
|
|
29
29
|
DATASET_PATH = "openGPT-X/arcx"
|
|
30
|
+
HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
|
|
30
31
|
SAMPLE_SPLIT = "test"
|
|
31
32
|
FEWSHOT_SPLIT = "train"
|
|
32
33
|
SUBJECTS = ["challenge_DE", "easy_DE"]
|
|
@@ -36,6 +37,7 @@ class ARC_EU20_DE(ARC):
|
|
|
36
37
|
class ARC_EU20_FR(ARC):
|
|
37
38
|
NAME = "ARC_EU20_FR"
|
|
38
39
|
DATASET_PATH = "openGPT-X/arcx"
|
|
40
|
+
HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
|
|
39
41
|
SAMPLE_SPLIT = "test"
|
|
40
42
|
FEWSHOT_SPLIT = "train"
|
|
41
43
|
SUBJECTS = ["challenge_FR", "easy_FR"]
|
|
@@ -51,6 +53,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
|
|
|
51
53
|
""" # noqa: E501
|
|
52
54
|
|
|
53
55
|
NAME = "GSM8K_EU20_DE"
|
|
56
|
+
HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
|
|
54
57
|
DATASET_PATH = "openGPT-X/gsm8kx"
|
|
55
58
|
SAMPLE_SPLIT = "test"
|
|
56
59
|
FEWSHOT_SPLIT = "train"
|
|
@@ -60,6 +63,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
|
|
|
60
63
|
|
|
61
64
|
class GSM8K_EU20_FR(GSM8KEvalHarness):
|
|
62
65
|
NAME = "GSM8K_EU20_FR"
|
|
66
|
+
HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
|
|
63
67
|
DATASET_PATH = "openGPT-X/gsm8kx"
|
|
64
68
|
SAMPLE_SPLIT = "test"
|
|
65
69
|
FEWSHOT_SPLIT = "train"
|
|
@@ -77,6 +81,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
|
|
|
77
81
|
|
|
78
82
|
NAME = "HellaSwag_EU20_DE"
|
|
79
83
|
DATASET_PATH = "openGPT-X/hellaswagx"
|
|
84
|
+
HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
|
|
80
85
|
SAMPLE_SPLIT = "train"
|
|
81
86
|
FEWSHOT_SPLIT = "validation"
|
|
82
87
|
SUBJECTS = ["DE"]
|
|
@@ -86,6 +91,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
|
|
|
86
91
|
class HELLASWAG_EU20_FR(HELLASWAG):
|
|
87
92
|
NAME = "HellaSwag_EU20_FR"
|
|
88
93
|
DATASET_PATH = "openGPT-X/hellaswagx"
|
|
94
|
+
HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
|
|
89
95
|
SAMPLE_SPLIT = "train"
|
|
90
96
|
FEWSHOT_SPLIT = "validation"
|
|
91
97
|
SUBJECTS = ["FR"]
|
|
@@ -128,6 +134,7 @@ class TRUTHFULQA_EU20_DE(TRUTHFULQA):
|
|
|
128
134
|
|
|
129
135
|
NAME = "TruthfulQA_EU20_DE"
|
|
130
136
|
DATASET_PATH = "openGPT-X/truthfulqax"
|
|
137
|
+
HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
|
|
131
138
|
LANGUAGE = Language.DEU
|
|
132
139
|
|
|
133
140
|
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
|
|
@@ -182,6 +189,7 @@ TRUTHFULQA_EU20_FR_FEWSHOT_ITEMS = [
|
|
|
182
189
|
class TRUTHFULQA_EU20_FR(TRUTHFULQA):
|
|
183
190
|
NAME = "TruthfulQA_EU20_FR"
|
|
184
191
|
DATASET_PATH = "openGPT-X/truthfulqax"
|
|
192
|
+
HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
|
|
185
193
|
LANGUAGE = Language.FRA
|
|
186
194
|
|
|
187
195
|
def _load_dataset(self, subject: SubjectType) -> None:
|
|
@@ -214,6 +222,7 @@ class MMLU_EU20_DE(MMLU):
|
|
|
214
222
|
|
|
215
223
|
NAME = "MMLU_EU20_DE"
|
|
216
224
|
DATASET_PATH = "openGPT-X/mmlux"
|
|
225
|
+
HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
|
|
217
226
|
SAMPLE_SPLIT = "test"
|
|
218
227
|
FEWSHOT_SPLIT = "dev" # one could merge dev and validation to have a larger pool of fewshot examples
|
|
219
228
|
SUBJECTS = [i + "_DE" for i in MMLU_SUBJECTS]
|
|
@@ -321,6 +330,7 @@ MMLU_SUBJECTS_TRANSLATION_FR = {
|
|
|
321
330
|
class MMLU_EU20_FR(MMLU):
|
|
322
331
|
NAME = "MMLU_EU20_FR"
|
|
323
332
|
DATASET_PATH = "openGPT-X/mmlux"
|
|
333
|
+
HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
|
|
324
334
|
SAMPLE_SPLIT = "test"
|
|
325
335
|
FEWSHOT_SPLIT = "dev"
|
|
326
336
|
SUBJECTS = [i + "_FR" for i in MMLU_SUBJECTS]
|
|
@@ -15,6 +15,7 @@ class PIQA(BaseTask[str]):
|
|
|
15
15
|
|
|
16
16
|
NAME = "PIQA"
|
|
17
17
|
DATASET_PATH = "ybisk/piqa"
|
|
18
|
+
HF_REVISION = "6b3aceb3276e5ab7e51895d73151a718690af38c"
|
|
18
19
|
SAMPLE_SPLIT = "validation" # 1838 examples (same split as lm-eval)
|
|
19
20
|
FEWSHOT_SPLIT = "test" # 3084 examples
|
|
20
21
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py
RENAMED
|
@@ -1,5 +1,11 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
1
3
|
from typing import Any
|
|
2
4
|
|
|
5
|
+
from datasets import DownloadConfig, load_dataset
|
|
6
|
+
from huggingface_hub import HfApi
|
|
7
|
+
from huggingface_hub.errors import RevisionNotFoundError
|
|
8
|
+
|
|
3
9
|
from eval_framework.tasks.base import Language
|
|
4
10
|
from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
|
|
5
11
|
|
|
@@ -19,6 +25,7 @@ class WINOX(WINOGRANDE):
|
|
|
19
25
|
"""
|
|
20
26
|
|
|
21
27
|
DATASET_PATH = "demelin/wino_x"
|
|
28
|
+
HF_REVISION = "7d82697fd52ac8b03e62aadfddc61077320f21e7"
|
|
22
29
|
SAMPLE_SPLIT = "test"
|
|
23
30
|
FEWSHOT_SPLIT = "test"
|
|
24
31
|
LANGUAGE_SHORT_CODE = ""
|
|
@@ -42,6 +49,31 @@ class WINOX(WINOGRANDE):
|
|
|
42
49
|
]
|
|
43
50
|
return choices
|
|
44
51
|
|
|
52
|
+
def _load_hf_dataset(self, **kwargs: Any) -> Any:
|
|
53
|
+
"""Override to handle FLORES-200 encoding issues by using parquet files."""
|
|
54
|
+
# Check if the HF_REVISION is valid before loading the dataset
|
|
55
|
+
if self.HF_REVISION:
|
|
56
|
+
try:
|
|
57
|
+
_ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
|
|
58
|
+
except Exception as e:
|
|
59
|
+
if isinstance(e, RevisionNotFoundError):
|
|
60
|
+
raise e
|
|
61
|
+
|
|
62
|
+
cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
|
|
63
|
+
download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
|
|
64
|
+
|
|
65
|
+
dataset = load_dataset(
|
|
66
|
+
kwargs.get("path", self.DATASET_PATH),
|
|
67
|
+
name=kwargs.get("name"),
|
|
68
|
+
split=kwargs.get("split"),
|
|
69
|
+
data_files=None, # Let it auto-discover parquet files
|
|
70
|
+
revision=self.HF_REVISION,
|
|
71
|
+
cache_dir=cache_dir,
|
|
72
|
+
download_config=download_config,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
return dataset
|
|
76
|
+
|
|
45
77
|
|
|
46
78
|
class WINOX_DE(WINOX):
|
|
47
79
|
NAME = "WINOX_DE"
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
@@ -16,6 +16,7 @@ class ZERO_SCROLLS_QUALITY(BaseTask[str]):
|
|
|
16
16
|
|
|
17
17
|
NAME = "ZeroSCROLLS QuALITY"
|
|
18
18
|
DATASET_PATH = "tau/zero_scrolls"
|
|
19
|
+
HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
|
|
19
20
|
SAMPLE_SPLIT = "validation"
|
|
20
21
|
FEWSHOT_SPLIT = "validation"
|
|
21
22
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
|
@@ -48,6 +49,7 @@ class ZERO_SCROLLS_COMPLETION(BaseTask[str]):
|
|
|
48
49
|
"""ZeroSCROLLS dataset: https://huggingface.co/datasets/tau/zero_scrolls"""
|
|
49
50
|
|
|
50
51
|
DATASET_PATH = "tau/zero_scrolls"
|
|
52
|
+
HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
|
|
51
53
|
SAMPLE_SPLIT = "validation"
|
|
52
54
|
FEWSHOT_SPLIT = "validation"
|
|
53
55
|
RESPONSE_TYPE = ResponseType.COMPLETION
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/bleu.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/chrf.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/completion/ter.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/base.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/arc_fi.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmmlu.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/pawsx.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/sphyr.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.11 → eval_framework-0.2.12}/src/template_formatting/mistral_formatter.py
RENAMED
|
File without changes
|
|
File without changes
|