eval-framework 0.2.9__tar.gz → 0.2.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.9 → eval_framework-0.2.11}/PKG-INFO +2 -2
- {eval_framework-0.2.9 → eval_framework-0.2.11}/pyproject.toml +2 -2
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/aleph_alpha.py +11 -70
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/base.py +8 -6
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/huggingface.py +12 -13
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/vllm.py +24 -23
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc.py +1 -1
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/casehold.py +3 -1
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -1
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/math_reasoning.py +1 -1
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/truthfulqa.py +1 -1
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winogrande.py +2 -2
- {eval_framework-0.2.9 → eval_framework-0.2.11}/LICENSE +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/README.md +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/determined.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/eval.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/context/local.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/response_generator.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/run.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/eval_config.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/task_names.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.9 → eval_framework-0.2.11}/src/template_formatting/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: eval-framework
|
|
3
|
-
Version: 0.2.9
|
|
3
|
+
Version: 0.2.11
|
|
4
4
|
Summary: Evalulation Framework
|
|
5
5
|
Author: Aleph Alpha Research
|
|
6
6
|
License: Apache License
|
|
@@ -238,7 +238,7 @@ Requires-Dist: numpy>=1.26.4
|
|
|
238
238
|
Requires-Dist: antlr4-python3-runtime==4.11.0
|
|
239
239
|
Requires-Dist: accelerate ; extra == 'accelerate'
|
|
240
240
|
Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
|
|
241
|
-
Requires-Dist: aleph-alpha-client>=
|
|
241
|
+
Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
|
|
242
242
|
Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
|
|
243
243
|
Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
|
|
244
244
|
Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "eval-framework"
|
|
3
|
-
version = "0.2.9"
|
|
3
|
+
version = "0.2.11"
|
|
4
4
|
description = "Evalulation Framework"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = { file = "LICENSE" }
|
|
@@ -53,7 +53,7 @@ determined = [
|
|
|
53
53
|
"determined>=0.38,<0.39",
|
|
54
54
|
"tensorboard==2.19.0"
|
|
55
55
|
]
|
|
56
|
-
api = ["aleph-alpha-client>=
|
|
56
|
+
api = ["aleph-alpha-client>=11.5.1"]
|
|
57
57
|
openai = [
|
|
58
58
|
"openai>=1.62,<2.8",
|
|
59
59
|
"tiktoken>=0.9,<0.10",
|
|
@@ -3,16 +3,12 @@ import json
|
|
|
3
3
|
import logging
|
|
4
4
|
import math
|
|
5
5
|
import os
|
|
6
|
-
import random
|
|
7
6
|
import re
|
|
8
|
-
import time
|
|
9
7
|
import traceback
|
|
10
8
|
from collections.abc import Callable, Sequence
|
|
11
9
|
|
|
12
|
-
import aiohttp
|
|
13
10
|
from aleph_alpha_client import (
|
|
14
11
|
AsyncClient,
|
|
15
|
-
BusyError,
|
|
16
12
|
Client,
|
|
17
13
|
CompletionRequest,
|
|
18
14
|
CompletionResponse,
|
|
@@ -53,7 +49,6 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
53
49
|
max_retries: int = 100,
|
|
54
50
|
max_async_concurrent_requests: int = 32,
|
|
55
51
|
request_timeout_seconds: int = 30 * 60 + 5,
|
|
56
|
-
queue_full_timeout_seconds: int = 30 * 60 + 5,
|
|
57
52
|
bytes_per_token: float | None = None,
|
|
58
53
|
token: str = os.getenv("AA_TOKEN", "dummy"),
|
|
59
54
|
base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
|
|
@@ -70,7 +65,6 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
70
65
|
self.max_async_concurrent_requests = max_async_concurrent_requests
|
|
71
66
|
self.max_retries = max_retries
|
|
72
67
|
self.request_timeout_seconds = request_timeout_seconds
|
|
73
|
-
self.queue_full_timeout_seconds = queue_full_timeout_seconds
|
|
74
68
|
self.token = token
|
|
75
69
|
self.base_url = base_url
|
|
76
70
|
self._validate_model_availability(base_url, token)
|
|
@@ -101,56 +95,6 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
101
95
|
except Exception as e:
|
|
102
96
|
raise RuntimeError(f"Model '{self._llm_name}' is not available: {e}")
|
|
103
97
|
|
|
104
|
-
async def _request_with_backoff(
|
|
105
|
-
self, client: AsyncClient, request: CompletionRequest, id: int
|
|
106
|
-
) -> CompletionResponse:
|
|
107
|
-
"""
|
|
108
|
-
Query Aleph-Alpha API with complete. Retry with back-off until it responds.
|
|
109
|
-
"""
|
|
110
|
-
num_attempts = 0
|
|
111
|
-
start_time: float | None = None
|
|
112
|
-
|
|
113
|
-
while True:
|
|
114
|
-
try:
|
|
115
|
-
return await client.complete(request, model=self._llm_name)
|
|
116
|
-
|
|
117
|
-
except (TimeoutError, BusyError, RuntimeError, aiohttp.ClientError) as e:
|
|
118
|
-
status_code: str = safe_json_loads(e.args[1]).get("code", "") if len(e.args) >= 2 else ""
|
|
119
|
-
str_e = str(e)
|
|
120
|
-
if status_code == "QUEUE_FULL":
|
|
121
|
-
# Worker not available or missed a heartbeat (inference longer than scheduler's
|
|
122
|
-
# API_MODEL_AVAILABLE_TIMEOUT_DURATION_MILLIS) or the scheduler is overloaded.
|
|
123
|
-
if start_time is None:
|
|
124
|
-
start_time = time.time()
|
|
125
|
-
elapsed = time.time() - start_time
|
|
126
|
-
if elapsed <= self.queue_full_timeout_seconds:
|
|
127
|
-
logger.info(
|
|
128
|
-
f"Request {id}: {status_code or str_e[:256]} - retrying: attempt"
|
|
129
|
-
f" {num_attempts}/{self.max_retries}, elapsed {elapsed:.1f} sec"
|
|
130
|
-
)
|
|
131
|
-
# don't count as retry (request returns immediately, so just wait a bit not to DoS the server)
|
|
132
|
-
await asyncio.sleep(random.randint(5, 30))
|
|
133
|
-
continue
|
|
134
|
-
|
|
135
|
-
elif (
|
|
136
|
-
status_code == "TIMEOUT_TASK"
|
|
137
|
-
or isinstance(e, TimeoutError)
|
|
138
|
-
or "502 Bad Gateway" in str_e
|
|
139
|
-
or "504 Gateway Time-out" in str_e
|
|
140
|
-
or isinstance(e, aiohttp.ClientError)
|
|
141
|
-
):
|
|
142
|
-
# client timeout, either because task too long in a queue or inference too long
|
|
143
|
-
# (scheduler's API_CLIENT_TIMEOUT_DURATION_MILLIS). Retrying for the "inference too long"
|
|
144
|
-
# case makes no sense but we unfortunately don't know which case has happened.
|
|
145
|
-
num_attempts += 1
|
|
146
|
-
start_time = None
|
|
147
|
-
if num_attempts < self.max_retries:
|
|
148
|
-
logger.info(f"Request {id}: TIMEOUT_TASK - retrying: attempt {num_attempts}/{self.max_retries}")
|
|
149
|
-
await asyncio.sleep(random.randint(5, 30))
|
|
150
|
-
continue
|
|
151
|
-
|
|
152
|
-
raise e
|
|
153
|
-
|
|
154
98
|
def _error_from_exception(self, e: Exception) -> Error:
|
|
155
99
|
"""Convert an exception to an Error object."""
|
|
156
100
|
if len(e.args) >= 2:
|
|
@@ -171,39 +115,36 @@ class AlephAlphaAPIModel(BaseLLM):
|
|
|
171
115
|
async def _process_request_with_client(
|
|
172
116
|
self,
|
|
173
117
|
client: AsyncClient,
|
|
174
|
-
semaphore: asyncio.Semaphore,
|
|
175
118
|
request: CompletionRequest,
|
|
176
119
|
id: int,
|
|
177
120
|
) -> tuple[CompletionRequest, CompletionResponse | Error]:
|
|
178
121
|
"""Process a single request, returning the request and either a response or error."""
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
return (request, self._error_from_exception(e))
|
|
122
|
+
try:
|
|
123
|
+
response = await client.complete(request, model=self._llm_name)
|
|
124
|
+
logger.info(f"Request {id}: Success")
|
|
125
|
+
return (request, response)
|
|
126
|
+
except Exception as e:
|
|
127
|
+
if raise_errors():
|
|
128
|
+
raise e
|
|
129
|
+
logger.info(f"Request {id}: Failure: {str(e)[:256]}")
|
|
130
|
+
return (request, self._error_from_exception(e))
|
|
189
131
|
|
|
190
132
|
async def _process_requests(
|
|
191
133
|
self,
|
|
192
134
|
requests: list[CompletionRequest],
|
|
193
135
|
) -> list[tuple[CompletionRequest, CompletionResponse | Error]]:
|
|
194
136
|
"""Process multiple requests concurrently, returning request/response pairs."""
|
|
195
|
-
semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
|
|
196
137
|
async with AsyncClient(
|
|
197
138
|
host=self.base_url,
|
|
198
139
|
nice=True,
|
|
199
140
|
request_timeout_seconds=self.request_timeout_seconds,
|
|
200
141
|
token=self.token,
|
|
201
|
-
total_retries=
|
|
142
|
+
total_retries=self.max_retries,
|
|
143
|
+
limit=self.max_async_concurrent_requests,
|
|
202
144
|
) as client:
|
|
203
145
|
tasks = (
|
|
204
146
|
self._process_request_with_client(
|
|
205
147
|
client,
|
|
206
|
-
semaphore,
|
|
207
148
|
request,
|
|
208
149
|
i,
|
|
209
150
|
)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from collections.abc import Sequence
|
|
2
|
+
from collections.abc import Generator, Sequence
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
@@ -112,21 +113,22 @@ class BaseLLM(ABC):
|
|
|
112
113
|
"""
|
|
113
114
|
pass
|
|
114
115
|
|
|
116
|
+
@contextmanager
|
|
115
117
|
def _get_final_checkpoint(
|
|
116
118
|
self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
|
|
117
|
-
) -> tuple[str | Path | None, str | None]:
|
|
119
|
+
) -> Generator[tuple[str | Path | None, str | None], None, None]:
|
|
118
120
|
if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
|
|
119
121
|
if not getattr(self, "LLM_NAME", ""):
|
|
120
122
|
raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
|
|
121
|
-
|
|
123
|
+
yield None, None # no argument given, so will use the LLM_NAME of the class
|
|
122
124
|
elif num_provided > 1:
|
|
123
125
|
raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
|
|
124
126
|
|
|
125
127
|
elif checkpoint_path is not None:
|
|
126
|
-
|
|
128
|
+
yield checkpoint_path, str(checkpoint_path)
|
|
127
129
|
|
|
128
130
|
elif model_name is not None:
|
|
129
|
-
|
|
131
|
+
yield model_name, model_name
|
|
130
132
|
|
|
131
133
|
else:
|
|
132
134
|
from eval_framework.utils.file_ops import WandbFs
|
|
@@ -139,7 +141,7 @@ class BaseLLM(ABC):
|
|
|
139
141
|
file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
|
|
140
142
|
if file_root is None:
|
|
141
143
|
raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
|
|
142
|
-
|
|
144
|
+
yield file_root, artifact_name
|
|
143
145
|
|
|
144
146
|
def _get_final_formatter(
|
|
145
147
|
self,
|
|
@@ -322,22 +322,21 @@ class HFLLM(BaseHFLLM):
|
|
|
322
322
|
bytes_per_token: float | None = None,
|
|
323
323
|
**kwargs: Any,
|
|
324
324
|
) -> None:
|
|
325
|
-
|
|
325
|
+
with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
|
|
326
|
+
self.checkpoint_name = checkpoint_name
|
|
327
|
+
if self.checkpoint_name is None and possible_name is not None:
|
|
328
|
+
self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
|
|
326
329
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
|
|
330
|
+
if final_path:
|
|
331
|
+
self.LLM_NAME = str(final_path)
|
|
330
332
|
|
|
331
|
-
|
|
332
|
-
self.LLM_NAME = str(final_path)
|
|
333
|
+
final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
|
|
333
334
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
**kwargs,
|
|
340
|
-
)
|
|
335
|
+
super().__init__(
|
|
336
|
+
formatter=final_formatter,
|
|
337
|
+
bytes_per_token=bytes_per_token,
|
|
338
|
+
**kwargs,
|
|
339
|
+
)
|
|
341
340
|
|
|
342
341
|
@property
|
|
343
342
|
def name(self) -> str:
|
|
@@ -137,10 +137,12 @@ class BaseVLLMModel(BaseLLM):
|
|
|
137
137
|
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
138
138
|
|
|
139
139
|
self.batch_size = batch_size
|
|
140
|
-
self._tokenizer: None | VLLMTokenizerAPI = None
|
|
141
140
|
|
|
142
141
|
self.model = LLM(**model_args, device=device)
|
|
143
142
|
|
|
143
|
+
self._tokenizer: None | VLLMTokenizerAPI = None
|
|
144
|
+
_ = self.tokenizer # make sure tokenizer is initialized
|
|
145
|
+
|
|
144
146
|
self.sampling_params: SamplingParams = self._process_sampling_params(sampling_params)
|
|
145
147
|
|
|
146
148
|
logger.info(
|
|
@@ -481,28 +483,27 @@ class VLLMModel(BaseVLLMModel):
|
|
|
481
483
|
sampling_params: SamplingParams | dict[str, Any] | None = None,
|
|
482
484
|
**kwargs: Any,
|
|
483
485
|
) -> None:
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
)
|
|
486
|
+
with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
|
|
487
|
+
if final_path:
|
|
488
|
+
self.LLM_NAME = str(final_path)
|
|
489
|
+
|
|
490
|
+
final_name = checkpoint_name
|
|
491
|
+
if final_name is None and possible_name is not None:
|
|
492
|
+
final_name = possible_name.replace("/", "_").replace(":", "_").strip("_") # sanitize pathname
|
|
493
|
+
|
|
494
|
+
final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
|
|
495
|
+
|
|
496
|
+
super().__init__(
|
|
497
|
+
formatter=final_formatter,
|
|
498
|
+
checkpoint_path=final_path,
|
|
499
|
+
checkpoint_name=final_name,
|
|
500
|
+
max_model_len=max_model_len,
|
|
501
|
+
tensor_parallel_size=tensor_parallel_size,
|
|
502
|
+
gpu_memory_utilization=gpu_memory_utilization,
|
|
503
|
+
batch_size=batch_size,
|
|
504
|
+
sampling_params=sampling_params,
|
|
505
|
+
**kwargs,
|
|
506
|
+
)
|
|
506
507
|
|
|
507
508
|
|
|
508
509
|
class VLLMRegistryModel(VLLMModel): # deprecated
|
|
@@ -15,7 +15,7 @@ class ARC(BaseTask[str]):
|
|
|
15
15
|
"""ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc"""
|
|
16
16
|
|
|
17
17
|
NAME = "ARC"
|
|
18
|
-
DATASET_PATH = "ai2_arc"
|
|
18
|
+
DATASET_PATH = "allenai/ai2_arc"
|
|
19
19
|
SAMPLE_SPLIT = "test"
|
|
20
20
|
FEWSHOT_SPLIT = "train"
|
|
21
21
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED
|
@@ -9,8 +9,10 @@ from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Languag
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class CASEHOLD(BaseTask[str]):
|
|
12
|
+
"""CASEHOLD dataset: https://huggingface.co/datasets/coastalcph/lex_glue"""
|
|
13
|
+
|
|
12
14
|
NAME = "CaseHold"
|
|
13
|
-
DATASET_PATH = "lex_glue"
|
|
15
|
+
DATASET_PATH = "coastalcph/lex_glue"
|
|
14
16
|
SAMPLE_SPLIT = "test"
|
|
15
17
|
FEWSHOT_SPLIT = "train"
|
|
16
18
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED
|
@@ -38,7 +38,7 @@ class TRUTHFULQA(BaseTask[str]):
|
|
|
38
38
|
"""TRUTHFULQA dataset: https://huggingface.co/datasets/truthfulqa/truthful_qa"""
|
|
39
39
|
|
|
40
40
|
NAME = "TruthfulQA"
|
|
41
|
-
DATASET_PATH = "truthful_qa"
|
|
41
|
+
DATASET_PATH = "truthfulqa/truthful_qa"
|
|
42
42
|
SAMPLE_SPLIT = "validation"
|
|
43
43
|
FEWSHOT_SPLIT = ""
|
|
44
44
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED
|
@@ -13,10 +13,10 @@ ANSWER_STR_TO_NUM = {"1": 0, "2": 1}
|
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class WINOGRANDE(BaseTask[str]):
|
|
16
|
-
"""WINOGRANDE dataset: https://huggingface.co/datasets/winogrande"""
|
|
16
|
+
"""WINOGRANDE dataset: https://huggingface.co/datasets/allenai/winogrande"""
|
|
17
17
|
|
|
18
18
|
NAME = "Winogrande"
|
|
19
|
-
DATASET_PATH = "winogrande"
|
|
19
|
+
DATASET_PATH = "allenai/winogrande"
|
|
20
20
|
SAMPLE_SPLIT = "validation"
|
|
21
21
|
FEWSHOT_SPLIT = "train"
|
|
22
22
|
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/external/ifeval_impl/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/aidanbench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/bleu.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/chrf.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/comet.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/csv_format.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/cwe_accuracy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/json_format.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/repetition.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_1.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_2.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/rouge_l.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/completion/text_counter.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/efficiency/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/language.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/graders/models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_coherence.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_refusal.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/llm/llm_judge_sql.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/base.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/dcs.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/metrics/loglikelihood/ternary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/result_processors/hf_uploader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/__init__.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/aidanbench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/arc_fi.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/belebele.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/bigcodebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/chembench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/flores200.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/flores_plus.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/hellaswag.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/hellaswag_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/humaneval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/ifeval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/include.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/infinitebench.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu_de.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/mmlu_pro.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/openbookqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/quality.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/struct_eval.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/tablebench.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/triviaqa.py
RENAMED
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/winogender.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/tasks/benchmarks/zero_scrolls.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_framework-0.2.9 → eval_framework-0.2.11}/src/eval_framework/utils/generate_task_docs.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|