eval-framework 0.2.8.tar.gz → 0.2.10.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_framework-0.2.8 → eval_framework-0.2.10}/PKG-INFO +1 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/pyproject.toml +1 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/context/determined.py +3 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/context/eval.py +2 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/context/local.py +1 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/response_generator.py +25 -6
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/run.py +8 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/arc.py +1 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/casehold.py +3 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/gsm8k.py +1 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/math_reasoning.py +20 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/truthfulqa.py +1 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/winogrande.py +2 -2
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/eval_config.py +3 -1
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/task_names.py +1 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/LICENSE +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/README.md +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/base_config.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/context/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/evaluation_generator.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/exceptions.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/external/ifeval_impl/README.md +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/external/ifeval_impl/instructions.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/external/ifeval_impl/instructions_registry.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/external/ifeval_impl/instructions_util.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/external/ifeval_impl/utils.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/aleph_alpha.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/base.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/huggingface.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/mistral.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/models.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/openai.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/llm/vllm.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/logger.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/main.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/base.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/accuracy_completion.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/aidanbench.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/bleu.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/chrf.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/code_assertion.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/code_execution_pass_at_one.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/comet.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/concordance_index.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/csv_format.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/cwe_accuracy.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/exponential_similarity.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/f1.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/format_checker.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/grid_difference.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/ifeval.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/json_format.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/language_checker.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/length_control.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/math_reasoning_completion.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/niah_accuracy.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/placeholder_checker.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/repetition.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/rouge_1.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/rouge_2.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/rouge_geometric_mean.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/rouge_l.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/struct_eval_metrics.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/ter.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/completion/text_counter.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/efficiency/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/efficiency/bytes_per_sequence_position.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/base.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/chatbot_style_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/coherence_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/comparison_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/conciseness_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/contains_names_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/format_correctness_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/instruction_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/language.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/long_context_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/models.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/refusal_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/sql_quality_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_chatbot_style.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_coherence.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_completion_accuracy.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_conciseness.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_contains_names.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_format_correctness.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_instruction.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_mtbench_pair.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_mtbench_single.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_refusal.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_sql.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/llm_judge_world_knowledge.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/llm/utils.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/base.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/dcs.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/probability_mass.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/metrics/loglikelihood/ternary.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/py.typed +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/result_processors/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/result_processors/base.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/result_processors/hf_uploader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/result_processors/result_processor.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/result_processors/wandb_uploader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/run_direct.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/shared/types.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/base.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/aidanbench.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/arc_de.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/arc_fi.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/belebele.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/bigcodebench.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/chembench.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/copa.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/duc.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/flores200.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/flores_plus.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/gpqa.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/hellaswag.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/hellaswag_de.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/humaneval.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/ifeval.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/include.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/infinitebench.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/mbpp.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/mmlu.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/mmlu_de.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/mmlu_pro.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/mmmlu.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/openbookqa.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/pawsx.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/piqa.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/quality.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/sciq.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/sphyr.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/squad.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/struct_eval.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/tablebench.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/triviaqa.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/winogender.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/winox.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/wmt.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/zero_scrolls.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/perturbation.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/registry.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/task_loader.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/utils.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/constants.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/file_ops.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/generate_task_docs.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/helpers.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/logging.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/packaging.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/utils/tqdm_handler.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/template_formatting/README.md +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/template_formatting/__init__.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/template_formatting/formatter.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/template_formatting/mistral_formatter.py +0 -0
- {eval_framework-0.2.8 → eval_framework-0.2.10}/src/template_formatting/py.typed +0 -0
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/context/determined.py
RENAMED

@@ -31,6 +31,7 @@ class TaskArgs(BaseModel):
     task_subjects: list[str] | None = None
     hf_revision: str | None = None
     perturbation_config: PerturbationConfig | None = None
+    repeats: int | None = None
 
 
 class Hyperparameters(BaseModel):

@@ -110,6 +111,7 @@ class DeterminedContext(EvalContext):
            "judge_model_name",
            "judge_model_args",
            "perturbation_config",
+           "repeats",
        ]:
            val_cli = getattr(self, name, None)
            val_hparams = getattr(self.hparams.task_args, name, None)

@@ -152,6 +154,7 @@ class DeterminedContext(EvalContext):
            randomize_judge_order=self.randomize_judge_order,
            delete_output_dir_after_upload=self.hparams.delete_output_dir_after_upload
            or self.delete_output_dir_after_upload,
+           repeats=self.hparams.task_args.repeats or self.repeats,
        )
 
        return self
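The merge in the last hunk (`repeats=self.hparams.task_args.repeats or self.repeats`) gives the hyperparameter value precedence over the CLI-level value, with `or` falling through on `None`. A minimal sketch of that precedence rule; the function and argument names below are illustrative stand-ins, not framework API:

def merge_repeats(hparams_repeats: int | None, cli_repeats: int | None) -> int | None:
    # `or` falls through on None, so a configured hparams value wins
    # and the CLI value is only the fallback.
    return hparams_repeats or cli_repeats

assert merge_repeats(5, 3) == 5            # hparams value wins
assert merge_repeats(None, 3) == 3         # fall back to the CLI value
assert merge_repeats(None, None) is None   # neither set; EvalConfig later defaults to 1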
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/context/eval.py
RENAMED

@@ -75,6 +75,7 @@ class EvalContext(AbstractContextManager):
         perturbation_seed: int | None = None,
         randomize_judge_order: bool = False,
         delete_output_dir_after_upload: bool | None = None,
+        repeats: int | None = None,
     ) -> None:
         self.llm_name = llm_name
         self.models_path = models_path

@@ -99,7 +100,7 @@ class EvalContext(AbstractContextManager):
         self.description = description
         self.randomize_judge_order = randomize_judge_order
         self.delete_output_dir_after_upload = delete_output_dir_after_upload
-
+        self.repeats = repeats
         if perturbation_type or perturbation_probability is not None:
             perturbation = {
                 "type": perturbation_type,
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/response_generator.py
RENAMED

@@ -1,10 +1,9 @@
 import logging
 import time
 import traceback
-from collections.abc import Callable
+from collections.abc import Callable, Iterable
 from datetime import UTC, datetime
 from functools import partial
-from typing import Any
 
 from eval_framework.tasks.registry import get_task
 

@@ -14,6 +13,8 @@ except ImportError:
     get_cluster_info = None  # type: ignore[assignment]
 
 
+from typing import Any
+
 from tqdm import tqdm
 
 from eval_framework import __version__ as eval_framework_version

@@ -234,18 +235,21 @@ class ResponseGenerator:
         # If samples_batch_size = 1, samples are run sequentially; in any case, we return here after finishing each
         # individual batch to honor preemption requests and save cached results.
         samples_batch_size = self.config.batch_size
+        repeats = self.config.repeats
 
         # Calculate total samples for progress bar - use num_samples or iterate to count
-
-        if total_num_samples is None:
+        if self.num_samples is None:
             # Count samples by iterating (this might be expensive for large datasets)
-            total_num_samples = sum(1 for _ in self.task.iterate_samples(None))
+            total_num_samples = sum(1 for _ in self.task.iterate_samples(None)) * repeats
+        else:
+            total_num_samples = self.num_samples * repeats
 
         samples_batch: list[Sample] = []
         with tqdm(
             total=total_num_samples, desc=f"Processing {self.response_type.value}", disable=get_disable_bar_flag()
         ) as pbar:
-            for i, sample in enumerate(self.task.iterate_samples(self.num_samples)):
+            samples = self.task.iterate_samples(self.num_samples)
+            for i, sample in enumerate(repeat_samples(samples, repeats)):
                 subject = f" - Subject: {sample.subject}"
                 sample_index = i + 1

@@ -330,6 +334,7 @@ class ResponseGenerator:
             "llm_name",
             "llm_args",
             "perturbation_config",
+            "repeats",
         ]
         for key in keys:
             if loaded_metadata[key] != current_metadata[key]:

@@ -349,3 +354,17 @@ class ResponseGenerator:
         logger.info("Completions generated and saved.")
 
         return responses, preempted
+
+
+def repeat_samples(samples: Iterable[Sample], repeats: int) -> Iterable[Sample]:
+    """Flatten repeats into a single stream of samples.
+
+    After expansion original sample indices do not point to the same sample anymore. The
+    original sample can be recovered by `original_index = expanded_index // repeats`.
+    """
+    for sample in samples:
+        base_id = sample.id * repeats
+        for repeat_idx in range(repeats):
+            repeated_sample = sample.model_copy()
+            repeated_sample.id = base_id + repeat_idx
+            yield repeated_sample
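The id arithmetic in `repeat_samples` keeps expanded ids unique and reversible: original id n owns the block n*repeats .. n*repeats + repeats - 1, so `expanded_id // repeats` recovers it. A self-contained sketch of the same scheme, using a plain dataclass as a stand-in for the framework's pydantic `Sample` model (so `model_copy()` becomes `dataclasses.replace`):

from collections.abc import Iterable
from dataclasses import dataclass, replace

@dataclass
class Sample:  # illustrative stand-in for eval_framework's Sample model
    id: int
    prompt: str

def repeat_samples(samples: Iterable[Sample], repeats: int) -> Iterable[Sample]:
    # Same scheme as the diff: id n expands to n*repeats + 0 .. repeats-1.
    for sample in samples:
        base_id = sample.id * repeats
        for repeat_idx in range(repeats):
            yield replace(sample, id=base_id + repeat_idx)

expanded = list(repeat_samples([Sample(0, "a"), Sample(1, "b")], repeats=3))
assert [s.id for s in expanded] == [0, 1, 2, 3, 4, 5]
# The original sample is recoverable from the expanded id alone:
assert all(s.id // 3 == (0 if s.prompt == "a" else 1) for s in expanded)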
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/run.py
RENAMED

@@ -77,6 +77,13 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--num-fewshot", type=int, required=False, default=0, help="The number of fewshot examples to use."
     )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        required=False,
+        default=1,
+        help="The number of times to repeat each sample in the evaluation.",
+    )
     parser.add_argument("--task-name", type=str, required=False, help="The name of the task to evaluate.")
     parser.add_argument(
         "--randomize-judge-order",

@@ -319,6 +326,7 @@ def run_with_kwargs(kwargs: dict) -> None:
         num_samples=kwargs["num_samples"],
         max_tokens=kwargs["max_tokens"],
         num_fewshot=kwargs["num_fewshot"],
+        repeats=kwargs["repeats"],
         task_name=kwargs["task_name"],
         task_subjects=kwargs["task_subjects"],
         hf_revision=kwargs["hf_revision"],
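The new `--repeats` flag defaults to 1, so existing invocations are unaffected. A quick check of the flag's behavior using only stock argparse (the parser below is a throwaway reproduction of the added argument, not the framework's full parser):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--repeats",
    type=int,
    required=False,
    default=1,
    help="The number of times to repeat each sample in the evaluation.",
)

assert parser.parse_args([]).repeats == 1              # default: no repetition
assert parser.parse_args(["--repeats", "8"]).repeats == 8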
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/arc.py
RENAMED

@@ -15,7 +15,7 @@ class ARC(BaseTask[str]):
     """ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc"""
 
     NAME = "ARC"
-    DATASET_PATH = "ai2_arc"
+    DATASET_PATH = "allenai/ai2_arc"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/casehold.py
RENAMED

@@ -9,8 +9,10 @@ from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Languag
 
 
 class CASEHOLD(BaseTask[str]):
+    """CASEHOLD dataset: https://huggingface.co/datasets/coastalcph/lex_glue"""
+
     NAME = "CaseHold"
-    DATASET_PATH = "lex_glue"
+    DATASET_PATH = "coastalcph/lex_glue"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/math_reasoning.py
RENAMED

@@ -355,6 +355,25 @@ class AIME2024(MATHReasoning):
         return item["answer"].lstrip("0")  # valid answers in this dataset range from 0-999 and have leading zeros
 
 
+class AIME2025(AIME2024):
+    """AIME 2025 dataset: https://huggingface.co/datasets/math-ai/aime25
+
+    This dataset contains a single test split of 30 questions.
+    Data contains
+    problem | answer | id
+
+    pass@1 evaluation
+    """
+
+    NAME = "AIME2025"
+    DATASET_PATH = "math-ai/aime25"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"
+
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        return item["answer"]
+
+
 class MATH500(MATHReasoning):
     """MATH500 dataset: https://huggingface.co/datasets/HuggingFaceH4/MATH-500
 

@@ -520,7 +539,7 @@ class GSM8KReasoning(MATHReasoning):
     """
 
     NAME = "GSM8KReasoning"
-    DATASET_PATH = "gsm8k"
+    DATASET_PATH = "openai/gsm8k"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.COMPLETION
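AIME-style sets have only 30 problems, which is what the new `repeats` option (see eval_config.py below) is aimed at: with stochastic decoding, averaging pass@1 over repeated trials tightens the accuracy estimate. A hedged back-of-envelope, assuming every (problem, repeat) trial is an independent Bernoulli draw (repeats of the same problem are correlated in practice, so the true gain is smaller):

import math

def standard_error(p: float, n_problems: int, repeats: int) -> float:
    # Standard error of mean accuracy over n_problems * repeats independent trials.
    trials = n_problems * repeats
    return math.sqrt(p * (1 - p) / trials)

print(standard_error(0.5, 30, 1))   # ~0.091 for a single pass over 30 problems
print(standard_error(0.5, 30, 8))   # ~0.032 with 8 repeats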
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/truthfulqa.py
RENAMED

@@ -38,7 +38,7 @@ class TRUTHFULQA(BaseTask[str]):
     """TRUTHFULQA dataset: https://huggingface.co/datasets/truthfulqa/truthful_qa"""
 
     NAME = "TruthfulQA"
-    DATASET_PATH = "truthful_qa"
+    DATASET_PATH = "truthfulqa/truthful_qa"
     SAMPLE_SPLIT = "validation"
     FEWSHOT_SPLIT = ""
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/benchmarks/winogrande.py
RENAMED

@@ -13,10 +13,10 @@ ANSWER_STR_TO_NUM = {"1": 0, "2": 1}
 
 
 class WINOGRANDE(BaseTask[str]):
-    """WINOGRANDE dataset: https://huggingface.co/datasets/winogrande"""
+    """WINOGRANDE dataset: https://huggingface.co/datasets/allenai/winogrande"""
 
     NAME = "Winogrande"
-    DATASET_PATH = "winogrande"
+    DATASET_PATH = "allenai/winogrande"
     SAMPLE_SPLIT = "validation"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/eval_config.py
RENAMED

@@ -54,7 +54,9 @@ class EvalConfig(BaseConfig):
     save_intermediate_results: Annotated[bool, BeforeValidator(lambda v: True if v is None else v)] = True
     save_logs: Annotated[bool, BeforeValidator(lambda v: True if v is None else v)] = True
     delete_output_dir_after_upload: Annotated[bool, BeforeValidator(lambda v: False if v is None else v)] = False
-
+    # how many times to repeat a single sample
+    # can be used to reduce variance of tasks with low number of samples, e.g. AIME24
+    repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1
     # Adding a new member? Remember to update KEYS_UNRELATED_TO_RESULTS if it doesn't impact eval results.
 
     @property
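The `repeats` field pairs a `BeforeValidator` (coercing an explicit `None` back to the default of 1) with a `Field(ge=1)` constraint. A minimal pydantic v2 sketch of the same pattern, outside the framework's `BaseConfig`:

from typing import Annotated
from pydantic import BaseModel, BeforeValidator, Field

class RepeatsConfig(BaseModel):  # stand-in for EvalConfig's BaseConfig parent
    # None is coerced to 1 before validation; anything below 1 is rejected.
    repeats: Annotated[int, BeforeValidator(lambda v: 1 if v is None else v), Field(ge=1)] = 1

assert RepeatsConfig().repeats == 1
assert RepeatsConfig(repeats=None).repeats == 1   # validator maps None -> 1
assert RepeatsConfig(repeats=4).repeats == 4
# RepeatsConfig(repeats=0) would raise a ValidationError (violates ge=1)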
{eval_framework-0.2.8 → eval_framework-0.2.10}/src/eval_framework/tasks/task_names.py
RENAMED

@@ -18,6 +18,7 @@ class TaskNameEnum(Enum):
 def register_all_tasks() -> None:
     """Register all the benchmark tasks with the eval framework."""
     register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2024")
+    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025")
     register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC")
     register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK")
     register_lazy_task("eval_framework.tasks.benchmarks.arc_de.ARC_DE")