eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import importlib.util
|
|
3
|
+
import re
|
|
4
|
+
from collections.abc import Generator, Iterator, Sequence
|
|
5
|
+
from typing import Annotated, Any
|
|
6
|
+
|
|
7
|
+
import pydantic
|
|
8
|
+
from pydantic import AfterValidator
|
|
9
|
+
|
|
10
|
+
from eval_framework.tasks.base import BaseTask
|
|
11
|
+
from eval_framework.utils.packaging import is_extra_installed, validate_package_extras
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"register_task",
|
|
15
|
+
"register_lazy_task",
|
|
16
|
+
"Registry",
|
|
17
|
+
"with_registry",
|
|
18
|
+
"get_task",
|
|
19
|
+
"registered_tasks_iter",
|
|
20
|
+
"is_registered",
|
|
21
|
+
"validate_task_name",
|
|
22
|
+
"registered_task_names",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def validate_import_path(import_path: str) -> str:
    """Validate that *import_path* refers to a locatable module.

    Args:
        import_path: Dotted module path, e.g. ``eval_framework.tasks.base``.

    Returns:
        The unchanged ``import_path`` if a module spec can be found for it.

    Raises:
        ValueError: if the module cannot be located. Note that
            ``importlib.util.find_spec`` itself raises ``ModuleNotFoundError``
            when a *parent* package of a dotted path does not exist (it only
            returns ``None`` for a missing leaf module); that case is converted
            to ``ValueError`` here so callers always see a single failure type.
    """
    try:
        spec = importlib.util.find_spec(import_path)
    except ModuleNotFoundError as err:
        raise ValueError(f"Invalid import path: {import_path}") from err
    if spec is None:
        raise ValueError(f"Invalid import path: {import_path}")
    return import_path
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TaskPlaceholder(pydantic.BaseModel, extra="forbid", frozen=True):
    """Deferred reference to a task class.

    Records where a task class lives and which optional package extras must be
    installed before it can be imported; `load` performs the actual import.
    """

    name: Annotated[
        str,
        "The name of the Task class that we want to import",
    ]
    module: Annotated[
        str,
        "The module from where to import the task",
        validate_import_path,
    ]
    extras: Annotated[
        tuple[str, ...],
        "Extra dependencies that are required for the task",
        AfterValidator(validate_package_extras),
    ] = ()

    def load(self) -> type[BaseTask]:
        """Import the referenced module and return the task class.

        Raises:
            ImportError: if any required extra is not installed.
        """
        missing = next((extra for extra in self.extras if not is_extra_installed(extra)), None)
        if missing is not None:
            raise ImportError(f"The required package eval_framework[{missing}] is not installed.")
        return getattr(importlib.import_module(self.module), self.name)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class Registry:
    """A registry for tasks with support for lazy loading.

    Task names are hashed based on the upper-case name, to avoid issues with
    ambiguous naming.
    """

    def __init__(self) -> None:
        # TODO: Lookup only with upper names
        # Maps normalized key -> (original task name, task class or lazy placeholder).
        self._registry: dict[str, tuple[str, type[BaseTask] | TaskPlaceholder]] = dict()

    def __iter__(self) -> Iterator[str]:
        """Yield the original (non-normalized) names of all registered tasks."""
        for name, _ in self._registry.values():
            yield name

    @staticmethod
    def _task_key(name: str, /) -> str:
        """Normalize a task name: drop whitespace/hyphens/underscores, upper-case.

        Raises:
            ValueError: if the normalized name is not purely alphanumeric.
        """
        name = re.sub(r"[\s\-_]+", "", name).upper()
        if not name.isalnum():
            raise ValueError(
                f"Task name '{name}' contains invalid characters. Only alphanumeric characters are allowed."
            )
        return name

    def __contains__(self, name: str) -> bool:
        """Case/separator-insensitive membership test."""
        task_key = self._task_key(name)
        return task_key in self._registry

    def __getitem__(self, name: str, /) -> type[BaseTask]:
        """Return the task class for *name*, importing it if lazily registered."""
        task_key = self._task_key(name)
        try:
            name, task = self._registry[task_key]
        except KeyError:
            # `from None`: the internal dict miss would only clutter the traceback.
            raise KeyError(f"Task not found: {name}") from None

        if isinstance(task, TaskPlaceholder):
            # Resolve the placeholder once, then cache the concrete class.
            task = task.load()
            self._registry[task_key] = (name, task)
        return task

    def add(self, task: type[BaseTask]) -> None:
        """Register *task* under its NAME; silently overwrites an existing entry."""
        task_key = self._task_key(task.NAME)
        self._registry[task_key] = (task.NAME, task)

    def __setitem__(self, name: str, task: type[BaseTask] | TaskPlaceholder) -> None:
        """Register *task* under *name*; duplicates (case-insensitive) are rejected."""
        task_key = self._task_key(name)
        if task_key in self._registry:
            raise ValueError(f"Cannot register duplicate task with key: {task_key}")

        self._registry[task_key] = (name, task)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Process-wide default registry; temporarily swappable via `with_registry`.
_REGISTRY = Registry()
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@contextlib.contextmanager
def with_registry(registry: Registry) -> Generator[None, Any, None]:
    """Temporarily install *registry* as the module-level task registry.

    The previous registry is restored on exit, even if the body raises.
    """
    global _REGISTRY
    previous = _REGISTRY
    _REGISTRY = registry
    try:
        yield
    finally:
        _REGISTRY = previous
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def registered_task_names() -> list[str]:
    """List the (original-case) names of every registered task."""
    return [task_name for task_name in _REGISTRY]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def is_registered(name: str, /) -> bool:
    """Check whether *name* resolves to a registered task (case-insensitive)."""
    return name in _REGISTRY
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def validate_task_name(name: str) -> str:
    """Pydantic-style validator: echo *name* back, or raise if it is unknown."""
    if is_registered(name):
        return name
    raise ValueError(f"Task not registered: {name}")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def registered_tasks_iter() -> Iterator[tuple[str, type[BaseTask]]]:
    """Yield ``(name, task_class)`` pairs for every registered task.

    Note: This method will import any lazily registered task.
    """
    yield from ((task_name, get_task(task_name)) for task_name in registered_task_names())
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_task(name: str, /) -> type[BaseTask]:
    """Look up a registered task class by name (case-insensitive).

    Note: This method will import any lazily registered task.
    """
    task_class = _REGISTRY[name]
    return task_class
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def register_task(task: type[BaseTask]) -> str:
    """Register *task* in the global registry and return its name.

    The class name (``task.__name__``) is used as the task name.
    """
    if not issubclass(task, BaseTask):
        raise ValueError(f"Can only register subclasses of BaseTask, got {task}")
    task_name = task.__name__
    _REGISTRY[task_name] = task
    return task_name
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def register_lazy_task(class_path: str, /, *, extras: Sequence[str] = ()) -> None:
    """Register a task without importing it.

    Lazily register a task without importing the module.

    Args:
        class_path: The full path to the task class. For example,
            `eval_framework.tasks.benchmarks.truthfulqa.TRUTHFULQA`.
        extras: Any extra dependencies of `eval_framework` that need to be installed for this task.
    """
    if isinstance(extras, str):
        # Be forgiving when a single extra is passed as a bare string.
        extras = [extras]
    if "." not in class_path:
        # Fixed error message: it previously ended in a stray "`): " fragment.
        raise ValueError(
            f"Invalid class path `{class_path}`. This needs to be a global path like "
            "`eval_framework.tasks.benchmarks.truthfulqa.TRUTHFULQA`."
        )

    base_module, class_name = class_path.rsplit(".", maxsplit=1)
    placeholder = TaskPlaceholder(name=class_name, module=base_module, extras=extras)
    # The bare class name is the registration key; the module is imported on first access.
    _REGISTRY[class_name] = placeholder
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import inspect
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from types import ModuleType
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from eval_framework.tasks.base import BaseTask
|
|
11
|
+
from eval_framework.tasks.registry import is_registered, register_task
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def find_all_python_files(*module_paths: str | os.PathLike) -> set[Path]:
|
|
17
|
+
"""Recursively walk through all paths and return all Python files."""
|
|
18
|
+
all_files: set[Path] = set()
|
|
19
|
+
for path in module_paths:
|
|
20
|
+
path = Path(path).resolve()
|
|
21
|
+
|
|
22
|
+
if not path.exists():
|
|
23
|
+
raise FileNotFoundError(f"[User Task Loader] Path does not exist: {path}")
|
|
24
|
+
if path.is_dir():
|
|
25
|
+
all_files.update(path.glob("**/*.py"))
|
|
26
|
+
elif path.is_file():
|
|
27
|
+
if path.suffix != ".py":
|
|
28
|
+
raise ValueError(f"The provided path {path} is not a Python file.")
|
|
29
|
+
all_files.add(path)
|
|
30
|
+
else:
|
|
31
|
+
raise ValueError(f"Path is not a .py file or directory: {path}")
|
|
32
|
+
return all_files
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def import_file(f: str | os.PathLike, /) -> Any:
|
|
36
|
+
"""Import a file as a Python module."""
|
|
37
|
+
file_path = Path(f)
|
|
38
|
+
try:
|
|
39
|
+
spec = importlib.util.spec_from_file_location("user_task_module", file_path)
|
|
40
|
+
if spec is None or spec.loader is None:
|
|
41
|
+
raise RuntimeError(f"Could not create a module spec for {file_path}")
|
|
42
|
+
user_module: ModuleType = importlib.util.module_from_spec(spec)
|
|
43
|
+
spec.loader.exec_module(user_module)
|
|
44
|
+
except Exception as e:
|
|
45
|
+
raise ImportError(f"Failed to import {file_path}: {e}") from e
|
|
46
|
+
return user_module
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_extra_tasks(module_paths: Sequence[str | os.PathLike]) -> None:
    """Dynamically load and register user-defined tasks from a list of files or directories.

    Each .py file found will be imported, and any BaseTask subclass with a
    ``NAME`` attribute will be registered for use by name.
    Provides clear error messages for missing/invalid files or import errors.

    Raises:
        ValueError: if a user-defined task duplicates an already registered
            name (built-in tasks re-imported by user modules are skipped).
    """
    for file_path in find_all_python_files(*module_paths):
        user_module = import_file(file_path)

        # Guard clauses replace the original nested if/else pyramid; the
        # redundant `getattr(user_module, name)` (identical to `obj`) is gone.
        for _, obj in inspect.getmembers(user_module):
            if not (inspect.isclass(obj) and issubclass(obj, BaseTask) and obj is not BaseTask):
                continue
            if not hasattr(obj, "NAME"):
                logger.info(f"[User Task Loader] Skipping {obj.__module__} - no NAME attribute present.")
                continue
            if is_registered(obj.NAME):
                # Two classes share the same NAME attribute.
                logger.info(obj.__module__)
                # Built-in tasks can legitimately reappear here when a user
                # module imports them (e.g. to derive a new task); only a
                # user-defined duplicate is an error.
                if "eval_framework.tasks.benchmarks" not in obj.__module__:
                    raise ValueError(f"Duplicate user task name found (case-insensitive): {obj.NAME}")
            else:
                register_task(obj)
                logger.info(f"[User Task Loader] Registered task: {obj.NAME}")
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import random
|
|
3
|
+
import time
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
from eval_framework.tasks.base import BaseTask
|
|
7
|
+
from eval_framework.tasks.registry import register_lazy_task, registered_tasks_iter
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TaskNameEnum(Enum):
    # Base Enum whose members are expected to hold task classes as values.
    # Overriding `value` here only narrows the declared return type for type
    # checkers; `super().value` still delegates to Enum's normal member-value
    # lookup, so runtime behavior is unchanged.
    @property
    def value(self) -> type[BaseTask]:
        return super().value
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def register_all_tasks() -> None:
    """Register all the benchmark tasks with the eval framework.

    Each task is registered lazily by dotted class path, so no benchmark
    module is imported until the task is actually requested. Tasks requiring
    optional extras declare them via `extras` (e.g. FloresPlus needs `comet`).
    """
    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2024")
    register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC")
    register_lazy_task("eval_framework.tasks.benchmarks.arc_de.ARC_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.ARC_EU20_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.ARC_EU20_FR")
    register_lazy_task("eval_framework.tasks.benchmarks.arc_fi.ARC_FI")
    register_lazy_task("eval_framework.tasks.benchmarks.belebele.BELEBELE")
    register_lazy_task("eval_framework.tasks.benchmarks.bigcodebench.BigCodeBench")
    register_lazy_task("eval_framework.tasks.benchmarks.bigcodebench.BigCodeBenchInstruct")
    register_lazy_task("eval_framework.tasks.benchmarks.bigcodebench.BigCodeBenchHard")
    register_lazy_task("eval_framework.tasks.benchmarks.bigcodebench.BigCodeBenchHardInstruct")
    register_lazy_task("eval_framework.tasks.benchmarks.casehold.CASEHOLD")
    register_lazy_task("eval_framework.tasks.benchmarks.chembench.ChemBench")
    register_lazy_task("eval_framework.tasks.benchmarks.copa.COPA")
    register_lazy_task("eval_framework.tasks.benchmarks.duc.DUC_ABSTRACTIVE")
    register_lazy_task("eval_framework.tasks.benchmarks.duc.DUC_EXTRACTIVE")
    register_lazy_task("eval_framework.tasks.benchmarks.flores200.Flores200")
    # FloresPlus scores translations with COMET, hence the extra dependency.
    register_lazy_task("eval_framework.tasks.benchmarks.flores_plus.FloresPlus", extras=["comet"])
    register_lazy_task("eval_framework.tasks.benchmarks.gpqa.GPQA")
    register_lazy_task("eval_framework.tasks.benchmarks.gpqa.GPQA_COT")
    register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8K")
    register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8KLlamaVersion")
    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.GSM8KReasoning")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.GSM8K_EU20_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.GSM8K_EU20_FR")
    register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG")
    register_lazy_task("eval_framework.tasks.benchmarks.hellaswag_de.HELLASWAG_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_FR")
    register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval")
    register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct")
    register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval")
    register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe")
    register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalFiSv")
    register_lazy_task("eval_framework.tasks.benchmarks.include.INCLUDE")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_CodeDebug")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_CodeRun")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_EnDia")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_EnMC")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_EnQA")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_MathFind")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_RetrieveKV2")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_RetrieveNumber")
    register_lazy_task("eval_framework.tasks.benchmarks.infinitebench.InfiniteBench_RetrievePassKey1")
    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATH")
    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHLvl5")
    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATH500")
    register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP")
    register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_SANITIZED")
    register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS")
    register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED")
    register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU")
    register_lazy_task("eval_framework.tasks.benchmarks.mmlu.FullTextMMLU")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.MMLU_EU20_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.MMLU_EU20_FR")
    register_lazy_task("eval_framework.tasks.benchmarks.mmlu_de.MMLU_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.mmlu_pro.MMLU_PRO")
    register_lazy_task("eval_framework.tasks.benchmarks.mmlu_pro.MMLU_PRO_COT")
    register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_COT")
    register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU")
    register_lazy_task("eval_framework.tasks.benchmarks.mmmlu.MMMLU_GERMAN_COT")
    register_lazy_task("eval_framework.tasks.benchmarks.pawsx.PAWSX")
    register_lazy_task("eval_framework.tasks.benchmarks.piqa.PIQA")
    register_lazy_task("eval_framework.tasks.benchmarks.openbookqa.OPENBOOKQA")
    register_lazy_task("eval_framework.tasks.benchmarks.sciq.SCIQ")
    register_lazy_task("eval_framework.tasks.benchmarks.squad.SQUAD")
    register_lazy_task("eval_framework.tasks.benchmarks.squad.SQUAD2")
    register_lazy_task("eval_framework.tasks.benchmarks.tablebench.TableBench")
    register_lazy_task("eval_framework.tasks.benchmarks.triviaqa.TRIVIAQA")
    register_lazy_task("eval_framework.tasks.benchmarks.truthfulqa.TRUTHFULQA")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.TRUTHFULQA_EU20_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.TRUTHFULQA_EU20_FR")
    register_lazy_task("eval_framework.tasks.benchmarks.winogender.WINOGENDER")
    register_lazy_task("eval_framework.tasks.benchmarks.winogrande.WINOGRANDE")
    register_lazy_task("eval_framework.tasks.benchmarks.winox.WINOX_DE")
    register_lazy_task("eval_framework.tasks.benchmarks.winox.WINOX_FR")
    register_lazy_task("eval_framework.tasks.benchmarks.wmt.WMT14")
    register_lazy_task("eval_framework.tasks.benchmarks.wmt.WMT16")
    register_lazy_task("eval_framework.tasks.benchmarks.wmt.WMT20")
    register_lazy_task("eval_framework.tasks.benchmarks.wmt.WMT14_INSTRUCT")
    register_lazy_task("eval_framework.tasks.benchmarks.wmt.WMT16_INSTRUCT")
    register_lazy_task("eval_framework.tasks.benchmarks.wmt.WMT20_INSTRUCT")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_QUALITY")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_SQUALITY")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_QMSUM")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_QASPER")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_GOV_REPORT")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_NARRATIVEQA")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_MUSIQUE")
    register_lazy_task("eval_framework.tasks.benchmarks.zero_scrolls.ZERO_SCROLLS_SPACE_DIGEST")
    register_lazy_task("eval_framework.tasks.benchmarks.quality.QUALITY")
    register_lazy_task("eval_framework.tasks.benchmarks.sphyr.SPHYR")
    register_lazy_task("eval_framework.tasks.benchmarks.struct_eval.StructEval")
    register_lazy_task("eval_framework.tasks.benchmarks.struct_eval.RenderableStructEval")

    # NOTE(review): the diff rendering lost indentation here; the blank-line
    # pattern suggests this block is inside the function — confirm upstream.
    try:
        # Importing the companion registers the additional tasks from the module.
        # This is mostly for convenience for internal use-cases
        import eval_framework_companion  # noqa
    except ImportError:
        pass
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def make_sure_all_hf_datasets_are_in_cache() -> None:
    """Pull one sample from every registered task so its dataset lands in the cache.

    Each task is retried up to 10 times with a short randomized backoff.
    Failures are logged and the run continues with the next task.
    """
    for task_name, task_class in registered_tasks_iter():
        task = task_class()
        for attempt in range(10):
            try:
                for _ in task.iterate_samples(num_samples=1):
                    pass
                break
            except Exception as e:
                logger.info(f"{e} Will retry loading {task_name} in a few seconds, attempt #{attempt + 1}.")
                time.sleep(random.randint(1, 5))
        else:
            # Bug fix: previously a task that failed all attempts still fell
            # through to the "Processed" log line, reporting false success.
            logger.warning(f"Giving up on {task_name} after 10 failed attempts.")
            continue
        logger.info(f"Processed {task_name}")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
if __name__ == "__main__":
    # Materializes every lazily registered task, so this may import a lot.
    all_tasks = list(registered_tasks_iter())
    print(all_tasks)
|