EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries, as they appear in those registries, and is provided for informational purposes only.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py
CHANGED
```diff
@@ -1,85 +1,61 @@
 """Data models used in EuroEval."""
 
+import collections.abc as c
 import json
 import pathlib
 import re
 import typing as t
+from copy import deepcopy
 from dataclasses import dataclass, field
 
 import pydantic
 import torch
 
-from .enums import Device,
-from .
+from .enums import Device, GenerativeType, ModelType, TaskGroup
+from .exceptions import InvalidBenchmark
+from .languages import (
+    ENGLISH,
+    EUROPEAN_PORTUGUESE,
+    NORWEGIAN,
+    NORWEGIAN_BOKMÅL,
+    NORWEGIAN_NYNORSK,
+    PORTUGUESE,
+    Language,
+)
+from .metrics.base import Metric
 from .types import ScoreDict
 from .utils import get_package_version
 
+if t.TYPE_CHECKING:
+    from .enums import InferenceBackend
+
 
 @dataclass
-class
-    """
+class PromptConfig:
+    """Configuration for task-specific prompting across languages.
+
+    Defines the prompt templates needed for evaluating a specific task in a given
+    language.
 
     Attributes:
-
-        The
-
-        The
-
-
-
-
+        default_prompt_prefix:
+            The default prefix to use in the few-shot prompt.
+        default_prompt_template:
+            The default template for the prompt to use when benchmarking the dataset
+            using few-shot evaluation.
+        default_instruction_prompt:
+            The default prompt to use when benchmarking the dataset using
+            instruction-based evaluation.
+        default_prompt_label_mapping:
+            The default mapping from the labels to another phrase which is used as a
+            substitute for the label in few-shot evaluation. If set to "auto", the
+            mapping will be set to a 1:1 mapping between the labels and themselves.
     """
 
-
-
-
-
-
-    def __hash__(self) -> int:
-        """Return a hash of the language."""
-        return hash(self.code)
-
-    @property
-    def and_separator(self) -> str:
-        """Get the word 'and' in the language.
-
-        Returns:
-            The word 'and' in the language.
-
-        Raises:
-            NotImplementedError:
-                If `and_separator` is `None`.
-        """
-        if not self._and_separator:
-            raise NotImplementedError(
-                f"Separator for the word 'and' has not been defined for {self.name}."
-            )
-        return self._and_separator
-
-    @and_separator.setter
-    def and_separator(self, value: str | None) -> None:
-        self._and_separator = value
-
-    @property
-    def or_separator(self) -> str:
-        """Get the word 'or' in the language.
-
-        Returns:
-            The word 'or' in the language.
-
-        Raises:
-            NotImplementedError:
-                If `or_separator` is `None`.
-        """
-        if not self._or_separator:
-            raise NotImplementedError(
-                f"Separator for the word 'or' has not been defined for {self.name}."
-            )
-        return self._or_separator
-
-    @or_separator.setter
-    def or_separator(self, value: str | None) -> None:
-        self._or_separator = value
+    default_prompt_prefix: str
+    default_prompt_template: str
+    default_instruction_prompt: str
+    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
 
 
 @dataclass
@@ -104,210 +80,68 @@ class Task:
             using few-shot evaluation.
         default_labels:
             The default labels for datasets using this task.
+        requires_zero_shot (optional):
+            Whether to only allow zero-shot evaluation for this task. If True, the
+            task will not be evaluated using few-shot examples.
+        uses_structured_output (optional):
+            Whether the task uses structured output. If True, the task will return
+            structured output (e.g., BIO tags for NER). Defaults to False.
+        uses_logprobs (optional):
+            Whether the task uses log probabilities. If True, the task will return
+            log probabilities for the generated tokens. Defaults to False.
+        requires_logprobs (optional):
+            Whether the task requires log probabilities. Implies `uses_logprobs`.
+        default_allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this task.
+            Defaults to all model types being allowed.
+        default_allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            task. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models.
+        default_allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """
 
+    model_config = pydantic.ConfigDict(
+        protected_namespaces=(), arbitrary_types_allowed=True
+    )
+
     name: str
     task_group: TaskGroup
-    template_dict: dict[
-    metrics:
+    template_dict: dict[Language, PromptConfig]
+    metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels:
+    default_labels: c.Sequence[str] | None
+    requires_zero_shot: bool = False
+    uses_structured_output: bool = False
+    uses_logprobs: bool = False
+    requires_logprobs: bool = False
+    default_allowed_model_types: c.Sequence[ModelType] = field(
+        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+    )
+    default_allowed_generative_types: c.Sequence[GenerativeType] = field(
+        default_factory=lambda: [
+            GenerativeType.BASE,
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        ]
+    )
+    default_allow_invalid_model_outputs: bool = True
+
+    def __post_init__(self) -> None:
+        """Post-initialisation checks."""
+        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs
 
     def __hash__(self) -> int:
         """Return a hash of the task."""
         return hash(self.name)
 
 
-@dataclass
-class BenchmarkConfig:
-    """General benchmarking configuration, across datasets and models.
-
-    Attributes:
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
-        tasks:
-            The tasks benchmark the model(s) on.
-        datasets:
-            The datasets to benchmark on.
-        batch_size:
-            The batch size to use.
-        raise_errors:
-            Whether to raise errors instead of skipping them.
-        cache_dir:
-            Directory to store cached models and datasets.
-        api_key:
-            The API key to use for a given inference API.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        progress_bar:
-            Whether to show a progress bar.
-        save_results:
-            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        device:
-            The device to use for benchmarking.
-        verbose:
-            Whether to print verbose output.
-        trust_remote_code:
-            Whether to trust remote code when loading models from the Hugging Face Hub.
-        clear_model_cache:
-            Whether to clear the model cache after benchmarking each model.
-        evaluate_test_split:
-            Whether to evaluate on the test split.
-        few_shot:
-            Whether to only evaluate the model using few-shot evaluation. Only relevant
-            if the model is generative.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-        only_allow_safetensors:
-            Whether to only allow models that use the safetensors format.
-    """
-
-    model_languages: list[Language]
-    dataset_languages: list[Language]
-    tasks: list[Task]
-    datasets: list[str]
-    batch_size: int
-    raise_errors: bool
-    cache_dir: str
-    api_key: str | None
-    force: bool
-    progress_bar: bool
-    save_results: bool
-    device: torch.device
-    verbose: bool
-    trust_remote_code: bool
-    clear_model_cache: bool
-    evaluate_test_split: bool
-    few_shot: bool
-    num_iterations: int
-    api_base: str | None
-    api_version: str | None
-    gpu_memory_utilization: float
-    debug: bool
-    run_with_cli: bool
-    only_allow_safetensors: bool
-
-
-class BenchmarkConfigParams(pydantic.BaseModel):
-    """The parameters for the benchmark configuration."""
-
-    model_config = pydantic.ConfigDict(protected_namespaces=())
-
-    progress_bar: bool
-    save_results: bool
-    task: str | list[str] | None
-    dataset: str | list[str] | None
-    language: str | list[str]
-    model_language: str | list[str] | None
-    dataset_language: str | list[str] | None
-    device: Device | None
-    batch_size: int
-    raise_errors: bool
-    cache_dir: str
-    api_key: str | None
-    force: bool
-    verbose: bool
-    trust_remote_code: bool
-    clear_model_cache: bool
-    evaluate_test_split: bool
-    few_shot: bool
-    num_iterations: int
-    api_base: str | None
-    api_version: str | None
-    gpu_memory_utilization: float
-    debug: bool
-    run_with_cli: bool
-    only_allow_safetensors: bool
-
-
-class BenchmarkResult(pydantic.BaseModel):
-    """A benchmark result."""
-
-    dataset: str
-    task: str
-    dataset_languages: list[str]
-    model: str
-    results: ScoreDict
-    num_model_parameters: int
-    max_sequence_length: int
-    vocabulary_size: int
-    merge: bool
-    generative: bool
-    generative_type: str | None
-    few_shot: bool
-    validation_split: bool
-    euroeval_version: str | None = get_package_version("euroeval")
-    transformers_version: str | None = get_package_version("transformers")
-    torch_version: str | None = get_package_version("torch")
-    vllm_version: str | None = get_package_version("vllm")
-    outlines_version: str | None = get_package_version("outlines")
-
-    @classmethod
-    def from_dict(cls, config: dict) -> "BenchmarkResult":
-        """Create a benchmark result from a dictionary.
-
-        Args:
-            config:
-                The configuration dictionary.
-
-        Returns:
-            The benchmark result.
-        """
-        # To be backwards compatible, we accept old results which changed the model
-        # name with parameters rather than adding them as explicit parameters
-        val_matches = re.search(r"\(.*val.*\)$", config["model"])
-        few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
-        zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
-        config["model"] = re.sub(
-            r"\(.*(few-shot|val).*\)$", "", config["model"]
-        ).strip()
-
-        if "merge" not in config:
-            config["merge"] = False
-        if "generative" not in config:
-            config["generative"] = (
-                few_shot_matches is not None or zero_shot_matches is not None
-            )
-        if "generative_type" not in config:
-            config["generative_type"] = None
-        if "few_shot" not in config:
-            config["few_shot"] = zero_shot_matches is None
-        if "validation_split" not in config:
-            config["validation_split"] = val_matches is not None
-
-        return cls(**config)
-
-    def append_to_results(self, results_path: pathlib.Path) -> None:
-        """Append the benchmark result to the results file.
-
-        Args:
-            results_path:
-                The path to the results file.
-        """
-        json_str = json.dumps(self.model_dump())
-        with results_path.open("a") as f:
-            f.write("\n" + json_str)
-
-
 @dataclass
 class DatasetConfig:
     """Configuration for a dataset.
@@ -318,8 +152,9 @@ class DatasetConfig:
         pretty_name:
             A longer prettier name for the dataset, which allows cases and spaces. Used
             for logging.
-
-            The Hugging Face ID
+        source:
+            The source of the dataset, which can be a Hugging Face ID or a dictionary
+            with keys "train", "val" and "test" mapping to local CSV file paths.
         task:
             The task of the dataset.
         languages:
@@ -356,63 +191,154 @@ class DatasetConfig:
            to a 1:1 mapping between the labels and themselves. If None then the mapping
            will be set to the default mapping for the task and language. Defaults to
            None.
+        _allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this dataset.
+            Defaults to the one for the task.
+        _allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            dataset. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models. Defaults to the one for
+            the task.
+        _allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for
+            generative models on classification tasks, where the model may generate an
+            output which is not one of the allowed labels. If True, the model output
+            will be mapped to the closest valid label. If False, the model output will
+            be considered incorrect and the evaluation will be aborted. Defaults to
+            the one for the task.
+        _logging_string (optional):
+            The string used to describe evaluation on the dataset in logging. If not
+            provided, a default string will be generated, based on the pretty name. Only
+            use this if the default string is not suitable.
+        splits (optional):
+            The names of the splits in the dataset. If not provided, defaults to
+            ["train", "val", "test"].
+        bootstrap_samples (optional):
+            Whether to bootstrap the dataset samples. Defaults to True.
         unofficial (optional):
             Whether the dataset is unofficial. Defaults to False.
     """
 
     name: str
     pretty_name: str
-
+    source: str | dict[str, str]
     task: Task
-    languages:
+    languages: c.Sequence[Language]
     _prompt_prefix: str | None = None
     _prompt_template: str | None = None
     _instruction_prompt: str | None = None
     _num_few_shot_examples: int | None = None
     _max_generated_tokens: int | None = None
-    _labels:
+    _labels: c.Sequence[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    _allowed_model_types: c.Sequence[ModelType] | None = None
+    _allowed_generative_types: c.Sequence[GenerativeType] | None = None
+    _allow_invalid_model_outputs: bool | None = None
+    _logging_string: str | None = None
+    splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
+    bootstrap_samples: bool = True
     unofficial: bool = False
 
+    @property
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
+
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
+
+    @property
+    def logging_string(self) -> str:
+        """The string used to describe evaluation on the dataset in logging."""
+        if self._logging_string is not None:
+            return self._logging_string
+
+        truncated_str = (
+            "truncated version of the "
+            if isinstance(self.source, str) and self.source.endswith("-mini")
+            else ""
+        )
+
+        logging_languages = list(deepcopy(self.languages))
+        if len(self.languages) > 1:
+            if (
+                NORWEGIAN_BOKMÅL in self.languages
+                and NORWEGIAN_NYNORSK in self.languages
+                and NORWEGIAN in self.languages
+            ):
+                logging_languages.remove(NORWEGIAN_BOKMÅL)
+                logging_languages.remove(NORWEGIAN_NYNORSK)
+            elif (
+                NORWEGIAN_BOKMÅL in self.languages
+                or NORWEGIAN_NYNORSK in self.languages
+            ) and NORWEGIAN in self.languages:
+                logging_languages.remove(NORWEGIAN)
+            if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
+                logging_languages.remove(EUROPEAN_PORTUGUESE)
+
+        if len(logging_languages) > 1:
+            languages_str = (
+                ", ".join([lang.name for lang in logging_languages[:-1]])
+                + f" and {logging_languages[-1].name}"
+            )
+        else:
+            languages_str = logging_languages[0].name
+
+        task_str = self.task.name.replace("-", " ")
+        dataset_name_str = (
+            self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
+        )
+        return (
+            f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
+        )
+
     @property
     def prompt_prefix(self) -> str:
         """The prefix to use in the few-shot prompt."""
-
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
         prompt_prefix = (
             prompt_config.default_prompt_prefix
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix
 
     @property
     def prompt_template(self) -> str:
         """The template used during few-shot evaluation."""
-
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
         prompt_template = (
             prompt_config.default_prompt_template
             if self._prompt_template is None
             else self._prompt_template
        )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template
 
     @property
     def instruction_prompt(self) -> str:
         """The prompt to use when evaluating instruction-tuned models."""
-
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
         instruction_prompt = (
             prompt_config.default_instruction_prompt
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt
 
     @property
@@ -434,9 +360,18 @@ class DatasetConfig:
         )
 
     @property
-    def labels(self) ->
+    def labels(self) -> c.Sequence[str]:
         """The labels in the dataset."""
-
+        if self._labels is not None:
+            return self._labels
+        elif self.task.default_labels is not None:
+            return self.task.default_labels
+        else:
+            raise ValueError(
+                f"Labels must be specified for dataset {self.name!r} with the "
+                f"attribute `_labels`, as the task {self.task.name!r} does not have "
+                "default labels."
+            )
 
     @property
     def prompt_label_mapping(self) -> dict[str, str]:
@@ -445,24 +380,48 @@ class DatasetConfig:
             return {label: label for label in self.labels}
         elif self._prompt_label_mapping is not None:
             return self._prompt_label_mapping
-
-        main_language = self.languages[0]
-        prompt_config = self.task.template_dict[main_language]
-
+        prompt_config = self.task.template_dict[self.main_language]
         if prompt_config.default_prompt_label_mapping == "auto":
             return {label: label for label in self.labels}
         else:
             return prompt_config.default_prompt_label_mapping
 
     @property
-    def
+    def allowed_model_types(self) -> c.Sequence[ModelType]:
+        """A list of model types that are allowed to be evaluated on this dataset."""
+        return (
+            self._allowed_model_types
+            if self._allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+
+    @property
+    def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
+        """A list of generative model types that are allowed on this dataset."""
+        return (
+            self._allowed_generative_types
+            if self._allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+
+    @property
+    def allow_invalid_model_outputs(self) -> bool:
+        """Whether to allow invalid model outputs."""
+        return (
+            self._allow_invalid_model_outputs
+            if self._allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+
+    @property
+    def id2label(self) -> "HashableDict":
         """The mapping from ID to label."""
-        return {idx: label for idx, label in enumerate(self.labels)}
+        return HashableDict({idx: label for idx, label in enumerate(self.labels)})
 
     @property
-    def label2id(self) ->
+    def label2id(self) -> "HashableDict":
         """The mapping from label to ID."""
-        return {label: i for i, label in enumerate(self.labels)}
+        return HashableDict({label: i for i, label in enumerate(self.labels)})
 
     @property
     def num_labels(self) -> int:
@@ -473,36 +432,36 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)
 
-
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: c.Sequence[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.
 
         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.
 
         Args:
-
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.
 
         Returns:
             The natural string representation of the labels in specified language.
         """
-        main_language = self.languages[0]
-
         if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
-            sep_word = main_language.and_separator
+            sep_word = self.main_language.and_separator
         else:
-            sep_word = main_language.or_separator
+            sep_word = self.main_language.or_separator
 
-
-
-
-
-
-
-
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)
 
         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in
+        quoted_labels = [f"'{label}'" for label in labels]
 
         if not quoted_labels:
             return ""
@@ -514,6 +473,213 @@ class DatasetConfig:
         return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
 
 
+@dataclass
+class BenchmarkConfig:
+    """General benchmarking configuration, across datasets and models.
+
+    Attributes:
+        datasets:
+            The datasets to benchmark on.
+        finetuning_batch_size:
+            The batch size to use for finetuning.
+        raise_errors:
+            Whether to raise errors instead of skipping them.
+        cache_dir:
+            Directory to store cached models and datasets.
+        api_key:
+            The API key to use for a given inference API.
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
+        progress_bar:
+            Whether to show a progress bar.
+        save_results:
+            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
+        device:
+            The device to use for benchmarking.
+        trust_remote_code:
+            Whether to trust remote code when loading models from the Hugging Face Hub.
+        clear_model_cache:
+            Whether to clear the model cache after benchmarking each model.
+        evaluate_test_split:
+            Whether to evaluate on the test split.
+        few_shot:
+            Whether to only evaluate the model using few-shot evaluation. Only relevant
+            if the model is generative.
+        num_iterations:
+            The number of iterations each model should be evaluated for.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
+        requires_safetensors:
+            Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
+        download_only:
+            Whether to only download the models, metrics and datasets without
+            evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
+    """
+
+    datasets: c.Sequence[DatasetConfig]
+    languages: c.Sequence[Language]
+    finetuning_batch_size: int
+    raise_errors: bool
+    cache_dir: str
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    progress_bar: bool
+    save_results: bool
+    device: torch.device
+    trust_remote_code: bool
+    clear_model_cache: bool
+    evaluate_test_split: bool
+    few_shot: bool
+    num_iterations: int
+    gpu_memory_utilization: float
+    requires_safetensors: bool
+    generative_type: GenerativeType | None
+    download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
+
+    @property
+    def tasks(self) -> c.Sequence[Task]:
+        """Get the tasks in the benchmark configuration."""
+        return list({dataset_config.task for dataset_config in self.datasets})
+
+    def __post_init__(self) -> None:
+        """Post-initialisation checks."""
+        # Set dummy API key if it has not been set and we're benchmarking a model on an
+        # inference API
+        if self.api_key is None and self.api_base is not None:
+            self.api_key = "dummy"
+
+
+class BenchmarkConfigParams(pydantic.BaseModel):
+    """The parameters for the benchmark configuration."""
+
+    model_config = pydantic.ConfigDict(
+        protected_namespaces=(), arbitrary_types_allowed=True
+    )
+
+    task: str | Task | c.Sequence[str | Task] | None
+    dataset: str | DatasetConfig | c.Sequence[str | DatasetConfig] | None
+    progress_bar: bool
+    save_results: bool
+    language: str | c.Sequence[str]
+    device: Device | None
+    finetuning_batch_size: int
+    raise_errors: bool
+    cache_dir: str
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    trust_remote_code: bool
+    clear_model_cache: bool
+    evaluate_test_split: bool
+    few_shot: bool
+    num_iterations: int
+    requires_safetensors: bool
+    download_only: bool
+    gpu_memory_utilization: float
+    generative_type: GenerativeType | None
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
+
+
+class BenchmarkResult(pydantic.BaseModel):
+    """A benchmark result."""
+
+    dataset: str
+    task: str
+    languages: c.Sequence[str]
+    model: str
+    results: ScoreDict
+    num_model_parameters: int
+    max_sequence_length: int
+    vocabulary_size: int
+    merge: bool
+    generative: bool
+    generative_type: str | None
+    few_shot: bool
+    validation_split: bool
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    xgrammar_version: str | None = get_package_version("xgrammar")
+
+    @classmethod
+    def from_dict(cls, config: dict) -> "BenchmarkResult":
+        """Create a benchmark result from a dictionary.
+
+        Args:
+            config:
+                The configuration dictionary.
+
+        Returns:
+            The benchmark result.
+        """
+        # To be backwards compatible, we accept old results which changed the model
+        # name with parameters rather than adding them as explicit parameters
+        val_matches = re.search(r"\(.*val.*\)$", config["model"])
+        few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
+        zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
+        config["model"] = re.sub(
+            r"\(.*(few-shot|val).*\)$", "", config["model"]
+        ).strip()
+
+        if "merge" not in config:
+            config["merge"] = False
+        if "generative" not in config:
+            config["generative"] = (
+                few_shot_matches is not None or zero_shot_matches is not None
+            )
+        if "generative_type" not in config:
+            config["generative_type"] = None
+        if "few_shot" not in config:
+            config["few_shot"] = zero_shot_matches is None
+        if "validation_split" not in config:
+            config["validation_split"] = val_matches is not None
+
+        # Backwards compatibility
+        if "dataset_languages" in config:
+            config["languages"] = config.pop("dataset_languages")
+
+        return cls(**config)
+
+    def append_to_results(self, results_path: pathlib.Path) -> None:
+        """Append the benchmark result to the results file.
+
+        Args:
+            results_path:
+                The path to the results file.
+        """
+        json_str = json.dumps(self.model_dump())
+        with results_path.open("a") as f:
+            f.write("\n" + json_str)
+
+
 @dataclass
 class ModelConfig:
     """Configuration for a model.
@@ -523,6 +689,8 @@ class ModelConfig:
             The ID of the model.
         revision:
             The revision of the model.
+        param:
+            The parameter of the model, or None if the model has no parameters.
         task:
             The task that the model was trained on.
         languages:
@@ -544,9 +712,10 @@ class ModelConfig:
 
     model_id: str
     revision: str
+    param: str | None
     task: str
-    languages:
-    inference_backend: InferenceBackend
+    languages: c.Sequence[Language]
+    inference_backend: "InferenceBackend"
     merge: bool
     model_type: ModelType
    fresh: bool
@@ -573,7 +742,7 @@ class PreparedModelInputs:
         instead.
     """
 
-    texts:
+    texts: c.Sequence[str] | None = None
    input_ids: torch.Tensor | None = None
     attention_mask: torch.Tensor | None = None
 
@@ -591,8 +760,8 @@ class GenerativeModelOutput:
             token and its logprob. Can be None if the scores are not available.
     """
 
-    sequences:
-    scores:
+    sequences: c.Sequence[str]
+    scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] | None = None
 
 
 @dataclass
@@ -609,7 +778,7 @@ class SingleGenerativeModelOutput:
     """
 
     sequence: str
-    scores:
+    scores: c.Sequence[c.Sequence[tuple[str, float]]] | None = None
 
 
 @dataclass
@@ -627,33 +796,31 @@ class HFModelInfo:
     """
 
     pipeline_tag: str
-    tags:
+    tags: c.Sequence[str]
     adapter_base_model_id: str | None
 
 
 @dataclass
-class
-    """
-
-    Defines the prompt templates needed for evaluating a specific task in a given
-    language.
+class ModelIdComponents:
+    """A model ID split into its components.
 
     Attributes:
-
-        The
-
-        The
-
-
-        The default prompt to use when benchmarking the dataset using
-            instruction-based evaluation.
-        default_prompt_label_mapping:
-            The default mapping from the labels to another phrase which is used as a
-            substitute for the label in few-shot evaluation. If set to "auto", the
-            mapping will be set to a 1:1 mapping between the labels and themselves.
+        model_id:
+            The main model ID without revision or parameters.
+        revision:
+            The revision of the model, if any.
+        param:
+            The parameter of the model, if any.
     """
 
-
-
-
-
+    model_id: str
+    revision: str
+    param: str | None
+
+
+class HashableDict(dict):
+    """A hashable dictionary."""
+
    def __hash__(self) -> int:  # type: ignore[override]
        """Return the hash of the dictionary."""
        return hash(frozenset(self.items()))
```