EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py
CHANGED
@@ -1,5 +1,6 @@
 """Data models used in EuroEval."""
 
+import collections.abc as c
 import json
 import pathlib
 import re
@@ -10,79 +11,42 @@ import pydantic
 import torch
 
 from .enums import Device, GenerativeType, ModelType, TaskGroup
+from .exceptions import InvalidBenchmark
+from .languages import ENGLISH, NORWEGIAN, PORTUGUESE, Language
+from .metrics.base import Metric
 from .types import ScoreDict
 from .utils import get_package_version
 
 if t.TYPE_CHECKING:
     from .enums import InferenceBackend
-    from .metrics import Metric
 
 
 @dataclass
-class
-    """
+class PromptConfig:
+    """Configuration for task-specific prompting across languages.
+
+    Defines the prompt templates needed for evaluating a specific task in a given
+    language.
 
     Attributes:
-
-            The
-
-            The
-
-
-
-
+        default_prompt_prefix:
+            The default prefix to use in the few-shot prompt.
+        default_prompt_template:
+            The default template for the prompt to use when benchmarking the dataset
+            using few-shot evaluation.
+        default_instruction_prompt:
+            The default prompt to use when benchmarking the dataset using
+            instruction-based evaluation.
+        default_prompt_label_mapping:
+            The default mapping from the labels to another phrase which is used as a
+            substitute for the label in few-shot evaluation. If set to "auto", the
+            mapping will be set to a 1:1 mapping between the labels and themselves.
     """
 
-
-
-
-
-
-    def __hash__(self) -> int:
-        """Return a hash of the language."""
-        return hash(self.code)
-
-    @property
-    def and_separator(self) -> str:
-        """Get the word 'and' in the language.
-
-        Returns:
-            The word 'and' in the language.
-
-        Raises:
-            NotImplementedError:
-                If `and_separator` is `None`.
-        """
-        if not self._and_separator:
-            raise NotImplementedError(
-                f"Separator for the word 'and' has not been defined for {self.name}."
-            )
-        return self._and_separator
-
-    @and_separator.setter
-    def and_separator(self, value: str | None) -> None:
-        self._and_separator = value
-
-    @property
-    def or_separator(self) -> str:
-        """Get the word 'or' in the language.
-
-        Returns:
-            The word 'or' in the language.
-
-        Raises:
-            NotImplementedError:
-                If `or_separator` is `None`.
-        """
-        if not self._or_separator:
-            raise NotImplementedError(
-                f"Separator for the word 'or' has not been defined for {self.name}."
-            )
-        return self._or_separator
-
-    @or_separator.setter
-    def or_separator(self, value: str | None) -> None:
-        self._or_separator = value
+    default_prompt_prefix: str
+    default_prompt_template: str
+    default_instruction_prompt: str
+    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
 
 
 @dataclass
@@ -133,21 +97,25 @@ class Task:
            considered incorrect and the evaluation will be aborted. Defaults to True.
     """
 
+    model_config = pydantic.ConfigDict(
+        protected_namespaces=(), arbitrary_types_allowed=True
+    )
+
     name: str
     task_group: TaskGroup
-    template_dict: dict[
-    metrics:
+    template_dict: dict[Language, PromptConfig]
+    metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels:
+    default_labels: c.Sequence[str] | None
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
    uses_logprobs: bool = False
     requires_logprobs: bool = False
-    default_allowed_model_types:
+    default_allowed_model_types: c.Sequence[ModelType] = field(
        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    default_allowed_generative_types:
+    default_allowed_generative_types: c.Sequence[GenerativeType] = field(
        default_factory=lambda: [
            GenerativeType.BASE,
            GenerativeType.INSTRUCTION_TUNED,
@@ -165,205 +133,6 @@ class Task:
        return hash(self.name)
 
 
-@dataclass
-class BenchmarkConfig:
-    """General benchmarking configuration, across datasets and models.
-
-    Attributes:
-        tasks:
-            The tasks benchmark the model(s) on.
-        datasets:
-            The datasets to benchmark on.
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
-        device:
-            The device to use for benchmarking.
-        batch_size:
-            The batch size to use.
-        raise_errors:
-            Whether to raise errors instead of skipping them.
-        cache_dir:
-            Directory to store cached models and datasets.
-        api_key:
-            The API key to use for a given inference API.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
-        progress_bar:
-            Whether to show a progress bar.
-        save_results:
-            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        trust_remote_code:
-            Whether to trust remote code when loading models from the Hugging Face Hub.
-        clear_model_cache:
-            Whether to clear the model cache after benchmarking each model.
-        evaluate_test_split:
-            Whether to evaluate on the test split.
-        few_shot:
-            Whether to only evaluate the model using few-shot evaluation. Only relevant
-            if the model is generative.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        requires_safetensors:
-            Whether to only allow models that use the safetensors format.
-        generative_type:
-            The type of generative model to benchmark. Only relevant if the model is
-            generative.
-        download_only:
-            Whether to only download the models, metrics and datasets without
-            evaluating.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        verbose:
-            Whether to print verbose output.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-    """
-
-    model_languages: list[Language]
-    dataset_languages: list[Language]
-    tasks: list[Task]
-    datasets: list[str]
-    batch_size: int
-    raise_errors: bool
-    cache_dir: str
-    api_key: str | None
-    api_base: str | None
-    api_version: str | None
-    progress_bar: bool
-    save_results: bool
-    device: torch.device
-    trust_remote_code: bool
-    clear_model_cache: bool
-    evaluate_test_split: bool
-    few_shot: bool
-    num_iterations: int
-    gpu_memory_utilization: float
-    requires_safetensors: bool
-    generative_type: GenerativeType | None
-    download_only: bool
-    force: bool
-    verbose: bool
-    debug: bool
-    run_with_cli: bool
-
-
-class BenchmarkConfigParams(pydantic.BaseModel):
-    """The parameters for the benchmark configuration."""
-
-    model_config = pydantic.ConfigDict(protected_namespaces=())
-
-    task: str | list[str] | None
-    dataset: str | list[str] | None
-    progress_bar: bool
-    save_results: bool
-    language: str | list[str]
-    model_language: str | list[str] | None
-    dataset_language: str | list[str] | None
-    device: Device | None
-    batch_size: int
-    raise_errors: bool
-    cache_dir: str
-    api_key: str | None
-    api_base: str | None
-    api_version: str | None
-    trust_remote_code: bool
-    clear_model_cache: bool
-    evaluate_test_split: bool
-    few_shot: bool
-    num_iterations: int
-    requires_safetensors: bool
-    download_only: bool
-    gpu_memory_utilization: float
-    generative_type: GenerativeType | None
-    force: bool
-    verbose: bool
-    debug: bool
-    run_with_cli: bool
-
-
-class BenchmarkResult(pydantic.BaseModel):
-    """A benchmark result."""
-
-    dataset: str
-    task: str
-    dataset_languages: list[str]
-    model: str
-    results: ScoreDict
-    num_model_parameters: int
-    max_sequence_length: int
-    vocabulary_size: int
-    merge: bool
-    generative: bool
-    generative_type: str | None
-    few_shot: bool
-    validation_split: bool
-    euroeval_version: str | None = get_package_version("euroeval")
-    transformers_version: str | None = get_package_version("transformers")
-    torch_version: str | None = get_package_version("torch")
-    vllm_version: str | None = get_package_version("vllm")
-    xgrammar_version: str | None = get_package_version("xgrammar")
-
-    @classmethod
-    def from_dict(cls, config: dict) -> "BenchmarkResult":
-        """Create a benchmark result from a dictionary.
-
-        Args:
-            config:
-                The configuration dictionary.
-
-        Returns:
-            The benchmark result.
-        """
-        # To be backwards compatible, we accept old results which changed the model
-        # name with parameters rather than adding them as explicit parameters
-        val_matches = re.search(r"\(.*val.*\)$", config["model"])
-        few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
-        zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
-        config["model"] = re.sub(
-            r"\(.*(few-shot|val).*\)$", "", config["model"]
-        ).strip()
-
-        if "merge" not in config:
-            config["merge"] = False
-        if "generative" not in config:
-            config["generative"] = (
-                few_shot_matches is not None or zero_shot_matches is not None
-            )
-        if "generative_type" not in config:
-            config["generative_type"] = None
-        if "few_shot" not in config:
-            config["few_shot"] = zero_shot_matches is None
-        if "validation_split" not in config:
-            config["validation_split"] = val_matches is not None
-
-        return cls(**config)
-
-    def append_to_results(self, results_path: pathlib.Path) -> None:
-        """Append the benchmark result to the results file.
-
-        Args:
-            results_path:
-                The path to the results file.
-        """
-        json_str = json.dumps(self.model_dump())
-        with results_path.open("a") as f:
-            f.write("\n" + json_str)
-
-
 @dataclass
 class DatasetConfig:
     """Configuration for a dataset.
@@ -374,8 +143,9 @@ class DatasetConfig:
        pretty_name:
            A longer prettier name for the dataset, which allows cases and spaces. Used
            for logging.
-
-            The Hugging Face ID
+        source:
+            The source of the dataset, which can be a Hugging Face ID or a dictionary
+            with keys "train", "val" and "test" mapping to local CSV file paths.
        task:
            The task of the dataset.
        languages:
@@ -427,6 +197,10 @@ class DatasetConfig:
            will be mapped to the closest valid label. If False, the model output will
            be considered incorrect and the evaluation will be aborted. Defaults to
            the one for the task.
+        _logging_string (optional):
+            The string used to describe evaluation on the dataset in logging. If not
+            provided, a default string will be generated, based on the pretty name. Only
+            use this if the default string is not suitable.
        splits (optional):
            The names of the splits in the dataset. If not provided, defaults to
            ["train", "val", "test"].
@@ -438,28 +212,77 @@ class DatasetConfig:
 
     name: str
     pretty_name: str
-
+    source: str | dict[str, str]
     task: Task
-    languages:
+    languages: c.Sequence[Language]
     _prompt_prefix: str | None = None
     _prompt_template: str | None = None
     _instruction_prompt: str | None = None
     _num_few_shot_examples: int | None = None
     _max_generated_tokens: int | None = None
-    _labels:
+    _labels: c.Sequence[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
-    _allowed_model_types:
-    _allowed_generative_types:
+    _allowed_model_types: c.Sequence[ModelType] | None = None
+    _allowed_generative_types: c.Sequence[GenerativeType] | None = None
     _allow_invalid_model_outputs: bool | None = None
-
+    _logging_string: str | None = None
+    splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False
 
+    @property
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
+
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
+
+    @property
+    def logging_string(self) -> str:
+        """The string used to describe evaluation on the dataset in logging."""
+        if self._logging_string is not None:
+            return self._logging_string
+        truncated_str = (
+            "truncated version of the "
+            if isinstance(self.source, str) and self.source.endswith("-mini")
+            else ""
+        )
+        if len(self.languages) > 1:
+            languages_str = (
+                ", ".join([lang.name for lang in self.languages[:-1]])
+                + f" and {self.languages[-1].name}"
+            )
+        else:
+            languages_str = self.languages[0].name
+        task_str = self.task.name.replace("-", " ")
+        dataset_name_str = (
+            self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
+        )
+        return (
+            f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
+        )
+
     @property
     def prompt_prefix(self) -> str:
        """The prefix to use in the few-shot prompt."""
-
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
        prompt_prefix = (
            prompt_config.default_prompt_prefix
            if self._prompt_prefix is None
@@ -470,8 +293,7 @@ class DatasetConfig:
     @property
     def prompt_template(self) -> str:
        """The template used during few-shot evaluation."""
-
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
        prompt_template = (
            prompt_config.default_prompt_template
            if self._prompt_template is None
@@ -482,8 +304,7 @@ class DatasetConfig:
     @property
     def instruction_prompt(self) -> str:
        """The prompt to use when evaluating instruction-tuned models."""
-
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
        instruction_prompt = (
            prompt_config.default_instruction_prompt
            if self._instruction_prompt is None
@@ -510,9 +331,18 @@ class DatasetConfig:
        )
 
     @property
-    def labels(self) ->
+    def labels(self) -> c.Sequence[str]:
        """The labels in the dataset."""
-
+        if self._labels is not None:
+            return self._labels
+        elif self.task.default_labels is not None:
+            return self.task.default_labels
+        else:
+            raise ValueError(
+                f"Labels must be specified for dataset {self.name!r} with the "
+                f"attribute `_labels`, as the task {self.task.name!r} does not have "
+                "default labels."
+            )
 
     @property
     def prompt_label_mapping(self) -> dict[str, str]:
@@ -521,17 +351,14 @@ class DatasetConfig:
            return {label: label for label in self.labels}
        elif self._prompt_label_mapping is not None:
            return self._prompt_label_mapping
-
-        main_language = self.languages[0]
-        prompt_config = self.task.template_dict[main_language]
-
+        prompt_config = self.task.template_dict[self.main_language]
        if prompt_config.default_prompt_label_mapping == "auto":
            return {label: label for label in self.labels}
        else:
            return prompt_config.default_prompt_label_mapping
 
     @property
-    def allowed_model_types(self) ->
+    def allowed_model_types(self) -> c.Sequence[ModelType]:
        """A list of model types that are allowed to be evaluated on this dataset."""
        return (
            self._allowed_model_types
@@ -540,7 +367,7 @@ class DatasetConfig:
        )
 
     @property
-    def allowed_generative_types(self) ->
+    def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
        """A list of generative model types that are allowed on this dataset."""
        return (
            self._allowed_generative_types
@@ -576,7 +403,7 @@ class DatasetConfig:
        """Return a hash of the dataset configuration."""
        return hash(self.name)
 
-    def get_labels_str(self, labels:
+    def get_labels_str(self, labels: c.Sequence[str] | None = None) -> str:
        """Converts a set of labels to a natural string, in the specified language.
 
        If the task is NER, we separate using 'and' and use the mapped labels instead of
@@ -590,12 +417,10 @@ class DatasetConfig:
        Returns:
            The natural string representation of the labels in specified language.
        """
-        main_language = self.languages[0]
-
        if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
-            sep_word = main_language.and_separator
+            sep_word = self.main_language.and_separator
        else:
-            sep_word = main_language.or_separator
+            sep_word = self.main_language.or_separator
 
        if labels is None:
            labels = list()
@@ -619,6 +444,209 @@ class DatasetConfig:
        return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
 
 
+@dataclass
+class BenchmarkConfig:
+    """General benchmarking configuration, across datasets and models.
+
+    Attributes:
+        datasets:
+            The datasets to benchmark on.
+        model_languages:
+            The languages of the models to benchmark.
+        dataset_languages:
+            The languages of the datasets in the benchmark.
+        batch_size:
+            The batch size to use.
+        raise_errors:
+            Whether to raise errors instead of skipping them.
+        cache_dir:
+            Directory to store cached models and datasets.
+        api_key:
+            The API key to use for a given inference API.
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
+        progress_bar:
+            Whether to show a progress bar.
+        save_results:
+            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
+        device:
+            The device to use for benchmarking.
+        trust_remote_code:
+            Whether to trust remote code when loading models from the Hugging Face Hub.
+        clear_model_cache:
+            Whether to clear the model cache after benchmarking each model.
+        evaluate_test_split:
+            Whether to evaluate on the test split.
+        few_shot:
+            Whether to only evaluate the model using few-shot evaluation. Only relevant
+            if the model is generative.
+        num_iterations:
+            The number of iterations each model should be evaluated for.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
+        requires_safetensors:
+            Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
+        download_only:
+            Whether to only download the models, metrics and datasets without
+            evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
+    """
+
+    datasets: c.Sequence[DatasetConfig]
+    model_languages: c.Sequence[Language]
+    dataset_languages: c.Sequence[Language]
+    batch_size: int
+    raise_errors: bool
+    cache_dir: str
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    progress_bar: bool
+    save_results: bool
+    device: torch.device
+    trust_remote_code: bool
+    clear_model_cache: bool
+    evaluate_test_split: bool
+    few_shot: bool
+    num_iterations: int
+    gpu_memory_utilization: float
+    requires_safetensors: bool
+    generative_type: GenerativeType | None
+    download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
+
+    @property
+    def tasks(self) -> c.Sequence[Task]:
+        """Get the tasks in the benchmark configuration."""
+        return list({dataset_config.task for dataset_config in self.datasets})
+
+
+class BenchmarkConfigParams(pydantic.BaseModel):
+    """The parameters for the benchmark configuration."""
+
+    model_config = pydantic.ConfigDict(
+        protected_namespaces=(), arbitrary_types_allowed=True
+    )
+
+    task: str | Task | c.Sequence[str | Task] | None
+    dataset: str | DatasetConfig | c.Sequence[str | DatasetConfig] | None
+    progress_bar: bool
+    save_results: bool
+    language: str | c.Sequence[str]
+    model_language: str | c.Sequence[str] | None
+    dataset_language: str | c.Sequence[str] | None
+    device: Device | None
+    batch_size: int
+    raise_errors: bool
+    cache_dir: str
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    trust_remote_code: bool
+    clear_model_cache: bool
+    evaluate_test_split: bool
+    few_shot: bool
+    num_iterations: int
+    requires_safetensors: bool
+    download_only: bool
+    gpu_memory_utilization: float
+    generative_type: GenerativeType | None
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
+
+
+class BenchmarkResult(pydantic.BaseModel):
+    """A benchmark result."""
+
+    dataset: str
+    task: str
+    dataset_languages: c.Sequence[str]
+    model: str
+    results: ScoreDict
+    num_model_parameters: int
+    max_sequence_length: int
+    vocabulary_size: int
+    merge: bool
+    generative: bool
+    generative_type: str | None
+    few_shot: bool
+    validation_split: bool
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    xgrammar_version: str | None = get_package_version("xgrammar")
+
+    @classmethod
+    def from_dict(cls, config: dict) -> "BenchmarkResult":
+        """Create a benchmark result from a dictionary.
+
+        Args:
+            config:
+                The configuration dictionary.
+
+        Returns:
+            The benchmark result.
+        """
+        # To be backwards compatible, we accept old results which changed the model
+        # name with parameters rather than adding them as explicit parameters
+        val_matches = re.search(r"\(.*val.*\)$", config["model"])
+        few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
+        zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
+        config["model"] = re.sub(
+            r"\(.*(few-shot|val).*\)$", "", config["model"]
+        ).strip()
+
+        if "merge" not in config:
+            config["merge"] = False
+        if "generative" not in config:
+            config["generative"] = (
+                few_shot_matches is not None or zero_shot_matches is not None
+            )
+        if "generative_type" not in config:
+            config["generative_type"] = None
+        if "few_shot" not in config:
+            config["few_shot"] = zero_shot_matches is None
+        if "validation_split" not in config:
+            config["validation_split"] = val_matches is not None
+
+        return cls(**config)
+
+    def append_to_results(self, results_path: pathlib.Path) -> None:
+        """Append the benchmark result to the results file.
+
+        Args:
+            results_path:
+                The path to the results file.
+        """
+        json_str = json.dumps(self.model_dump())
+        with results_path.open("a") as f:
+            f.write("\n" + json_str)
+
+
 @dataclass
 class ModelConfig:
     """Configuration for a model.
@@ -653,7 +681,7 @@ class ModelConfig:
     revision: str
     param: str | None
     task: str
-    languages:
+    languages: c.Sequence[Language]
     inference_backend: "InferenceBackend"
     merge: bool
     model_type: ModelType
@@ -681,7 +709,7 @@ class PreparedModelInputs:
            instead.
     """
 
-    texts:
+    texts: c.Sequence[str] | None = None
     input_ids: torch.Tensor | None = None
     attention_mask: torch.Tensor | None = None
 
@@ -699,8 +727,8 @@ class GenerativeModelOutput:
            token and its logprob. Can be None if the scores are not available.
     """
 
-    sequences:
-    scores:
+    sequences: c.Sequence[str]
+    scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] | None = None
 
 
 @dataclass
@@ -717,7 +745,7 @@ class SingleGenerativeModelOutput:
     """
 
     sequence: str
-    scores:
+    scores: c.Sequence[c.Sequence[tuple[str, float]]] | None = None
 
 
 @dataclass
@@ -735,38 +763,10 @@ class HFModelInfo:
     """
 
     pipeline_tag: str
-    tags:
+    tags: c.Sequence[str]
     adapter_base_model_id: str | None
 
 
-@dataclass
-class PromptConfig:
-    """Configuration for task-specific prompting across languages.
-
-    Defines the prompt templates needed for evaluating a specific task in a given
-    language.
-
-    Attributes:
-        default_prompt_prefix:
-            The default prefix to use in the few-shot prompt.
-        default_prompt_template:
-            The default template for the prompt to use when benchmarking the dataset
-            using few-shot evaluation.
-        default_instruction_prompt:
-            The default prompt to use when benchmarking the dataset using
-            instruction-based evaluation.
-        default_prompt_label_mapping:
-            The default mapping from the labels to another phrase which is used as a
-            substitute for the label in few-shot evaluation. If set to "auto", the
-            mapping will be set to a 1:1 mapping between the labels and themselves.
-    """
-
-    default_prompt_prefix: str
-    default_prompt_template: str
-    default_instruction_prompt: str
-    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
-
-
 @dataclass
 class ModelIdComponents:
     """A model ID split into its components.
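
To make the reshaped data models above easier to follow, here is a minimal usage sketch based only on the fields visible in this diff. The prompt strings, dataset name and CSV paths are made up, and the SENT task object is assumed to be exported from euroeval.tasks (any Task instance would do); ENGLISH is the language constant that data_models.py now imports from euroeval.languages.

from euroeval.data_models import DatasetConfig, PromptConfig
from euroeval.languages import ENGLISH
from euroeval.tasks import SENT  # assumed export; any euroeval Task instance works

# Task-level prompt defaults for one language, mirroring the four PromptConfig
# fields introduced above.
english_sentiment_prompts = PromptConfig(
    default_prompt_prefix="The following are documents and their sentiment.",
    default_prompt_template="Document: {text}\nSentiment: {label}",
    default_instruction_prompt="Document: {text}\n\nClassify the sentiment of the document.",
    default_prompt_label_mapping="auto",  # "auto" maps each label to itself
)

# A dataset backed by local CSV files rather than a Hugging Face ID, using the
# new `source` field described in the DatasetConfig docstring.
my_dataset = DatasetConfig(
    name="my-sentiment-dataset",
    pretty_name="My Sentiment Dataset",
    source={"train": "data/train.csv", "val": "data/val.csv", "test": "data/test.csv"},
    task=SENT,
    languages=[ENGLISH],
)

With a single language, DatasetConfig.main_language resolves to ENGLISH, and the prompt properties fall back to the defaults registered for the task in its template_dict.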