EuroEval 15.2.0 (py3-none-any.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/data_models.py
@@ -0,0 +1,474 @@
+ """Data models used in EuroEval."""
+
+ import collections.abc as c
+ import importlib.metadata
+ import json
+ import pathlib
+ import re
+ import typing as t
+ from dataclasses import dataclass, field
+
+ import pydantic
+ import torch
+
+ from .enums import Device, InferenceBackend, ModelType, TaskGroup
+ from .types import ScoreDict
+
+
+ @dataclass
+ class MetricConfig:
+     """Configuration for a metric.
+
+     Attributes:
+         name:
+             The name of the metric.
+         pretty_name:
+             A longer prettier name for the metric, which allows cases and spaces. Used
+             for logging.
+         huggingface_id:
+             The Hugging Face ID of the metric.
+         results_key:
+             The name of the key used to extract the metric scores from the results
+             dictionary.
+         compute_kwargs:
+             Keyword arguments to pass to the metric's compute function. Defaults to
+             an empty dictionary.
+         postprocessing_fn:
+             A function to apply to the metric scores after they are computed, taking
+             the score to the postprocessed score along with its string representation.
+             Defaults to x -> (100 * x, f"{x:.2%}").
+     """
+
+     name: str
+     pretty_name: str
+     huggingface_id: str
+     results_key: str
+     compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
+     postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
+         default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
+     )
+
+     def __hash__(self) -> int:
+         """Return a hash of the metric configuration."""
+         return hash(self.name)
+
+
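The default postprocessing_fn turns a raw score into a percentage plus a formatted string. Below is a minimal sketch of that behaviour; the metric name, Hugging Face ID and results key are placeholders rather than values shipped with EuroEval.

from euroeval.data_models import MetricConfig

# Hypothetical metric configuration; the name, Hugging Face ID and results key
# below are placeholders.
metric = MetricConfig(
    name="example-f1",
    pretty_name="Example F1-score",
    huggingface_id="f1",
    results_key="f1",
)

# The default postprocessing_fn scales the score and renders it as a string.
score, score_str = metric.postprocessing_fn(0.5)
print(score, score_str)  # 50.0 50.00%
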
+ @dataclass
+ class Task:
+     """A dataset task.
+
+     Attributes:
+         name:
+             The name of the task.
+         task_group:
+             The task group of the task.
+         metrics:
+             The metrics used to evaluate the task.
+     """
+
+     name: str
+     task_group: TaskGroup
+     metrics: list[MetricConfig]
+
+     def __hash__(self) -> int:
+         """Return a hash of the task."""
+         return hash(self.name)
+
+
+ @dataclass
+ class Language:
+     """A benchmarkable language.
+
+     Attributes:
+         code:
+             The ISO 639-1 language code of the language.
+         name:
+             The name of the language.
+     """
+
+     code: str
+     name: str
+
+     def __hash__(self) -> int:
+         """Return a hash of the language."""
+         return hash(self.code)
+
+
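MetricConfig, Task, Language, DatasetConfig and ModelConfig all hash on a single identifying field, so instances can be collected in sets or used as dictionary keys. A small illustration (the instances are constructed by hand here; EuroEval defines its own languages in euroeval/languages.py):

from euroeval.data_models import Language

# Two instances with identical fields hash and compare equal, so a set keeps
# only one of them.
danish_a = Language(code="da", name="Danish")
danish_b = Language(code="da", name="Danish")
languages = {danish_a, danish_b, Language(code="sv", name="Swedish")}
print(len(languages))  # 2
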
+ @dataclass
+ class BenchmarkConfig:
+     """General benchmarking configuration, across datasets and models.
+
+     Attributes:
+         model_languages:
+             The languages of the models to benchmark.
+         dataset_languages:
+             The languages of the datasets in the benchmark.
+         tasks:
+             The tasks to benchmark the model(s) on.
+         datasets:
+             The datasets to benchmark on.
+         batch_size:
+             The batch size to use.
+         raise_errors:
+             Whether to raise errors instead of skipping them.
+         cache_dir:
+             Directory to store cached models and datasets.
+         api_key:
+             The API key to use for a given inference API.
+         force:
+             Whether to force the benchmark to run even if the results are already
+             cached.
+         progress_bar:
+             Whether to show a progress bar.
+         save_results:
+             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
+         device:
+             The device to use for benchmarking.
+         verbose:
+             Whether to print verbose output.
+         trust_remote_code:
+             Whether to trust remote code when loading models from the Hugging Face Hub.
+         use_flash_attention:
+             Whether to use Flash Attention. If None then this will be used for
+             generative models.
+         clear_model_cache:
+             Whether to clear the model cache after benchmarking each model.
+         evaluate_test_split:
+             Whether to evaluate on the test split.
+         few_shot:
+             Whether to only evaluate the model using few-shot evaluation. Only relevant
+             if the model is generative.
+         num_iterations:
+             The number of iterations each model should be evaluated for.
+         api_base:
+             The base URL for a given inference API. Only relevant if `model` refers to a
+             model on an inference API.
+         api_version:
+             The version of the API to use. Only relevant if `model` refers to a model on
+             an inference API.
+         debug:
+             Whether to run the benchmark in debug mode.
+         run_with_cli:
+             Whether the benchmark is being run with the CLI.
+         only_allow_safetensors:
+             Whether to only allow models that use the safetensors format.
+     """
+
+     model_languages: list[Language]
+     dataset_languages: list[Language]
+     tasks: list[Task]
+     datasets: list[str]
+     batch_size: int
+     raise_errors: bool
+     cache_dir: str
+     api_key: str | None
+     force: bool
+     progress_bar: bool
+     save_results: bool
+     device: torch.device
+     verbose: bool
+     trust_remote_code: bool
+     use_flash_attention: bool | None
+     clear_model_cache: bool
+     evaluate_test_split: bool
+     few_shot: bool
+     num_iterations: int
+     api_base: str | None
+     api_version: str | None
+     debug: bool
+     run_with_cli: bool
+     only_allow_safetensors: bool
+
+
+ class BenchmarkConfigParams(pydantic.BaseModel):
+     """The parameters for the benchmark configuration."""
+
+     model_config = pydantic.ConfigDict(protected_namespaces=())
+
+     progress_bar: bool
+     save_results: bool
+     task: str | list[str] | None
+     dataset: str | list[str] | None
+     language: str | list[str]
+     model_language: str | list[str] | None
+     dataset_language: str | list[str] | None
+     device: Device | None
+     batch_size: int
+     raise_errors: bool
+     cache_dir: str
+     api_key: str | None
+     force: bool
+     verbose: bool
+     trust_remote_code: bool
+     use_flash_attention: bool | None
+     clear_model_cache: bool
+     evaluate_test_split: bool
+     few_shot: bool
+     num_iterations: int
+     api_base: str | None
+     api_version: str | None
+     debug: bool
+     run_with_cli: bool
+     only_allow_safetensors: bool
+
+
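BenchmarkConfigParams is a pydantic model rather than a plain dataclass, so the raw keyword arguments, presumably those collected by the CLI or the Benchmarker, are validated up front. A hedged sketch of what that validation gives you; the field values are illustrative only:

import pydantic
from euroeval.data_models import BenchmarkConfigParams

# BenchmarkConfigParams declares no defaults, so pydantic reports every missing
# or mistyped field in one go rather than failing later in the benchmark run.
try:
    BenchmarkConfigParams(language="da", batch_size=32)
except pydantic.ValidationError as err:
    print(err.error_count(), "missing or invalid fields")
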
+ class BenchmarkResult(pydantic.BaseModel):
+     """A benchmark result."""
+
+     dataset: str
+     task: str
+     dataset_languages: list[str]
+     model: str
+     results: ScoreDict
+     num_model_parameters: int
+     max_sequence_length: int
+     vocabulary_size: int
+     merge: bool
+     generative: bool
+     generative_type: str | None
+     few_shot: bool
+     validation_split: bool
+     euroeval_version: str = importlib.metadata.version("euroeval")
+
+     @classmethod
+     def from_dict(cls, config: dict) -> "BenchmarkResult":
+         """Create a benchmark result from a dictionary.
+
+         Args:
+             config:
+                 The configuration dictionary.
+
+         Returns:
+             The benchmark result.
+         """
+         # To be backwards compatible, we accept old results which changed the model
+         # name with parameters rather than adding them as explicit parameters
+         val_matches = re.search(r"\(.*val.*\)$", config["model"])
+         few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
+         zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
+         config["model"] = re.sub(
+             r"\(.*(few-shot|val).*\)$", "", config["model"]
+         ).strip()
+
+         if "merge" not in config:
+             config["merge"] = False
+         if "generative" not in config:
+             config["generative"] = (
+                 few_shot_matches is not None or zero_shot_matches is not None
+             )
+         if "generative_type" not in config:
+             config["generative_type"] = None
+         if "few_shot" not in config:
+             config["few_shot"] = zero_shot_matches is None
+         if "validation_split" not in config:
+             config["validation_split"] = val_matches is not None
+
+         return cls(**config)
+
+     def append_to_results(self, results_path: pathlib.Path) -> None:
+         """Append the benchmark result to the results file.
+
+         Args:
+             results_path:
+                 The path to the results file.
+         """
+         json_str = json.dumps(self.model_dump())
+         with results_path.open("a") as f:
+             f.write("\n" + json_str)
+
+
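from_dict exists for backwards compatibility: older result files encoded the evaluation setup as a suffix on the model name (for example "(few-shot, val)") instead of as separate fields, so the method strips the suffix and reconstructs the missing flags. A sketch of the round trip; every name and number below is made up:

import pathlib
from euroeval.data_models import BenchmarkResult

# An old-style record: the few-shot/val markers live in the model name and the
# newer boolean fields are absent.  All names and numbers are fabricated.
old_record = {
    "dataset": "example-dataset",
    "task": "sentiment-classification",
    "dataset_languages": ["da"],
    "model": "example-org/example-model (few-shot, val)",
    "results": {"raw": [{"test_mcc": 61.9}], "total": {"test_mcc": 61.9}},
    "num_model_parameters": 124_000_000,
    "max_sequence_length": 512,
    "vocabulary_size": 50_000,
}

result = BenchmarkResult.from_dict(old_record)
print(result.model)             # example-org/example-model
print(result.generative)        # True  (the suffix mentioned "few-shot")
print(result.few_shot)          # True  (no "zero-shot" marker)
print(result.validation_split)  # True  (the suffix mentioned "val")

# Results are appended to a file as one JSON object per line.
result.append_to_results(results_path=pathlib.Path("my_results.jsonl"))
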
+ @dataclass
+ class DatasetConfig:
+     """Configuration for a dataset.
+
+     Attributes:
+         name:
+             The name of the dataset. Must be lower case with no spaces.
+         pretty_name:
+             A longer prettier name for the dataset, which allows cases and spaces. Used
+             for logging.
+         huggingface_id:
+             The Hugging Face ID of the dataset.
+         task:
+             The task of the dataset.
+         languages:
+             The ISO 639-1 language codes of the entries in the dataset.
+         id2label:
+             The mapping from ID to label.
+         label2id:
+             The mapping from label to ID.
+         num_labels:
+             The number of labels in the dataset.
+         prompt_template:
+             The template for the prompt to use when benchmarking the dataset using
+             few-shot evaluation.
+         max_generated_tokens:
+             The maximum number of tokens to generate when benchmarking the dataset
+             using few-shot evaluation.
+         prompt_prefix:
+             The prefix to use in the few-shot prompt.
+         num_few_shot_examples:
+             The number of examples to use when benchmarking the dataset using few-shot
+             evaluation. For a classification task, these will be drawn evenly from
+             each label.
+         instruction_prompt:
+             The prompt to use when benchmarking the dataset using instruction-based
+             evaluation.
+         labels (optional):
+             The labels in the dataset. Defaults to an empty list.
+         prompt_label_mapping (optional):
+             A mapping from the labels to another phrase which is used as a substitute
+             for the label in few-shot evaluation. Defaults to an empty dictionary.
+         unofficial (optional):
+             Whether the dataset is unofficial. Defaults to False.
+     """
+
+     name: str
+     pretty_name: str
+     huggingface_id: str
+     task: Task
+     languages: list[Language]
+     prompt_template: str
+     max_generated_tokens: int
+     prompt_prefix: str
+     num_few_shot_examples: int
+     instruction_prompt: str
+     labels: list[str] = field(default_factory=list)
+     prompt_label_mapping: dict[str, str] = field(default_factory=dict)
+     unofficial: bool = False
+
+     @property
+     def id2label(self) -> dict[int, str]:
+         """The mapping from ID to label."""
+         return {idx: label for idx, label in enumerate(self.labels)}
+
+     @property
+     def label2id(self) -> dict[str, int]:
+         """The mapping from label to ID."""
+         return {label: i for i, label in enumerate(self.labels)}
+
+     @property
+     def num_labels(self) -> int:
+         """The number of labels in the dataset."""
+         return len(self.labels)
+
+     def __hash__(self) -> int:
+         """Return a hash of the dataset configuration."""
+         return hash(self.name)
+
+
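The id2label, label2id and num_labels properties are all derived from the labels field, so only the label list has to be supplied. A minimal sketch with placeholder prompts and a hypothetical dataset; note that the TaskGroup member name is an assumption (the real members live in euroeval/enums.py), and real dataset configurations are defined in euroeval/dataset_configs.py:

from euroeval.data_models import DatasetConfig, Language, Task
from euroeval.enums import TaskGroup

# A stripped-down, hypothetical dataset configuration.  The TaskGroup member
# name below is an assumption; check euroeval/enums.py for the actual members.
toy_task = Task(
    name="sentiment-classification",
    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
    metrics=[],
)
toy_config = DatasetConfig(
    name="toy-sentiment",
    pretty_name="Toy Sentiment",
    huggingface_id="example-org/toy-sentiment",
    task=toy_task,
    languages=[Language(code="da", name="Danish")],
    prompt_template="{text}\n{label}",
    max_generated_tokens=8,
    prompt_prefix="Classify the sentiment of the documents below.",
    num_few_shot_examples=12,
    instruction_prompt="What is the sentiment of this document?",
    labels=["negative", "neutral", "positive"],
)

print(toy_config.num_labels)  # 3
print(toy_config.id2label)    # {0: 'negative', 1: 'neutral', 2: 'positive'}
print(toy_config.label2id)    # {'negative': 0, 'neutral': 1, 'positive': 2}
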
+ @dataclass
+ class ModelConfig:
+     """Configuration for a model.
+
+     Attributes:
+         model_id:
+             The ID of the model.
+         revision:
+             The revision of the model.
+         task:
+             The task that the model was trained on.
+         languages:
+             The languages of the model.
+         inference_backend:
+             The backend used to perform inference with the model.
+         merge:
+             Whether the model is a merged model.
+         model_type:
+             The type of the model (e.g., encoder, base decoder, instruction tuned).
+         fresh:
+             Whether the model is freshly initialised.
+         model_cache_dir:
+             The directory to cache the model in.
+         adapter_base_model_id:
+             The model ID of the base model if the model is an adapter model. Can be None
+             if the model is not an adapter model.
+     """
+
+     model_id: str
+     revision: str
+     task: str
+     languages: list[Language]
+     inference_backend: InferenceBackend
+     merge: bool
+     model_type: ModelType
+     fresh: bool
+     model_cache_dir: str
+     adapter_base_model_id: str | None
+
+     def __hash__(self) -> int:
+         """Return a hash of the model configuration."""
+         return hash(self.model_id)
+
+
+ @dataclass
+ class PreparedModelInputs:
+     """The inputs to a model.
+
+     Attributes:
+         texts:
+             The texts to input to the model. Can be None if the input IDs and attention
+             mask are provided instead.
+         input_ids:
+             The input IDs of the texts. Can be None if the texts are provided instead.
+         attention_mask:
+             The attention mask of the texts. Can be None if the texts are provided
+             instead.
+     """
+
+     texts: list[str] | None = None
+     input_ids: torch.Tensor | None = None
+     attention_mask: torch.Tensor | None = None
+
+
+ @dataclass
+ class GenerativeModelOutput:
+     """The output of a generative model.
+
+     Attributes:
+         sequences:
+             The generated sequences.
+         scores:
+             The scores of the sequences. This is an array of shape (batch_size,
+             num_tokens, num_logprobs, 2), where the last dimension contains the
+             token and its logprob. Can be None if the scores are not available.
+     """
+
+     sequences: list[str]
+     scores: list[list[list[tuple[str, float]]]] | None = None
+
+
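The scores field is a nested Python list rather than a tensor: scores[i][j] holds the top log-probability candidates for token j of sequence i, each as a (token, logprob) pair. A short sketch with fabricated numbers:

from euroeval.data_models import GenerativeModelOutput

# Fabricated output for a single prompt, keeping two candidate tokens per
# generated token (num_logprobs = 2).
output = GenerativeModelOutput(
    sequences=["positive"],
    scores=[
        [  # sequence 0
            [("positive", -0.11), ("negative", -2.30)],  # generated token 0
        ]
    ],
)

if output.scores is not None:
    top_token, top_logprob = output.scores[0][0][0]
    print(top_token, top_logprob)  # positive -0.11
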
+ @dataclass
+ class SingleGenerativeModelOutput:
+     """A single output of a generative model.
+
+     Attributes:
+         sequence:
+             The generated sequence.
+         scores:
+             The scores of the sequence. This is an array of shape (num_tokens,
+             num_logprobs, 2), where the last dimension contains the token and its
+             logprob. Can be None if the scores are not available.
+     """
+
+     sequence: str
+     scores: list[list[tuple[str, float]]] | None = None
+
+
+ @dataclass
+ class HFModelInfo:
+     """Information about a Hugging Face model.
+
+     Attributes:
+         pipeline_tag:
+             The pipeline tag of the model.
+         tags:
+             The other tags of the model.
+         adapter_base_model_id:
+             The model ID of the base model if the model is an adapter model. Can be None
+             if the model is not an adapter model.
+     """
+
+     pipeline_tag: str
+     tags: list[str]
+     adapter_base_model_id: str | None