euroeval-15.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/generation.py ADDED
@@ -0,0 +1,296 @@
"""Functions related to text generation of models."""

import logging
import sys
import typing as t
from pathlib import Path

import more_itertools as mit
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm

from .benchmark_modules import BenchmarkModule
from .enums import BatchingPreference, TaskGroup
from .exceptions import InvalidBenchmark
from .model_cache import (
    ModelCache,
    load_cached_model_outputs,
    split_dataset_into_cached_and_non_cached,
)
from .utils import clear_memory

if t.TYPE_CHECKING:
    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

logger = logging.getLogger("euroeval")


def generate(
    model: "BenchmarkModule",
    datasets: list[DatasetDict],
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> list[dict[str, float]]:
    """Evaluate a model on a dataset through generation.

    Args:
        model:
            The model to evaluate.
        datasets:
            The datasets to evaluate on.
        model_config:
            The configuration of the model.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.

    Returns:
        A list of dictionaries containing the test scores.
    """
    # Set up the name of the model output cache. When testing, we write the model
    # outputs to a separate cache, which is deleted before each test so that the
    # tests stay independent of each other.
    if benchmark_config.debug:
        model_cache_dir = Path.cwd()
    else:
        model_cache_dir = Path(model_config.model_cache_dir)
    if hasattr(sys, "_called_from_test"):
        cache_name = f"{dataset_config.name}-model-outputs-test.json"
        (model_cache_dir / cache_name).unlink(missing_ok=True)
    elif benchmark_config.debug:
        cache_name = f"{model_config.model_id}-{dataset_config.name}-model-outputs.json"
    else:
        cache_name = f"{dataset_config.name}-model-outputs.json"

    cache = ModelCache(
        model_cache_dir=model_cache_dir,
        cache_name=cache_name,
        max_generated_tokens=dataset_config.max_generated_tokens,
    )

    scores: list[dict[str, float]] = list()
    for idx in tqdm(
        iterable=range(benchmark_config.num_iterations),
        desc="Benchmarking",
        disable=not benchmark_config.progress_bar,
    ):
        test_scores = generate_single_iteration(
            model=model,
            dataset=datasets[idx]["test"],
            cache=cache,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
        )

        logger.debug(f"Test scores for iteration {idx}: {test_scores}")
        scores.append(test_scores)
        clear_memory()

    if not benchmark_config.debug:
        cache.remove()

    return scores


def generate_single_iteration(
    dataset: Dataset,
    model: "BenchmarkModule",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
    cache: ModelCache,
) -> dict[str, float]:
    """Evaluate a model on a dataset in a single iteration through generation.

    Args:
        dataset:
            The dataset to evaluate on.
        model:
            The model to evaluate.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.
        cache:
            The model output cache.

    Returns:
        A dictionary containing the scores for each metric.
    """
    cache.load()

    # Split up the dataset into a cached and non-cached part
    cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
        dataset=dataset, cache=cache
    )

    all_preds: list[str] = list()

    if len(non_cached_dataset) > 0:
        match model.batching_preference:
            case BatchingPreference.SINGLE_SAMPLE:
                itr = tqdm(iterable=non_cached_dataset, leave=False)
            case BatchingPreference.ALL_AT_ONCE:
                itr = [non_cached_dataset[:]]
            case _:
                num_batches = len(non_cached_dataset) // benchmark_config.batch_size
                if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                    num_batches += 1
                itr = tqdm(
                    iterable=mit.batched(
                        iterable=non_cached_dataset, n=benchmark_config.batch_size
                    ),
                    total=num_batches,
                )

        # Generate the completions for the non-cached examples
        for batch in itr:
            assert isinstance(batch, dict)

            single_sample_batch = (
                "text" in batch and isinstance(batch["text"], str)
            ) or ("messages" in batch and isinstance(batch["messages"][0], dict))
            if single_sample_batch:
                batch = {key: [value] for key, value in batch.items()}

            model_output = model.generate(inputs=batch)
            extracted_labels = model.extract_labels_from_generation(
                input_batch=batch, model_output=model_output
            )

            # Extended logging if we are running in debug mode
            if benchmark_config.debug:
                debug_log(
                    batch=batch,
                    extracted_labels=extracted_labels,  # type: ignore[arg-type]
                    dataset_config=dataset_config,
                )

            cache.add_to_cache(model_inputs=batch, model_output=model_output)
            all_preds.extend(extracted_labels)

            # If we are debugging then we save the cache often, but since this makes
            # evaluation slower, we do not do this by default
            if benchmark_config.debug:
                cache.save()

        if isinstance(itr, tqdm):
            itr.close()

        # Store the cache to disk
        cache.save()

    # Fetch the cached predictions for the cached examples
    if len(cached_dataset) > 0:
        model_output = load_cached_model_outputs(
            cached_dataset=cached_dataset, cache=cache
        )
        extracted_labels = model.extract_labels_from_generation(
            input_batch=cached_dataset[:], model_output=model_output
        )
        all_preds.extend(extracted_labels)

    if "label" in non_cached_dataset.column_names:
        ground_truth = [
            label.lower() if isinstance(label, str) else label
            for label in non_cached_dataset["label"] + cached_dataset["label"]
        ]
    elif "labels" in non_cached_dataset.column_names:
        ground_truth = [
            [label.lower() if isinstance(label, str) else label for label in label_list]
            for label_list in non_cached_dataset["labels"] + cached_dataset["labels"]
        ]
    elif "target_text" in non_cached_dataset.column_names:
        ground_truth = non_cached_dataset["target_text"] + cached_dataset["target_text"]
    else:
        raise ValueError(
            "The dataset must have either a 'label', 'labels', or 'target_text' column"
        )

    itr_scores: dict[str, float] = model.compute_metrics(
        model_outputs_and_labels=(all_preds, ground_truth)
    )

    return itr_scores


def debug_log(
    batch: dict[str, t.Any],
    extracted_labels: list[dict | str | list[str]],
    dataset_config: "DatasetConfig",
) -> None:
    """Log inputs and outputs for debugging purposes.

    Args:
        batch:
            The batch of examples to evaluate on.
        extracted_labels:
            The extracted labels from the model output.
        dataset_config:
            The configuration of the dataset.
    """
    match dataset_config.task.task_group:
        case TaskGroup.TOKEN_CLASSIFICATION:
            log_msgs = [""]
            for tokens, predictions, labels in zip(
                batch["tokens"], extracted_labels, batch["labels"]
            ):
                predictions = [tag.upper() for tag in predictions]
                sample = list(zip(tokens, predictions, labels))
                log_batches = [
                    [("Tokens: ", "Predictions: ", "Labels: ")] + sample[i : i + 10]
                    for i in range(0, len(sample), 10)
                ]
                for log_batch in log_batches:
                    lengths = [len(max(triple, key=len)) for triple in log_batch]
                    log_batch = [
                        [f"{x:<{length}}" for x in triple]
                        for triple, length in zip(log_batch, lengths)
                    ]
                    tokens = [triple[0] for triple in log_batch]
                    predictions = [triple[1] for triple in log_batch]
                    labels = [triple[2] for triple in log_batch]
                    log_msgs.append(
                        "\t".join(tokens)
                        + "\n"
                        + "\t".join(predictions)
                        + "\n"
                        + "\t".join(labels)
                    )
            logger.info("\n\n".join(log_msgs))
            return

        case (
            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
        ):
            labels = [
                dataset_config.prompt_label_mapping.get(label, label).lower()
                for label in batch["label"]
            ]

        case TaskGroup.QUESTION_ANSWERING:
            extracted_labels = [
                prediction["prediction_text"]
                for prediction in extracted_labels
                if isinstance(prediction, dict)
            ]
            labels = [label["answers"]["text"][0] for label in batch["label"]]

        case TaskGroup.TEXT_TO_TEXT:
            labels = batch["target_text"]

        case _:
            raise InvalidBenchmark(
                f"The task group '{dataset_config.task.task_group}' is not supported."
            )

    if "messages" in batch:
        input_texts = [messages[-1]["content"] for messages in batch["messages"]]
    else:
        input_texts = batch["text"]

    for input_text, prediction, label in zip(input_texts, extracted_labels, labels):
        logger.info(
            f"Input: '{input_text}'\nPrediction: '{prediction}'\nLabel: '{label}'"
        )
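
As a quick illustration of the default batching branch in generate_single_iteration above, the following minimal sketch (not part of the wheel; the example list and batch size are made up) shows how more_itertools.batched chunks the non-cached examples and how the number of batches rounds up to cover a final partial batch:

# Illustrative sketch only -- not part of euroeval/generation.py.
import more_itertools as mit

examples = [f"example {i}" for i in range(10)]  # stand-in for non_cached_dataset
batch_size = 4  # stand-in for benchmark_config.batch_size

num_batches = len(examples) // batch_size
if len(examples) % batch_size != 0:
    num_batches += 1  # round up when the last batch is smaller than batch_size

batches = list(mit.batched(examples, batch_size))
assert len(batches) == num_batches == 3  # batches of 4, 4 and 2 examples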