EuroEval 15.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +72 -0
- euroeval/benchmark_config_factory.py +358 -0
- euroeval/benchmark_modules/__init__.py +7 -0
- euroeval/benchmark_modules/base.py +354 -0
- euroeval/benchmark_modules/fresh.py +286 -0
- euroeval/benchmark_modules/hf.py +1185 -0
- euroeval/benchmark_modules/litellm.py +905 -0
- euroeval/benchmark_modules/vllm.py +1171 -0
- euroeval/benchmarker.py +1074 -0
- euroeval/callbacks.py +72 -0
- euroeval/cli.py +281 -0
- euroeval/constants.py +50 -0
- euroeval/data_loading.py +96 -0
- euroeval/data_models.py +474 -0
- euroeval/dataset_configs.py +2001 -0
- euroeval/enums.py +144 -0
- euroeval/exceptions.py +191 -0
- euroeval/finetuning.py +324 -0
- euroeval/generation.py +296 -0
- euroeval/human_evaluation.py +737 -0
- euroeval/languages.py +200 -0
- euroeval/model_cache.py +253 -0
- euroeval/model_config.py +77 -0
- euroeval/model_loading.py +78 -0
- euroeval/scores.py +90 -0
- euroeval/speed_benchmark.py +124 -0
- euroeval/task_utils/__init__.py +1 -0
- euroeval/task_utils/multiple_choice_classification.py +176 -0
- euroeval/task_utils/question_answering.py +698 -0
- euroeval/task_utils/sequence_classification.py +237 -0
- euroeval/task_utils/text_to_text.py +150 -0
- euroeval/task_utils/token_classification.py +464 -0
- euroeval/tasks.py +202 -0
- euroeval/types.py +97 -0
- euroeval/utils.py +574 -0
- euroeval-15.2.0.dist-info/METADATA +234 -0
- euroeval-15.2.0.dist-info/RECORD +40 -0
- euroeval-15.2.0.dist-info/WHEEL +4 -0
- euroeval-15.2.0.dist-info/entry_points.txt +4 -0
- euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/generation.py
ADDED
@@ -0,0 +1,296 @@
"""Functions related to text generation of models."""

import logging
import sys
import typing as t
from pathlib import Path

import more_itertools as mit
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm

from .benchmark_modules import BenchmarkModule
from .enums import BatchingPreference, TaskGroup
from .exceptions import InvalidBenchmark
from .model_cache import (
    ModelCache,
    load_cached_model_outputs,
    split_dataset_into_cached_and_non_cached,
)
from .utils import clear_memory

if t.TYPE_CHECKING:
    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

logger = logging.getLogger("euroeval")


def generate(
    model: "BenchmarkModule",
    datasets: list[DatasetDict],
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> list[dict[str, float]]:
    """Evaluate a model on a dataset through generation.

    Args:
        model:
            The model to evaluate.
        datasets:
            The datasets to evaluate on.
        model_config:
            The configuration of the model.
        benchmark_config:
            The configuration of the benchmark.
        dataset_config:
            The configuration of the dataset.

    Returns:
        A list of dictionaries containing the test scores.
    """
    # Set up the name of the model output cache. If we are testing then we save the
    # model outputs to a different cache and ensure that that cache is deleted before
    # the next test, to ensure that the tests are independent of each other
    if benchmark_config.debug:
        model_cache_dir = Path.cwd()
    else:
        model_cache_dir = Path(model_config.model_cache_dir)
    if hasattr(sys, "_called_from_test"):
        cache_name = f"{dataset_config.name}-model-outputs-test.json"
        (model_cache_dir / cache_name).unlink(missing_ok=True)
    elif benchmark_config.debug:
        cache_name = f"{model_config.model_id}-{dataset_config.name}-model-outputs.json"
    else:
        cache_name = f"{dataset_config.name}-model-outputs.json"

    cache = ModelCache(
        model_cache_dir=model_cache_dir,
        cache_name=cache_name,
        max_generated_tokens=dataset_config.max_generated_tokens,
    )

    scores: list[dict[str, float]] = list()
    for idx in tqdm(
        iterable=range(benchmark_config.num_iterations),
        desc="Benchmarking",
        disable=not benchmark_config.progress_bar,
    ):
        test_scores = generate_single_iteration(
            model=model,
            dataset=datasets[idx]["test"],
            cache=cache,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
        )

        logger.debug(f"Test scores for iteration {idx}: {test_scores}")
        scores.append(test_scores)
        clear_memory()

    if not benchmark_config.debug:
        cache.remove()

    return scores


def generate_single_iteration(
    dataset: Dataset,
    model: "BenchmarkModule",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
    cache: ModelCache,
) -> dict[str, float]:
    """Evaluate a model on a dataset in a single iteration through generation.

    Args:
        dataset:
            The dataset to evaluate on.
        model:
            The model to evaluate.
        dataset_config:
            The configuration of the dataset.
        benchmark_config:
            The configuration of the benchmark.
        cache:
            The model output cache.

    Returns:
        A dictionary containing the scores for each metric.
    """
    cache.load()

    # Split up the dataset into a cached and non-cached part
    cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
        dataset=dataset, cache=cache
    )

    all_preds: list[str] = list()

    if len(non_cached_dataset) > 0:
        match model.batching_preference:
            case BatchingPreference.SINGLE_SAMPLE:
                itr = tqdm(iterable=non_cached_dataset, leave=False)
            case BatchingPreference.ALL_AT_ONCE:
                itr = [non_cached_dataset[:]]
            case _:
                num_batches = len(non_cached_dataset) // benchmark_config.batch_size
                if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                    num_batches += 1
                itr = tqdm(
                    iterable=mit.batched(
                        iterable=non_cached_dataset, n=benchmark_config.batch_size
                    ),
                    total=len(non_cached_dataset) // benchmark_config.batch_size,
                )

        # Generate the completions for the non-cached examples
        for batch in itr:
            assert isinstance(batch, dict)

            single_sample_batch = (
                "text" in batch and isinstance(batch["text"], str)
            ) or ("messages" in batch and isinstance(batch["messages"][0], dict))
            if single_sample_batch:
                batch = {key: [value] for key, value in batch.items()}

            model_output = model.generate(inputs=batch)
            extracted_labels = model.extract_labels_from_generation(
                input_batch=batch, model_output=model_output
            )

            # Extended logging if we are running in debug mode
            if benchmark_config.debug:
                debug_log(
                    batch=batch,
                    extracted_labels=extracted_labels,  # type: ignore[arg-type]
                    dataset_config=dataset_config,
                )

            cache.add_to_cache(model_inputs=batch, model_output=model_output)
            all_preds.extend(extracted_labels)

            # If we are debugging then we save the cache often, but since this makes
            # evaluation slower, we do not do this by default
            if benchmark_config.debug:
                cache.save()

        if isinstance(itr, tqdm):
            itr.close()

        # Store the cache to disk
        cache.save()

    # Fetch the cached predictions for the cached examples
    if len(cached_dataset) > 0:
        model_output = load_cached_model_outputs(
            cached_dataset=cached_dataset, cache=cache
        )
        extracted_labels = model.extract_labels_from_generation(
            input_batch=cached_dataset[:], model_output=model_output
        )
        all_preds.extend(extracted_labels)

    if "label" in non_cached_dataset.column_names:
        ground_truth = [
            label.lower() if isinstance(label, str) else label
            for label in non_cached_dataset["label"] + cached_dataset["label"]
        ]
    elif "labels" in non_cached_dataset.column_names:
        ground_truth = [
            [label.lower() if isinstance(label, str) else label for label in label_list]
            for label_list in non_cached_dataset["labels"] + cached_dataset["labels"]
        ]
    elif "target_text" in non_cached_dataset.column_names:
        ground_truth = non_cached_dataset["target_text"] + cached_dataset["target_text"]
    else:
        raise ValueError(
            "The dataset must have either a 'label', 'labels', or 'target_text' column"
        )

    itr_scores: dict[str, float] = model.compute_metrics(
        model_outputs_and_labels=(all_preds, ground_truth)
    )

    return itr_scores


def debug_log(
    batch: dict[str, t.Any],
    extracted_labels: list[dict | str | list[str]],
    dataset_config: "DatasetConfig",
) -> None:
    """Log inputs and outputs for debugging purposes.

    Args:
        batch:
            The batch of examples to evaluate on.
        extracted_labels:
            The extracted labels from the model output.
        dataset_config:
            The configuration of the dataset.
    """
    match dataset_config.task.task_group:
        case TaskGroup.TOKEN_CLASSIFICATION:
            log_msgs = [""]
            for tokens, predictions, labels in zip(
                batch["tokens"], extracted_labels, batch["labels"]
            ):
                predictions = [tag.upper() for tag in predictions]
                sample = list(zip(tokens, predictions, labels))
                log_batches = [
                    [("Tokens: ", "Predictions: ", "Labels: ")] + sample[i : i + 10]
                    for i in range(0, len(sample), 10)
                ]
                for log_batch in log_batches:
                    lengths = [len(max(triple, key=len)) for triple in log_batch]
                    log_batch = [
                        [f"{x:<{length}}" for x in triple]
                        for triple, length in zip(log_batch, lengths)
                    ]
                    tokens = [triple[0] for triple in log_batch]
                    predictions = [triple[1] for triple in log_batch]
                    labels = [triple[2] for triple in log_batch]
                    log_msgs.append(
                        "\t".join(tokens)
                        + "\n"
                        + "\t".join(predictions)
                        + "\n"
                        + "\t".join(labels)
                    )
            logger.info("\n\n".join(log_msgs))
            return

        case (
            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
        ):
            labels = [
                dataset_config.prompt_label_mapping.get(label, label).lower()
                for label in batch["label"]
            ]

        case TaskGroup.QUESTION_ANSWERING:
            extracted_labels = [
                prediction["prediction_text"]
                for prediction in extracted_labels
                if isinstance(prediction, dict)
            ]
            labels = [label["answers"]["text"][0] for label in batch["label"]]

        case TaskGroup.TEXT_TO_TEXT:
            labels = batch["target_text"]

        case _:
            raise InvalidBenchmark(
                f"The task group '{dataset_config.task.task_group}' is not supported."
            )

    if "messages" in batch:
        input_texts = [messages[-1]["content"] for messages in batch["messages"]]
    else:
        input_texts = batch["text"]

    for input_text, prediction, label in zip(input_texts, extracted_labels, labels):
        logger.info(
            f"Input: '{input_text}'\nPrediction: '{prediction}'\nLabel: '{label}'"
        )