EuroEval 15.14.0__py3-none-any.whl → 15.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_modules/litellm.py +155 -105
- euroeval/benchmark_modules/vllm.py +21 -15
- euroeval/benchmarker.py +10 -11
- euroeval/data_models.py +1 -1
- euroeval/dataset_configs/danish.py +10 -0
- euroeval/dataset_configs/dutch.py +10 -0
- euroeval/dataset_configs/finnish.py +10 -0
- euroeval/dataset_configs/french.py +10 -0
- euroeval/dataset_configs/german.py +10 -0
- euroeval/dataset_configs/italian.py +10 -0
- euroeval/dataset_configs/spanish.py +10 -0
- euroeval/dataset_configs/swedish.py +10 -0
- euroeval/finetuning.py +2 -1
- euroeval/generation.py +1 -1
- euroeval/human_evaluation.py +2 -1
- euroeval/metrics.py +22 -4
- euroeval/prompt_templates/multiple_choice.py +1 -1
- euroeval/task_group_utils/question_answering.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +8 -1
- euroeval/task_group_utils/text_to_text.py +8 -1
- euroeval/task_group_utils/token_classification.py +9 -2
- euroeval/types.py +5 -0
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/METADATA +5 -6
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/RECORD +28 -28
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/WHEEL +0 -0
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 
+# Allow long max model length in vLLM. This happens when vLLM registers that the model
+# has a shorter context length than the value we are inserting. But since we do a
+# thorough check of the model's config before setting the context length, we trust our
+# own checks and ignore the internal vLLM check.
+os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
 # Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
 # The error comes from the `aiohttp` package, and this environment variable forces the
 # use of `httpx` instead.
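The new `VLLM_ALLOW_LONG_MAX_MODEL_LEN` flag is a plain environment variable, so the usual pattern is to set it in `os.environ` before the first vLLM import, which is what the top of `euroeval/__init__.py` does. A minimal sketch of the same pattern outside EuroEval (assumes vLLM is installed):

import os

# Configure vLLM through the environment before it is imported, so the values are
# visible whenever vLLM reads them.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

import vllm  # noqa: E402  (imported only after the environment is configured)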
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep
 
 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
 RateLimitError,
 ServiceUnavailableError,
 Timeout,
+UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -87,6 +88,7 @@ logger = logging.getLogger("euroeval")
 
 VOCAB_SIZE_MAPPING = {
 # OpenAI models
+r"gpt-5-.*": 100_256,
 r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
 r"gpt-4-[0-9]{4}-preview": 100_256,
 r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +107,7 @@ VOCAB_SIZE_MAPPING = {
 
 MODEL_MAX_LENGTH_MAPPING = {
 # OpenAI models
+r"gpt-5-.*": 272_000,
 r"gpt-4(-[0-9]{4})?": 8_191,
 r"gpt-4-32k(-[0-9]{4})?": 32_767,
 r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +132,7 @@ MODEL_MAX_LENGTH_MAPPING = {
 
 NUM_PARAMS_MAPPING = {
 # OpenAI models
+r"gpt-5-.*": -1,
 r"gpt-4.*": -1,
 r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
 # Anthropic models
@@ -144,6 +148,7 @@ NUM_PARAMS_MAPPING = {
 
 ALLOWED_PARAMS = {
 # OpenAI models
+r"gpt-5-.*": ["minimal", "low", "medium", "high"],
 r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
 # Anthropic models
 r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
@@ -269,28 +274,9 @@ class LiteLLMModel(BenchmarkModule):
 generative_type=self.generative_type,
 )
 
-# Set the core generation arguments
-generation_kwargs: dict[str, t.Any] = dict(
-model=self.model_config.model_id,
-max_completion_tokens=(
-REASONING_MAX_TOKENS
-if self.generative_type == GenerativeType.REASONING
-else self.dataset_config.max_generated_tokens
-),
-stop=[],
-temperature=0.0,
-seed=4242,
-api_key=self.benchmark_config.api_key,
-api_base=self.benchmark_config.api_base,
-api_version=self.benchmark_config.api_version,
-max_retries=3,
-)
-
-# Set up the `response_format` generation argument if we are dealing with a task
-# using structured generation
+# Sanity check that "JSON" is included in the prompt, as some models require
+# this
 if self.dataset_config.task in TASKS_USING_JSON:
-# Sanity check that "JSON" is included in the prompt, as some models require
-# this
 for conversation in conversations:
 if not conversation:
 raise InvalidBenchmark(
@@ -310,87 +296,6 @@ class LiteLLMModel(BenchmarkModule):
 "Prompt must contain 'json' for JSON tasks."
 )
 
-if self.generative_type == GenerativeType.REASONING:
-log_once(
-f"The model {self.model_config.model_id!r} is a reasoning model "
-"and thus does not support structured generation, so we do not "
-"enable it.",
-level=logging.DEBUG,
-)
-elif supports_response_schema(model=self.model_config.model_id):
-ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-keys_and_their_types: dict[str, t.Any] = {
-tag_name: (conlist(str, max_length=5), ...)
-for tag_name in ner_tag_names
-}
-pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-generation_kwargs["response_format"] = pydantic_class
-log_once(
-"Enabling structured generation for model "
-f"{self.model_config.model_id!r} with the JSON schema "
-f"{pydantic_class.model_json_schema()}",
-level=logging.DEBUG,
-)
-else:
-generation_kwargs["response_format"] = dict(type="json_object")
-log_once(
-"Enabling structured JSON generation for model "
-f"{self.model_config.model_id!r} with no custom JSON schema, as "
-"the model does not support schemas.",
-level=logging.DEBUG,
-)
-
-# If the model is an Ollama reasoning model, we ensure that thinking is enabled
-if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-generation_kwargs["think"] = True
-log_once(
-"Enabling thinking mode for Ollama model "
-f"{self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-
-# Handle manually set parameters
-if self.buffer["first_label_token_mapping"]:
-generation_kwargs["logprobs"] = True
-generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-if self.model_config.revision == "thinking":
-generation_kwargs["thinking"] = dict(
-type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-)
-log_once(
-f"Enabling thinking mode for model {self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-elif self.model_config.revision == "no-thinking":
-generation_kwargs["thinking"] = dict(budget_tokens=0)
-log_once(
-f"Disabling thinking mode for model {self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-elif self.model_config.revision in {"low", "medium", "high"}:
-generation_kwargs["reasoning_effort"] = self.model_config.revision
-log_once(
-f"Enabling reasoning effort {self.model_config.revision!r} for model "
-f"{self.model_config.model_id!r}",
-level=logging.DEBUG,
-)
-
-# Drop generation kwargs that are not supported by the model
-litellm.drop_params = True
-
-# First attempt is a test run with a single conversation to handle errors
-# quickly
-test_conversation = conversations[0]
-_, failures = safe_run(
-self._generate_async(
-model_id=self.model_config.model_id,
-conversations=[test_conversation],
-**generation_kwargs,
-)
-)
-for _, error in failures:
-self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
 all_responses: dict[int, "ModelResponse"] = {}
 conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
 enumerate(conversations)
@@ -404,7 +309,7 @@ class LiteLLMModel(BenchmarkModule):
 self._generate_async(
 model_id=self.model_config.model_id,
 conversations=list(batch_conversations),
-**generation_kwargs,
+**self.get_generation_kwargs(dataset_config=self.dataset_config),
 )
 )
 
@@ -431,7 +336,12 @@ class LiteLLMModel(BenchmarkModule):
 # Attempt to handle the exceptions, to improve the chance of getting
 # successful generations next time around
 for _, error in failures:
-self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+self._handle_exception(
+error=error,
+generation_kwargs=self.get_generation_kwargs(
+dataset_config=self.dataset_config
+),
+)
 
 # Sleep for a second to avoid pinging the API server too quickly
 sleep(1)
@@ -484,6 +394,7 @@ class LiteLLMModel(BenchmarkModule):
 "`temperature` may only be set to 1",
 "'temperature' does not support 0.0 with this model. Only the default "
 "(1) value is supported",
+"Only temperature=1 is supported",
 ]
 max_items_messages = ["'maxItems' is not permitted."]
 no_json_schema_messages = ["Property keys should match pattern"]
@@ -593,6 +504,20 @@ class LiteLLMModel(BenchmarkModule):
 )
 sleep(5)
 return
+elif isinstance(error, UnsupportedParamsError):
+unsupported_param_match = re.search(
+pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+string=error.message,
+)
+if unsupported_param_match is None:
+raise InvalidModel(error.message)
+else:
+unsupported_param = unsupported_param_match.group(0)
+raise InvalidModel(
+f"The model {model_id!r} does not support the parameter "
+f"{unsupported_param!r}. Try again without this parameter. "
+"Skipping this model."
+)
 elif isinstance(error, (APIConnectionError, OSError)):
 # If there are too many I/O connections, we increase the number of allowed
 # file descriptors
@@ -1233,6 +1158,126 @@ class LiteLLMModel(BenchmarkModule):
 
 return dataset
 
+@cache
+def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+"""Get the generation arguments for the model.
+
+Args:
+dataset_config:
+The dataset configuration, which is used to determine the generative
+type of the model. We use this as an argument here rather than using
+`self.dataset_config` to ensure that that the cache is updated when the
+dataset configuration changes.
+
+Returns:
+The generation arguments for the model.
+"""
+# Set the core generation arguments
+generation_kwargs: dict[str, t.Any] = dict(
+model=self.model_config.model_id,
+max_completion_tokens=(
+REASONING_MAX_TOKENS
+if self.generative_type == GenerativeType.REASONING
+else dataset_config.max_generated_tokens
+),
+stop=[],
+temperature=0.0,
+seed=4242,
+api_key=self.benchmark_config.api_key,
+api_base=self.benchmark_config.api_base,
+api_version=self.benchmark_config.api_version,
+max_retries=3,
+)
+
+# Set up the `response_format` generation argument if we are dealing with a task
+# using structured generation
+if dataset_config.task in TASKS_USING_JSON:
+if self.generative_type == GenerativeType.REASONING:
+log_once(
+f"The model {self.model_config.model_id!r} is a reasoning model "
+"and thus does not support structured generation, so we do not "
+"enable it.",
+level=logging.DEBUG,
+)
+elif supports_response_schema(model=self.model_config.model_id):
+ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+keys_and_their_types: dict[str, t.Any] = {
+tag_name: (conlist(str, max_length=5), ...)
+for tag_name in ner_tag_names
+}
+pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+generation_kwargs["response_format"] = pydantic_class
+log_once(
+"Enabling structured generation for model "
+f"{self.model_config.model_id!r} with the JSON schema "
+f"{pydantic_class.model_json_schema()}",
+level=logging.DEBUG,
+)
+else:
+generation_kwargs["response_format"] = dict(type="json_object")
+log_once(
+"Enabling structured JSON generation for model "
+f"{self.model_config.model_id!r} with no custom JSON schema, as "
+"the model does not support schemas.",
+level=logging.DEBUG,
+)
+
+# If the model is an Ollama reasoning model, we ensure that thinking is enabled
+if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+generation_kwargs["think"] = True
+log_once(
+"Enabling thinking mode for Ollama model "
+f"{self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+
+# Handle manually set parameters
+if self.buffer["first_label_token_mapping"]:
+generation_kwargs["logprobs"] = True
+generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+if self.model_config.revision == "thinking":
+generation_kwargs["thinking"] = dict(
+type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+)
+log_once(
+f"Enabling thinking mode for model {self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+elif self.model_config.revision == "no-thinking":
+generation_kwargs["thinking"] = dict(budget_tokens=0)
+log_once(
+f"Disabling thinking mode for model {self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+generation_kwargs["reasoning_effort"] = self.model_config.revision
+log_once(
+f"Enabling reasoning effort {self.model_config.revision!r} for model "
+f"{self.model_config.model_id!r}",
+level=logging.DEBUG,
+)
+
+# First attempt is a test run with a single conversation to handle errors
+# quickly. We repeat this multiple times to deal with different types of
+# errors, and stop if we get a successful response.
+test_conversation = [
+litellm.ChatCompletionUserMessage(role="user", content="Test message")
+]
+for _ in range(5):
+_, failures = safe_run(
+self._generate_async(
+model_id=self.model_config.model_id,
+conversations=[test_conversation],
+**generation_kwargs,
+)
+)
+if not failures:
+break
+for _, error in failures:
+self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
+return generation_kwargs
+
 
 def raise_if_wrong_params(
 model_config: ModelConfig, allowed_params: dict[str, list[str]]
@@ -1264,6 +1309,11 @@ def raise_if_wrong_params(
 msg += " No parameters are allowed."
 raise InvalidModel(msg)
 return
+else:
+raise InvalidModel(
+f"The parameter {param!r} is not supported for the model "
+f"{model_config.model_id!r}."
+)
 
 
 def try_download_ollama_model(model_id: str) -> bool:
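The old inline construction of `generation_kwargs` is replaced by a `get_generation_kwargs` method decorated with `functools.cache`, so the kwargs (and the one-off test generation inside it) are only built once per dataset configuration. A minimal sketch of that caching pattern, using a hypothetical `Config`/`Model` pair rather than EuroEval's own classes:

from dataclasses import dataclass
from functools import cache
import typing as t


@dataclass(frozen=True)  # frozen makes instances hashable, which `cache` requires
class Config:
    max_generated_tokens: int


class Model:
    @cache
    def build_kwargs(self, config: Config) -> dict[str, t.Any]:
        # Only runs once per (self, config) key; later calls return the cached dict.
        print("building kwargs")
        return {"max_completion_tokens": config.max_generated_tokens}


model = Model()
model.build_kwargs(Config(256))  # prints "building kwargs"
model.build_kwargs(Config(256))  # equal config -> cache hit, nothing printed
model.build_kwargs(Config(512))  # different config -> built again

Note that `cache` keys on argument equality, which is why the real method takes the dataset config as an explicit argument instead of reading `self.dataset_config`, as its docstring explains.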
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -77,10 +77,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
 destroy_model_parallel,
 )
 from vllm.lora.request import LoRARequest
-
-if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
-from outlines.models.vllm import adapt_tokenizer
-from outlines.processors.structured import JSONLogitsProcessor
+from vllm.sampling_params import GuidedDecodingParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
 import ray
@@ -171,7 +168,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 def __del__(self) -> None:
 """Clean up the model and tokenizer."""
-clear_vllm()
+if importlib.util.find_spec("vllm") is not None:
+clear_vllm()
 if hasattr(self, "_model"):
 del self._model
 if hasattr(self, "_tokenizer"):
@@ -327,7 +325,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 if end_of_chat_token:
 stop_tokens.append(end_of_chat_token)
 
-
+structured_generation_schema = None
 if self.dataset_config.task in TASKS_USING_JSON:
 if self.generative_type == GenerativeType.REASONING:
 log_once(
@@ -342,15 +340,13 @@ class VLLMModel(HuggingFaceEncoderModel):
 tag_name: (conlist(str, max_length=5), ...)
 for tag_name in ner_tag_names
 }
-
-
-schema=pydantic_class,
-tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
-whitespace_pattern=r" ?",
+answer_format_class = create_model(
+"AnswerFormat", **keys_and_their_types
 )
+structured_generation_schema = answer_format_class.model_json_schema()
 log_once(
 "Using structured generation with the JSON schema "
-f"{
+f"{structured_generation_schema}",
 level=logging.DEBUG,
 )
@@ -374,7 +370,11 @@ class VLLMModel(HuggingFaceEncoderModel):
 logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
 temperature=0.0,
 stop=[stop_token for stop_token in stop_tokens if stop_token],
-
+guided_decoding=(
+GuidedDecodingParams(json=structured_generation_schema)
+if structured_generation_schema
+else None
+),
 )
 
 # If any of the prompts are empty then we need to replace them with a BOS token
@@ -691,8 +691,14 @@ def load_model_and_tokenizer(
 )
 dtype = torch.float16
 
-# If the model is a quantized model, we need to
-if quantization
+# If the model is a quantized model, we might need to change the dtype
+if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+logger.debug(
+"You are loading a quantized model where `torch_dtype` has not been set. "
+f"Setting dtype to {dtype!r}."
+)
+elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
 logger.info(
 "You are loading a quantized model with dtype "
 f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
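Structured generation in the vLLM backend now goes through vLLM's built-in guided decoding (`GuidedDecodingParams`) instead of the removed outlines logits processor, with the JSON schema taken from a pydantic model. A rough sketch of the same wiring outside EuroEval, assuming vLLM >= 0.10 and pydantic v2 (the model id and field names are placeholders):

from pydantic import BaseModel, conlist
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams


class AnswerFormat(BaseModel):
    # Toy NER-style schema: each tag maps to at most five extracted strings.
    person: conlist(str, max_length=5)
    location: conlist(str, max_length=5)


schema = AnswerFormat.model_json_schema()
sampling_params = SamplingParams(
    max_tokens=128,
    temperature=0.0,
    # Constrain decoding to JSON that matches the schema; use None to disable.
    guided_decoding=GuidedDecodingParams(json=schema),
)
llm = LLM(model="<model-id>")  # placeholder model id
outputs = llm.generate(["List the named entities in: ..."], sampling_params)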
euroeval/benchmarker.py
CHANGED
@@ -379,7 +379,16 @@ class Benchmarker:
 
 current_benchmark_results: list[BenchmarkResult] = list()
 for model_id in model_ids:
-
+# Load the model configuration, or skip the model if it is invalid
+try:
+model_config = get_model_config(
+model_id=model_id, benchmark_config=benchmark_config
+)
+except InvalidModel as e:
+logger.info(e.message)
+num_finished_benchmarks += len(dataset_configs)
+continue
+
 loaded_model: BenchmarkModule | None = None
 for dataset_config in dataset_configs:
 # Skip if we have already benchmarked this model on this dataset and
@@ -399,16 +408,6 @@ class Benchmarker:
 num_finished_benchmarks += 1
 continue
 
-if model_config is None:
-try:
-model_config = get_model_config(
-model_id=model_id, benchmark_config=benchmark_config
-)
-except InvalidModel as e:
-logger.info(e.message)
-num_finished_benchmarks += len(dataset_configs)
-continue
-
 # Skip if the model is an encoder model and the task is generative
 task_is_generative = (
 dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
euroeval/data_models.py
CHANGED
@@ -259,7 +259,7 @@ class BenchmarkResult(pydantic.BaseModel):
 transformers_version: str | None = get_package_version("transformers")
 torch_version: str | None = get_package_version("torch")
 vllm_version: str | None = get_package_version("vllm")
-
+xgrammar_version: str | None = get_package_version("xgrammar")
 
 @classmethod
 def from_dict(cls, config: dict) -> "BenchmarkResult":
euroeval/dataset_configs/danish.py
CHANGED

@@ -128,3 +128,13 @@ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
 languages=[DA],
 unofficial=True,
 )
+
+GOLDENSWAG_DA_CONFIG = DatasetConfig(
+name="goldenswag-da",
+pretty_name="the truncated version of the Danish common-sense reasoning "
+"dataset GoldenSwag-da, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-da-mini",
+task=COMMON_SENSE,
+languages=[DA],
+unofficial=True,
+)

euroeval/dataset_configs/dutch.py
CHANGED

@@ -120,3 +120,13 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
 languages=[NL],
 unofficial=True,
 )
+
+GOLDENSWAG_NL_CONFIG = DatasetConfig(
+name="goldenswag-nl",
+pretty_name="the truncated version of the Dutch common-sense reasoning "
+"dataset GoldenSwag-nl, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-nl-mini",
+task=COMMON_SENSE,
+languages=[NL],
+unofficial=True,
+)

euroeval/dataset_configs/finnish.py
CHANGED

@@ -78,3 +78,13 @@ MULTI_WIKI_QA_FI_CONFIG = DatasetConfig(
 languages=[FI],
 unofficial=True,
 )
+
+GOLDENSWAG_FI_CONFIG = DatasetConfig(
+name="goldenswag-fi",
+pretty_name="the truncated version of the Finnish common-sense reasoning "
+"dataset GoldenSwag-fi, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-fi-mini",
+task=COMMON_SENSE,
+languages=[FI],
+unofficial=True,
+)

euroeval/dataset_configs/french.py
CHANGED

@@ -91,3 +91,13 @@ MULTI_WIKI_QA_FR_CONFIG = DatasetConfig(
 languages=[FR],
 unofficial=True,
 )
+
+GOLDENSWAG_FR_CONFIG = DatasetConfig(
+name="goldenswag-fr",
+pretty_name="the truncated version of the French common-sense reasoning "
+"dataset GoldenSwag-fr, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-fr-mini",
+task=COMMON_SENSE,
+languages=[FR],
+unofficial=True,
+)

euroeval/dataset_configs/german.py
CHANGED

@@ -99,3 +99,13 @@ MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
 languages=[DE],
 unofficial=True,
 )
+
+GOLDENSWAG_DE_CONFIG = DatasetConfig(
+name="goldenswag-de",
+pretty_name="the truncated version of the German common-sense reasoning "
+"dataset GoldenSwag-de, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-de-mini",
+task=COMMON_SENSE,
+languages=[DE],
+unofficial=True,
+)

euroeval/dataset_configs/italian.py
CHANGED

@@ -99,3 +99,13 @@ MULTI_WIKI_QA_IT_CONFIG = DatasetConfig(
 languages=[IT],
 unofficial=True,
 )
+
+GOLDENSWAG_IT_CONFIG = DatasetConfig(
+name="goldenswag-it",
+pretty_name="the truncated version of the Italian common-sense reasoning "
+"dataset GoldenSwag-it, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-it-mini",
+task=COMMON_SENSE,
+languages=[IT],
+unofficial=True,
+)

euroeval/dataset_configs/spanish.py
CHANGED

@@ -97,3 +97,13 @@ MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
 languages=[ES],
 unofficial=True,
 )
+
+GOLDENSWAG_ES_CONFIG = DatasetConfig(
+name="goldenswag-es",
+pretty_name="the truncated version of the Spanish common-sense reasoning "
+"dataset GoldenSwag-es, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-es-mini",
+task=COMMON_SENSE,
+languages=[ES],
+unofficial=True,
+)

euroeval/dataset_configs/swedish.py
CHANGED

@@ -108,3 +108,13 @@ MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
 languages=[SV],
 unofficial=True,
 )
+
+GOLDENSWAG_SV_CONFIG = DatasetConfig(
+name="goldenswag-sv",
+pretty_name="the truncated version of the Swedish common-sense reasoning "
+"dataset GoldenSwag-sv, translated from the English GoldenSwag dataset",
+huggingface_id="EuroEval/goldenswag-sv-mini",
+task=COMMON_SENSE,
+languages=[SV],
+unofficial=True,
+)
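Each of the eight language modules gains an unofficial `goldenswag-*` config pointing at an `EuroEval/goldenswag-*-mini` dataset on the Hugging Face Hub and reusing the existing COMMON_SENSE task. A hedged usage sketch, assuming the documented `Benchmarker` API accepts the model id and dataset name as keyword arguments (the model id is a placeholder):

from euroeval import Benchmarker

benchmarker = Benchmarker()
# "goldenswag-da" is one of the new unofficial datasets in this release; the other
# languages follow the same naming scheme (goldenswag-de, goldenswag-sv, ...).
benchmarker.benchmark(model="<model-id>", dataset="goldenswag-da")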
euroeval/finetuning.py
CHANGED
@@ -3,6 +3,7 @@
 import logging
 import sys
 import typing as t
+from functools import partial
 
 import torch
 from tqdm.auto import tqdm
@@ -198,7 +199,7 @@ def finetune_single_iteration(
 args=training_args,
 train_dataset=dataset["train"],
 eval_dataset=dataset["val"],
-compute_metrics=model.compute_metrics,
+compute_metrics=partial(model.compute_metrics, dataset=None),
 callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
 data_collator=model.data_collator,
 preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
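Hugging Face's `Trainer` calls `compute_metrics` with a single `EvalPrediction` argument, so the new `dataset` parameter has to be pre-bound; `functools.partial` with `dataset=None` does exactly that. The same trick in isolation, with a hypothetical `compute_metrics`:

from functools import partial


def compute_metrics(eval_pred, dataset=None):
    # `dataset` carries optional metadata; the finetuning path passes None.
    predictions, labels = eval_pred
    return {"accuracy": sum(p == r for p, r in zip(predictions, labels)) / len(labels)}


# Trainer only supplies `eval_pred`, so `dataset` is fixed up front.
trainer_compatible = partial(compute_metrics, dataset=None)
print(trainer_compatible(([1, 0, 1], [1, 1, 1])))  # {'accuracy': 0.666...}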
euroeval/generation.py
CHANGED
@@ -235,7 +235,7 @@ def generate_single_iteration(
 )
 
 itr_scores: dict[str, float] = model.compute_metrics(
-model_outputs_and_labels=(all_preds, ground_truth)
+model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
 )
 
 return itr_scores
euroeval/human_evaluation.py
CHANGED
@@ -620,7 +620,8 @@ class HumanEvaluator:
 )
 ground_truth = self.active_dataset["label"]
 itr_scores: dict[str, float] = self.compute_metrics(
-model_outputs_and_labels=(all_preds, ground_truth)
+model_outputs_and_labels=(all_preds, ground_truth),
+dataset=self.active_dataset,
 )
 
 # We reverse the order, as the Info messages are printed in reverse order
euroeval/metrics.py
CHANGED
@@ -14,6 +14,7 @@ from .exceptions import InvalidBenchmark
 from .utils import HiddenPrints
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from evaluate import EvaluationModule
 
 logger = logging.getLogger(__name__)
@@ -49,7 +50,9 @@ class Metric(abc.ABC):
 )
 
 @abc.abstractmethod
-def __call__(
+def __call__(
+self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+) -> float | None:
 """Calculate the metric score.
 
 Args:
@@ -57,6 +60,9 @@ class Metric(abc.ABC):
 The model predictions.
 references:
 The ground truth references.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The calculated metric score, or None if the score should be ignored.
@@ -125,7 +131,9 @@ class HuggingFaceMetric(Metric):
 )
 self.metric: "EvaluationModule | None" = None
 
-def __call__(
+def __call__(
+self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+) -> float | None:
 """Calculate the metric score.
 
 Args:
@@ -133,6 +141,9 @@ class HuggingFaceMetric(Metric):
 The model predictions.
 references:
 The ground truth references.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The calculated metric score, or None if the score should be ignored.
@@ -213,7 +224,9 @@ class LLMAsAJudgeMetric(Metric):
 self.condition_formatting_fn = condition_formatting_fn
 self.system_prompt = system_prompt
 
-def __call__(
+def __call__(
+self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+) -> float | None:
 """Calculate the metric score using the judge model.
 
 Args:
@@ -221,6 +234,9 @@ class LLMAsAJudgeMetric(Metric):
 The model predictions.
 references:
 The ground truth references.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The calculated metric score, or None if the score should be ignored.
@@ -343,7 +359,9 @@ class SpeedMetric(Metric):
 postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
 )
 
-def __call__(
+def __call__(
+self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
+) -> float | None:
 """Not used with the speed metric, but required for consistency."""
 raise NotImplementedError
 
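With the new signature, every metric is called with the evaluation dataset as a third argument so that per-example metadata can feed into the score. A minimal illustration of a callable following that shape (the class name and scoring logic are made up for the example, not part of EuroEval):

import typing as t


class ExactMatchMetric:
    """Toy metric following the new three-argument call signature."""

    name = "exact_match"

    def __call__(
        self, predictions: t.Sequence, references: t.Sequence, dataset: t.Any = None
    ) -> float | None:
        # `dataset` is only needed when extra metadata informs the score; this toy
        # metric ignores it but keeps the parameter to match the interface.
        return sum(p == r for p, r in zip(predictions, references)) / len(references)


print(ExactMatchMetric()(["a", "b"], ["a", "c"], dataset=None))  # 0.5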
euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -69,7 +69,7 @@ MULTIPLE_CHOICE_TEMPLATES = {
 IT: PromptConfig(
 default_prompt_prefix="Le seguenti sono domande a scelta multipla "
 "(con relative risposte).",
-default_prompt_template="Domanda: {text}\
+default_prompt_template="Domanda: {text}\nRisposta: {label}",
 default_instruction_prompt="Domanda: {text}\n\nRispondete alla domanda "
 "precedente con {labels_str}, e nient'altro.",
 default_prompt_label_mapping="auto",
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 dataset_config: "DatasetConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -158,6 +159,9 @@ def compute_metrics(
 contains the true labels.
 dataset_config:
 The configuration of the dataset.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +185,9 @@ def compute_metrics(
 
 results: dict[str, float] = dict()
 for metric in dataset_config.task.metrics:
-score: float | None = metric(
+score: float | None = metric(
+predictions=predictions, references=labels, dataset=dataset
+)
 
 # The metric returns None if we are running on multi-GPU and the current
 # process is not the main process
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -11,6 +11,7 @@ from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from transformers.trainer_utils import EvalPrediction
 
 from ..data_models import DatasetConfig, GenerativeModelOutput
@@ -23,6 +24,7 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 dataset_config: "DatasetConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -32,6 +34,9 @@ def compute_metrics(
 contains the true labels.
 dataset_config:
 The configuration of the dataset.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -73,7 +78,9 @@ def compute_metrics(
 
 results: dict[str, float] = dict()
 for metric in dataset_config.task.metrics:
-score: float | None = metric(
+score: float | None = metric(
+predictions=predictions, references=label_ids, dataset=dataset
+)
 
 # The metric returns None if we are running on multi-GPU and the current
 # process is not the main process
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -11,6 +11,7 @@ from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from transformers.trainer_utils import EvalPrediction
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -24,6 +25,7 @@ def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 dataset_config: "DatasetConfig",
 benchmark_config: "BenchmarkConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -35,6 +37,9 @@ def compute_metrics(
 The configuration of the dataset.
 benchmark_config:
 The configuration of the benchmark.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -69,7 +74,9 @@ def compute_metrics(
 
 while True:
 try:
-score: float | None = metric(
+score: float | None = metric(
+predictions=predictions, references=labels, dataset=dataset
+)
 break
 except Exception as e:
 oom_error = [
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -12,6 +12,7 @@ from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer_utils import EvalPrediction
@@ -27,6 +28,7 @@ def compute_metrics(
 model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
 has_misc_tags: bool,
 dataset_config: "DatasetConfig",
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics needed for evaluation.
 
@@ -38,6 +40,9 @@ def compute_metrics(
 Whether the dataset has MISC tags.
 dataset_config:
 The configuration of the dataset.
+dataset:
+The dataset used for evaluation. This is only used in case any additional
+metadata is used to compute the metrics.
 
 Returns:
 A dictionary with the names of the metrics as keys and the metric values as
@@ -136,7 +141,9 @@ def compute_metrics(
 for metric in dataset_config.task.metrics
 if metric.name == "micro_f1"
 )
-micro_f1_score = metric(
+micro_f1_score = metric(
+predictions=predictions, references=list(labels), dataset=dataset
+)
 
 # Compute the metrics without MISC tags
 # We manually set the F1 metric to be 100% if both the labels and the models
@@ -158,7 +165,7 @@ def compute_metrics(
 if metric.name == "micro_f1_no_misc"
 )
 micro_f1_no_misc_score = metric(
-predictions=predictions_no_misc, references=labels_no_misc
+predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
 )
 
 # Raise error if the metrics are invalid
euroeval/types.py
CHANGED
@@ -5,6 +5,7 @@ import typing as t
 from transformers.trainer_utils import EvalPrediction
 
 if t.TYPE_CHECKING:
+from datasets.arrow_dataset import Dataset
 from numpy.typing import NDArray
 
 from .data_models import GenerativeModelOutput
@@ -25,12 +26,16 @@ class ComputeMetricsFunction(t.Protocol):
 "NDArray | list[str] | list[list[str]]",
 "NDArray | list[str] | list[list[str]]",
 ],
+dataset: "Dataset",
 ) -> dict[str, float]:
 """Compute the metrics.
 
 Args:
 model_outputs_and_labels:
 The model outputs and labels.
+dataset:
+The dataset used for evaluation. This is only used in case any
+additional metadata is used to compute the metrics.
 
 Returns:
 The computed metrics.
{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.14.0
+Version: 15.16.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -56,18 +56,16 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers>=4.55.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
-Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -235,6 +233,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
 
 
 ### Contribute to EuroEval
{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/RECORD
CHANGED

@@ -1,19 +1,19 @@
-euroeval/__init__.py,sha256=
+euroeval/__init__.py,sha256=ZZoVc6tKWz_h8Pw2n26PV-q_Gd4TM_02O235ZBRUNJw,3756
 euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=6qo0ytRnvZLxTQZvo2Fryox5DFHGrLsa0tVGquLHdTQ,48419
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
 euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
 euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=qSCNq3PV7qo--gibqEvvu4cXkEkhGGAb6UiZW8U_KiU,22031
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
-euroeval/finetuning.py,sha256=
-euroeval/generation.py,sha256=
+euroeval/finetuning.py,sha256=Wzagme1n3lSZLWX0WbKMHtSUlAZr8t8_FJvggDZf72c,11393
+euroeval/generation.py,sha256=lmvu__6w3cLxi0zBtXSlyZvV8CJpV3BdajUoIEA9ElA,11639
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
-euroeval/human_evaluation.py,sha256=
+euroeval/human_evaluation.py,sha256=FLuTl1DHxCiWB_laVVQHIH86yXvA_ZeNNSrUmyExZXI,27579
 euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
-euroeval/metrics.py,sha256=
+euroeval/metrics.py,sha256=m8nVnxUnwmIrlBfW8pkN4FCMjW3Sbg9Iq4oMZFAicEc,16227
 euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
@@ -21,43 +21,43 @@ euroeval/scores.py,sha256=TatSbjia7Zwj71gQFyV_gCHyppMbOgeaZgNCib8G86k,2849
 euroeval/speed_benchmark.py,sha256=6bFGeMmtdl_6owkxNQ3ZKiyQQS58k0NApzlsbDgBW5s,4037
 euroeval/tasks.py,sha256=btxf29M5rUP7JjBl6u9aQlHQAxrJNP4bRbdEQtDnmDA,3376
 euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
-euroeval/types.py,sha256=
+euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
 euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
 euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
 euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=ibdbOmxAO1VsuZX4uUs5MQ8pFPfqPJoleOOjAim3syY,55493
+euroeval/benchmark_modules/vllm.py,sha256=7PhfqqeRGFdzOL-RBJbrHEAMGfwrVWngF14dSeq9IpI,39072
 euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
-euroeval/dataset_configs/danish.py,sha256
-euroeval/dataset_configs/dutch.py,sha256=
+euroeval/dataset_configs/danish.py,sha256=0lDtvpgszXY1XaPjTU8yA3oNCU8W2OllvrBWgn6pkhk,4027
+euroeval/dataset_configs/dutch.py,sha256=ekZxLL9d09BUMijCxy9EFa2heNQVvySPySOjhWdtJc8,3815
 euroeval/dataset_configs/english.py,sha256=uQAaGWpHk8xqFCeIhmmPXYTb1cZomeEdRaRe9qIZQrg,2858
 euroeval/dataset_configs/faroese.py,sha256=gkgxQTWGFbfg9Eo1z-NSLROgKDcaij9tAN2mfgtrt0M,1647
-euroeval/dataset_configs/finnish.py,sha256=
-euroeval/dataset_configs/french.py,sha256=
-euroeval/dataset_configs/german.py,sha256=
+euroeval/dataset_configs/finnish.py,sha256=UZwy0_d17O2L-v2AKOu3OlDwFPcLGTZNAOt7ZKlr4K8,2679
+euroeval/dataset_configs/french.py,sha256=Hei2M4bGIz8hVtaPKQlQATcmK-0bFBNEocEszR3gia0,3014
+euroeval/dataset_configs/german.py,sha256=sRYtOl6CYf4kZkeINfff6xoKBG4OsDxb2b72lKwELGc,3192
 euroeval/dataset_configs/icelandic.py,sha256=g21IHjcwEZvf_yJ9PobeuBOqRiLOk0oCdEjY34g-UMk,4497
-euroeval/dataset_configs/italian.py,sha256=
+euroeval/dataset_configs/italian.py,sha256=4SEmdUyfGbbwMPhv_9nL3JNJtoDKHLAlWuvr7Ihmi9o,3294
 euroeval/dataset_configs/norwegian.py,sha256=-WvQM44xCwjrqBzlAy4rjf6v87fGera2JmZV_069TeQ,6003
 euroeval/dataset_configs/portuguese.py,sha256=3SqbwD0PNTILGALzh50pVoEwC-spRD75ZeE2NEj151E,2367
-euroeval/dataset_configs/spanish.py,sha256=
-euroeval/dataset_configs/swedish.py,sha256=
+euroeval/dataset_configs/spanish.py,sha256=Bm0Z19Mh2qYXR0RIRlqEkzfVb5KiqJRectfuY7JLql4,3192
+euroeval/dataset_configs/swedish.py,sha256=js4paNsuC0nQzPpf6_BzHBf7MT60XUpP1-qM2uxRtQs,3445
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
-euroeval/prompt_templates/multiple_choice.py,sha256=
+euroeval/prompt_templates/multiple_choice.py,sha256=wHnQCE5bv947L6hSK5zJitE37V-PbuNYAp156mWaIYA,5494
 euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
 euroeval/prompt_templates/reading_comprehension.py,sha256=3Nch-9zHfUDIwy-k5mP-TRhHQRQ9nad8HdhpJ1S8nGc,7072
 euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
 euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
-euroeval/task_group_utils/question_answering.py,sha256=
-euroeval/task_group_utils/sequence_classification.py,sha256=
-euroeval/task_group_utils/text_to_text.py,sha256=
-euroeval/task_group_utils/token_classification.py,sha256=
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval/task_group_utils/question_answering.py,sha256=6jpiHukzA7IrJh4vVYyZDDyvD5Xc2GsxoXzpm_PHpXw,27503
+euroeval/task_group_utils/sequence_classification.py,sha256=ihJO55f3Dy565d3ByYGMuSINasnjAADaTrM59LwZzA0,12977
+euroeval/task_group_utils/text_to_text.py,sha256=go0y6X9QAv5iywlLAclb8cqFX_3QlAT-1-VNZ9zMWFA,4832
+euroeval/task_group_utils/token_classification.py,sha256=BDqOfopdH5Bbj67HTEbZd9KZtNCDNket8NrCTfxZFzQ,17773
+euroeval-15.16.0.dist-info/METADATA,sha256=_oeIq0ZGzS0i7n51NdhNhuDX2A3_lDjYDD-6KgB1rW0,13536
+euroeval-15.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.16.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.16.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-15.16.0.dist-info/RECORD,,
{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/WHEEL
File without changes

{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/entry_points.txt
File without changes

{euroeval-15.14.0.dist-info → euroeval-15.16.0.dist-info}/licenses/LICENSE
File without changes