EuroEval 15.14.0-py3-none-any.whl → 15.16.0-py3-none-any.whl

This diff shows the changes between the two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

This version of EuroEval has been flagged as potentially problematic.

euroeval/__init__.py CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
  os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


+ # Allow long max model length in vLLM. This happens when vLLM registers that the model
+ # has a shorter context length than the value we are inserting. But since we do a
+ # thorough check of the model's config before setting the context length, we trust our
+ # own checks and ignore the internal vLLM check.
+ os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
  # Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
  # The error comes from the `aiohttp` package, and this environment variable forces the
  # use of `httpx` instead.
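Illustrative sketch (not part of the published diff): the flag above is read from the environment by vLLM itself, so it has to be set before the engine is constructed; the model id in the commented usage below is a placeholder.

    import os

    # Setting the flag before vLLM builds its engine relaxes vLLM's own check
    # that max_model_len does not exceed the context length derived from the
    # model's config; EuroEval performs its own check instead.
    os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

    # Hypothetical usage (requires vLLM and a GPU); the model id is a placeholder:
    # from vllm import LLM
    # llm = LLM(model="org/model", max_model_len=16_384)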
euroeval/benchmark_modules/litellm.py CHANGED
@@ -6,7 +6,7 @@ import logging
  import os
  import re
  import typing as t
- from functools import cached_property, partial
+ from functools import cache, cached_property, partial
  from time import sleep

  import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
  RateLimitError,
  ServiceUnavailableError,
  Timeout,
+ UnsupportedParamsError,
  )
  from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.router import Router
@@ -87,6 +88,7 @@ logger = logging.getLogger("euroeval")

  VOCAB_SIZE_MAPPING = {
  # OpenAI models
+ r"gpt-5-.*": 100_256,
  r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
  r"gpt-4-[0-9]{4}-preview": 100_256,
  r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +107,7 @@ VOCAB_SIZE_MAPPING = {

  MODEL_MAX_LENGTH_MAPPING = {
  # OpenAI models
+ r"gpt-5-.*": 272_000,
  r"gpt-4(-[0-9]{4})?": 8_191,
  r"gpt-4-32k(-[0-9]{4})?": 32_767,
  r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +132,7 @@ MODEL_MAX_LENGTH_MAPPING = {

  NUM_PARAMS_MAPPING = {
  # OpenAI models
+ r"gpt-5-.*": -1,
  r"gpt-4.*": -1,
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
  # Anthropic models
@@ -144,6 +148,7 @@ NUM_PARAMS_MAPPING = {

  ALLOWED_PARAMS = {
  # OpenAI models
+ r"gpt-5-.*": ["minimal", "low", "medium", "high"],
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
  # Anthropic models
  r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
@@ -269,28 +274,9 @@ class LiteLLMModel(BenchmarkModule):
  generative_type=self.generative_type,
  )

- # Set the core generation arguments
- generation_kwargs: dict[str, t.Any] = dict(
- model=self.model_config.model_id,
- max_completion_tokens=(
- REASONING_MAX_TOKENS
- if self.generative_type == GenerativeType.REASONING
- else self.dataset_config.max_generated_tokens
- ),
- stop=[],
- temperature=0.0,
- seed=4242,
- api_key=self.benchmark_config.api_key,
- api_base=self.benchmark_config.api_base,
- api_version=self.benchmark_config.api_version,
- max_retries=3,
- )
-
- # Set up the `response_format` generation argument if we are dealing with a task
- # using structured generation
+ # Sanity check that "JSON" is included in the prompt, as some models require
+ # this
  if self.dataset_config.task in TASKS_USING_JSON:
- # Sanity check that "JSON" is included in the prompt, as some models require
- # this
  for conversation in conversations:
  if not conversation:
  raise InvalidBenchmark(
@@ -310,87 +296,6 @@ class LiteLLMModel(BenchmarkModule):
  "Prompt must contain 'json' for JSON tasks."
  )

- if self.generative_type == GenerativeType.REASONING:
- log_once(
- f"The model {self.model_config.model_id!r} is a reasoning model "
- "and thus does not support structured generation, so we do not "
- "enable it.",
- level=logging.DEBUG,
- )
- elif supports_response_schema(model=self.model_config.model_id):
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- generation_kwargs["response_format"] = pydantic_class
- log_once(
- "Enabling structured generation for model "
- f"{self.model_config.model_id!r} with the JSON schema "
- f"{pydantic_class.model_json_schema()}",
- level=logging.DEBUG,
- )
- else:
- generation_kwargs["response_format"] = dict(type="json_object")
- log_once(
- "Enabling structured JSON generation for model "
- f"{self.model_config.model_id!r} with no custom JSON schema, as "
- "the model does not support schemas.",
- level=logging.DEBUG,
- )
-
- # If the model is an Ollama reasoning model, we ensure that thinking is enabled
- if self.is_ollama and self.generative_type == GenerativeType.REASONING:
- generation_kwargs["think"] = True
- log_once(
- "Enabling thinking mode for Ollama model "
- f"{self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
-
- # Handle manually set parameters
- if self.buffer["first_label_token_mapping"]:
- generation_kwargs["logprobs"] = True
- generation_kwargs["top_logprobs"] = MAX_LOGPROBS
- if self.model_config.revision == "thinking":
- generation_kwargs["thinking"] = dict(
- type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
- )
- log_once(
- f"Enabling thinking mode for model {self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
- elif self.model_config.revision == "no-thinking":
- generation_kwargs["thinking"] = dict(budget_tokens=0)
- log_once(
- f"Disabling thinking mode for model {self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
- elif self.model_config.revision in {"low", "medium", "high"}:
- generation_kwargs["reasoning_effort"] = self.model_config.revision
- log_once(
- f"Enabling reasoning effort {self.model_config.revision!r} for model "
- f"{self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
-
- # Drop generation kwargs that are not supported by the model
- litellm.drop_params = True
-
- # First attempt is a test run with a single conversation to handle errors
- # quickly
- test_conversation = conversations[0]
- _, failures = safe_run(
- self._generate_async(
- model_id=self.model_config.model_id,
- conversations=[test_conversation],
- **generation_kwargs,
- )
- )
- for _, error in failures:
- self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
  all_responses: dict[int, "ModelResponse"] = {}
  conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
  enumerate(conversations)
@@ -404,7 +309,7 @@ class LiteLLMModel(BenchmarkModule):
  self._generate_async(
  model_id=self.model_config.model_id,
  conversations=list(batch_conversations),
- **generation_kwargs,
+ **self.get_generation_kwargs(dataset_config=self.dataset_config),
  )
  )

@@ -431,7 +336,12 @@ class LiteLLMModel(BenchmarkModule):
  # Attempt to handle the exceptions, to improve the chance of getting
  # successful generations next time around
  for _, error in failures:
- self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+ self._handle_exception(
+ error=error,
+ generation_kwargs=self.get_generation_kwargs(
+ dataset_config=self.dataset_config
+ ),
+ )

  # Sleep for a second to avoid pinging the API server too quickly
  sleep(1)
@@ -484,6 +394,7 @@ class LiteLLMModel(BenchmarkModule):
  "`temperature` may only be set to 1",
  "'temperature' does not support 0.0 with this model. Only the default "
  "(1) value is supported",
+ "Only temperature=1 is supported",
  ]
  max_items_messages = ["'maxItems' is not permitted."]
  no_json_schema_messages = ["Property keys should match pattern"]
@@ -593,6 +504,20 @@ class LiteLLMModel(BenchmarkModule):
  )
  sleep(5)
  return
+ elif isinstance(error, UnsupportedParamsError):
+ unsupported_param_match = re.search(
+ pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+ string=error.message,
+ )
+ if unsupported_param_match is None:
+ raise InvalidModel(error.message)
+ else:
+ unsupported_param = unsupported_param_match.group(0)
+ raise InvalidModel(
+ f"The model {model_id!r} does not support the parameter "
+ f"{unsupported_param!r}. Try again without this parameter. "
+ "Skipping this model."
+ )
  elif isinstance(error, (APIConnectionError, OSError)):
  # If there are too many I/O connections, we increase the number of allowed
  # file descriptors
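Illustrative sketch (not part of the published diff): the lookbehind/lookahead pattern in the hunk above pulls the parameter name out of the error message. The message text below is an assumed example of the format, not an actual LiteLLM message.

    import re

    # Example message (assumed format); only the quoted parameter name is captured.
    message = "some-provider/some-model does not support parameters: ['response_format'], dropping."
    match = re.search(
        pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
        string=message,
    )
    print(match.group(0) if match else None)  # -> response_format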
@@ -1233,6 +1158,126 @@ class LiteLLMModel(BenchmarkModule):

  return dataset

+ @cache
+ def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+ """Get the generation arguments for the model.
+
+ Args:
+ dataset_config:
+ The dataset configuration, which is used to determine the generative
+ type of the model. We use this as an argument here rather than using
+ `self.dataset_config` to ensure that that the cache is updated when the
+ dataset configuration changes.
+
+ Returns:
+ The generation arguments for the model.
+ """
+ # Set the core generation arguments
+ generation_kwargs: dict[str, t.Any] = dict(
+ model=self.model_config.model_id,
+ max_completion_tokens=(
+ REASONING_MAX_TOKENS
+ if self.generative_type == GenerativeType.REASONING
+ else dataset_config.max_generated_tokens
+ ),
+ stop=[],
+ temperature=0.0,
+ seed=4242,
+ api_key=self.benchmark_config.api_key,
+ api_base=self.benchmark_config.api_base,
+ api_version=self.benchmark_config.api_version,
+ max_retries=3,
+ )
+
+ # Set up the `response_format` generation argument if we are dealing with a task
+ # using structured generation
+ if dataset_config.task in TASKS_USING_JSON:
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ elif supports_response_schema(model=self.model_config.model_id):
+ ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ generation_kwargs["response_format"] = pydantic_class
+ log_once(
+ "Enabling structured generation for model "
+ f"{self.model_config.model_id!r} with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )
+ else:
+ generation_kwargs["response_format"] = dict(type="json_object")
+ log_once(
+ "Enabling structured JSON generation for model "
+ f"{self.model_config.model_id!r} with no custom JSON schema, as "
+ "the model does not support schemas.",
+ level=logging.DEBUG,
+ )
+
+ # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+ if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+ generation_kwargs["think"] = True
+ log_once(
+ "Enabling thinking mode for Ollama model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+
+ # Handle manually set parameters
+ if self.buffer["first_label_token_mapping"]:
+ generation_kwargs["logprobs"] = True
+ generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+ if self.model_config.revision == "thinking":
+ generation_kwargs["thinking"] = dict(
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+ )
+ log_once(
+ f"Enabling thinking mode for model {self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+ elif self.model_config.revision == "no-thinking":
+ generation_kwargs["thinking"] = dict(budget_tokens=0)
+ log_once(
+ f"Disabling thinking mode for model {self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+ elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+ generation_kwargs["reasoning_effort"] = self.model_config.revision
+ log_once(
+ f"Enabling reasoning effort {self.model_config.revision!r} for model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+
+ # First attempt is a test run with a single conversation to handle errors
+ # quickly. We repeat this multiple times to deal with different types of
+ # errors, and stop if we get a successful response.
+ test_conversation = [
+ litellm.ChatCompletionUserMessage(role="user", content="Test message")
+ ]
+ for _ in range(5):
+ _, failures = safe_run(
+ self._generate_async(
+ model_id=self.model_config.model_id,
+ conversations=[test_conversation],
+ **generation_kwargs,
+ )
+ )
+ if not failures:
+ break
+ for _, error in failures:
+ self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
+ return generation_kwargs
+

  def raise_if_wrong_params(
  model_config: ModelConfig, allowed_params: dict[str, list[str]]
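Illustrative sketch (not part of the published diff): `functools.cache` keys the cached result on the call arguments, which is why the dataset configuration is passed explicitly rather than read from `self.dataset_config`. The example below uses a plain string in place of a DatasetConfig; the arguments must be hashable.

    from functools import cache

    class Model:
        @cache
        def get_generation_kwargs(self, dataset_config: str) -> dict:
            print(f"computing kwargs for {dataset_config}")
            return {"max_completion_tokens": 256}

    model = Model()
    model.get_generation_kwargs("angry-tweets")  # computed and cached
    model.get_generation_kwargs("angry-tweets")  # served from the cache
    model.get_generation_kwargs("scala-da")      # new argument, computed again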
@@ -1264,6 +1309,11 @@ def raise_if_wrong_params(
  msg += " No parameters are allowed."
  raise InvalidModel(msg)
  return
+ else:
+ raise InvalidModel(
+ f"The parameter {param!r} is not supported for the model "
+ f"{model_config.model_id!r}."
+ )


  def try_download_ollama_model(model_id: str) -> bool:
euroeval/benchmark_modules/vllm.py CHANGED
@@ -77,10 +77,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
  destroy_model_parallel,
  )
  from vllm.lora.request import LoRARequest
-
- if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
- from outlines.models.vllm import adapt_tokenizer
- from outlines.processors.structured import JSONLogitsProcessor
+ from vllm.sampling_params import GuidedDecodingParams

  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
  import ray
@@ -171,7 +168,8 @@ class VLLMModel(HuggingFaceEncoderModel):

  def __del__(self) -> None:
  """Clean up the model and tokenizer."""
- clear_vllm()
+ if importlib.util.find_spec("vllm") is not None:
+ clear_vllm()
  if hasattr(self, "_model"):
  del self._model
  if hasattr(self, "_tokenizer"):
@@ -327,7 +325,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

- logits_processor = None
+ structured_generation_schema = None
  if self.dataset_config.task in TASKS_USING_JSON:
  if self.generative_type == GenerativeType.REASONING:
  log_once(
@@ -342,15 +340,13 @@ class VLLMModel(HuggingFaceEncoderModel):
  tag_name: (conlist(str, max_length=5), ...)
  for tag_name in ner_tag_names
  }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
+ answer_format_class = create_model(
+ "AnswerFormat", **keys_and_their_types
  )
+ structured_generation_schema = answer_format_class.model_json_schema()
  log_once(
  "Using structured generation with the JSON schema "
- f"{pydantic_class.model_json_schema()}",
+ f"{structured_generation_schema}",
  level=logging.DEBUG,
  )

@@ -374,7 +370,11 @@ class VLLMModel(HuggingFaceEncoderModel):
  logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
  temperature=0.0,
  stop=[stop_token for stop_token in stop_tokens if stop_token],
- logits_processors=[logits_processor] if logits_processor else None,
+ guided_decoding=(
+ GuidedDecodingParams(json=structured_generation_schema)
+ if structured_generation_schema
+ else None
+ ),
  )

  # If any of the prompts are empty then we need to replace them with a BOS token
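Illustrative sketch (not part of the published diff): the schema handed to vLLM's guided decoding is built from a dynamically created pydantic model, one list-of-strings field per NER tag. The tag names below are made up, and the commented part only indicates roughly how the schema would be passed on when vLLM is installed.

    from pydantic import conlist, create_model

    ner_tag_names = ["person", "location"]  # illustrative tag names
    keys_and_their_types = {
        tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)
    schema = AnswerFormat.model_json_schema()
    print(sorted(schema["properties"]))  # ['location', 'person']

    # With vLLM installed, the schema is used roughly as:
    # from vllm import SamplingParams
    # from vllm.sampling_params import GuidedDecodingParams
    # sampling_params = SamplingParams(guided_decoding=GuidedDecodingParams(json=schema))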
@@ -691,8 +691,14 @@ def load_model_and_tokenizer(
  )
  dtype = torch.float16

- # If the model is a quantized model, we need to set the dtype to float16
- if quantization is not None and hf_model_config.torch_dtype != torch.float16:
+ # If the model is a quantized model, we might need to change the dtype
+ if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+ dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+ logger.debug(
+ "You are loading a quantized model where `torch_dtype` has not been set. "
+ f"Setting dtype to {dtype!r}."
+ )
+ elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
  logger.info(
  "You are loading a quantized model with dtype "
  f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
euroeval/benchmarker.py CHANGED
@@ -379,7 +379,16 @@ class Benchmarker:

  current_benchmark_results: list[BenchmarkResult] = list()
  for model_id in model_ids:
- model_config: ModelConfig | None = None
+ # Load the model configuration, or skip the model if it is invalid
+ try:
+ model_config = get_model_config(
+ model_id=model_id, benchmark_config=benchmark_config
+ )
+ except InvalidModel as e:
+ logger.info(e.message)
+ num_finished_benchmarks += len(dataset_configs)
+ continue
+
  loaded_model: BenchmarkModule | None = None
  for dataset_config in dataset_configs:
  # Skip if we have already benchmarked this model on this dataset and
@@ -399,16 +408,6 @@ class Benchmarker:
  num_finished_benchmarks += 1
  continue

- if model_config is None:
- try:
- model_config = get_model_config(
- model_id=model_id, benchmark_config=benchmark_config
- )
- except InvalidModel as e:
- logger.info(e.message)
- num_finished_benchmarks += len(dataset_configs)
- continue
-
  # Skip if the model is an encoder model and the task is generative
  task_is_generative = (
  dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
euroeval/data_models.py CHANGED
@@ -259,7 +259,7 @@ class BenchmarkResult(pydantic.BaseModel):
  transformers_version: str | None = get_package_version("transformers")
  torch_version: str | None = get_package_version("torch")
  vllm_version: str | None = get_package_version("vllm")
- outlines_version: str | None = get_package_version("outlines")
+ xgrammar_version: str | None = get_package_version("xgrammar")

  @classmethod
  def from_dict(cls, config: dict) -> "BenchmarkResult":
euroeval/dataset_configs/danish.py CHANGED
@@ -128,3 +128,13 @@ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
  languages=[DA],
  unofficial=True,
  )
+
+ GOLDENSWAG_DA_CONFIG = DatasetConfig(
+ name="goldenswag-da",
+ pretty_name="the truncated version of the Danish common-sense reasoning "
+ "dataset GoldenSwag-da, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-da-mini",
+ task=COMMON_SENSE,
+ languages=[DA],
+ unofficial=True,
+ )
euroeval/dataset_configs/dutch.py CHANGED
@@ -120,3 +120,13 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
  languages=[NL],
  unofficial=True,
  )
+
+ GOLDENSWAG_NL_CONFIG = DatasetConfig(
+ name="goldenswag-nl",
+ pretty_name="the truncated version of the Dutch common-sense reasoning "
+ "dataset GoldenSwag-nl, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-nl-mini",
+ task=COMMON_SENSE,
+ languages=[NL],
+ unofficial=True,
+ )
euroeval/dataset_configs/finnish.py CHANGED
@@ -78,3 +78,13 @@ MULTI_WIKI_QA_FI_CONFIG = DatasetConfig(
  languages=[FI],
  unofficial=True,
  )
+
+ GOLDENSWAG_FI_CONFIG = DatasetConfig(
+ name="goldenswag-fi",
+ pretty_name="the truncated version of the Finnish common-sense reasoning "
+ "dataset GoldenSwag-fi, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-fi-mini",
+ task=COMMON_SENSE,
+ languages=[FI],
+ unofficial=True,
+ )
euroeval/dataset_configs/french.py CHANGED
@@ -91,3 +91,13 @@ MULTI_WIKI_QA_FR_CONFIG = DatasetConfig(
  languages=[FR],
  unofficial=True,
  )
+
+ GOLDENSWAG_FR_CONFIG = DatasetConfig(
+ name="goldenswag-fr",
+ pretty_name="the truncated version of the French common-sense reasoning "
+ "dataset GoldenSwag-fr, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-fr-mini",
+ task=COMMON_SENSE,
+ languages=[FR],
+ unofficial=True,
+ )
euroeval/dataset_configs/german.py CHANGED
@@ -99,3 +99,13 @@ MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
  languages=[DE],
  unofficial=True,
  )
+
+ GOLDENSWAG_DE_CONFIG = DatasetConfig(
+ name="goldenswag-de",
+ pretty_name="the truncated version of the German common-sense reasoning "
+ "dataset GoldenSwag-de, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-de-mini",
+ task=COMMON_SENSE,
+ languages=[DE],
+ unofficial=True,
+ )
euroeval/dataset_configs/italian.py CHANGED
@@ -99,3 +99,13 @@ MULTI_WIKI_QA_IT_CONFIG = DatasetConfig(
  languages=[IT],
  unofficial=True,
  )
+
+ GOLDENSWAG_IT_CONFIG = DatasetConfig(
+ name="goldenswag-it",
+ pretty_name="the truncated version of the Italian common-sense reasoning "
+ "dataset GoldenSwag-it, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-it-mini",
+ task=COMMON_SENSE,
+ languages=[IT],
+ unofficial=True,
+ )
euroeval/dataset_configs/spanish.py CHANGED
@@ -97,3 +97,13 @@ MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
  languages=[ES],
  unofficial=True,
  )
+
+ GOLDENSWAG_ES_CONFIG = DatasetConfig(
+ name="goldenswag-es",
+ pretty_name="the truncated version of the Spanish common-sense reasoning "
+ "dataset GoldenSwag-es, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-es-mini",
+ task=COMMON_SENSE,
+ languages=[ES],
+ unofficial=True,
+ )
euroeval/dataset_configs/swedish.py CHANGED
@@ -108,3 +108,13 @@ MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
  languages=[SV],
  unofficial=True,
  )
+
+ GOLDENSWAG_SV_CONFIG = DatasetConfig(
+ name="goldenswag-sv",
+ pretty_name="the truncated version of the Swedish common-sense reasoning "
+ "dataset GoldenSwag-sv, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-sv-mini",
+ task=COMMON_SENSE,
+ languages=[SV],
+ unofficial=True,
+ )
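Illustrative sketch (not part of the published diff): each new config points at a "-mini" dataset on the Hugging Face Hub, so the underlying data can be inspected directly with the `datasets` library. Requires network access; no assumption is made here about the split layout.

    from datasets import load_dataset

    # Load the Danish GoldenSwag mini dataset referenced by GOLDENSWAG_DA_CONFIG.
    goldenswag_da = load_dataset("EuroEval/goldenswag-da-mini")
    print(goldenswag_da)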
euroeval/finetuning.py CHANGED
@@ -3,6 +3,7 @@
  import logging
  import sys
  import typing as t
+ from functools import partial

  import torch
  from tqdm.auto import tqdm
@@ -198,7 +199,7 @@ def finetune_single_iteration(
  args=training_args,
  train_dataset=dataset["train"],
  eval_dataset=dataset["val"],
- compute_metrics=model.compute_metrics,
+ compute_metrics=partial(model.compute_metrics, dataset=None),
  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
  data_collator=model.data_collator,
  preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
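Illustrative sketch (not part of the published diff): `functools.partial` pre-binds the new `dataset` argument so the Trainer can keep calling `compute_metrics` with a single prediction object, as before. The function below is a stand-in, not the package's implementation.

    from functools import partial

    def compute_metrics(model_outputs_and_labels, dataset=None):
        # Stand-in: a real implementation would compute scores from the outputs.
        return {"has_dataset": dataset is not None}

    trainer_compute_metrics = partial(compute_metrics, dataset=None)
    print(trainer_compute_metrics("eval-prediction-placeholder"))  # {'has_dataset': False}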
euroeval/generation.py CHANGED
@@ -235,7 +235,7 @@ def generate_single_iteration(
  )

  itr_scores: dict[str, float] = model.compute_metrics(
- model_outputs_and_labels=(all_preds, ground_truth)
+ model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
  )

  return itr_scores
@@ -620,7 +620,8 @@ class HumanEvaluator:
  )
  ground_truth = self.active_dataset["label"]
  itr_scores: dict[str, float] = self.compute_metrics(
- model_outputs_and_labels=(all_preds, ground_truth)
+ model_outputs_and_labels=(all_preds, ground_truth),
+ dataset=self.active_dataset,
  )

  # We reverse the order, as the Info messages are printed in reverse order
euroeval/metrics.py CHANGED
@@ -14,6 +14,7 @@ from .exceptions import InvalidBenchmark
  from .utils import HiddenPrints

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from evaluate import EvaluationModule

  logger = logging.getLogger(__name__)
@@ -49,7 +50,9 @@ class Metric(abc.ABC):
  )

  @abc.abstractmethod
- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+ ) -> float | None:
  """Calculate the metric score.

  Args:
@@ -57,6 +60,9 @@ class Metric(abc.ABC):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -125,7 +131,9 @@ class HuggingFaceMetric(Metric):
  )
  self.metric: "EvaluationModule | None" = None

- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+ ) -> float | None:
  """Calculate the metric score.

  Args:
@@ -133,6 +141,9 @@ class HuggingFaceMetric(Metric):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -213,7 +224,9 @@ class LLMAsAJudgeMetric(Metric):
  self.condition_formatting_fn = condition_formatting_fn
  self.system_prompt = system_prompt

- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
+ ) -> float | None:
  """Calculate the metric score using the judge model.

  Args:
@@ -221,6 +234,9 @@ class LLMAsAJudgeMetric(Metric):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -343,7 +359,9 @@ class SpeedMetric(Metric):
  postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
  )

- def __call__(self, _: t.Sequence, __: t.Sequence) -> float | None:
+ def __call__(
+ self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
+ ) -> float | None:
  """Not used with the speed metric, but required for consistency."""
  raise NotImplementedError

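Illustrative sketch (not part of the published diff): with this release every metric is called with an extra `dataset` argument, so custom metric callables need to accept it even if they ignore it. The class below is a simplified stand-in and does not subclass the package's Metric base class.

    import typing as t

    class ExactMatch:
        name = "exact_match"

        def __call__(
            self,
            predictions: t.Sequence,
            references: t.Sequence,
            dataset: object | None = None,
        ) -> float | None:
            # The dataset is accepted for interface compatibility but not used here.
            return sum(p == r for p, r in zip(predictions, references)) / len(references)

    print(ExactMatch()(predictions=["a", "b"], references=["a", "c"], dataset=None))  # 0.5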
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -69,7 +69,7 @@ MULTIPLE_CHOICE_TEMPLATES = {
  IT: PromptConfig(
  default_prompt_prefix="Le seguenti sono domande a scelta multipla "
  "(con relative risposte).",
- default_prompt_template="Domanda: {text}\nRéponse: {label}",
+ default_prompt_template="Domanda: {text}\nRisposta: {label}",
  default_instruction_prompt="Domanda: {text}\n\nRispondete alla domanda "
  "precedente con {labels_str}, e nient'altro.",
  default_prompt_label_mapping="auto",
euroeval/task_group_utils/question_answering.py CHANGED
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -158,6 +159,9 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +185,9 @@ def compute_metrics(

  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
- score: float | None = metric(predictions=predictions, references=labels)
+ score: float | None = metric(
+ predictions=predictions, references=labels, dataset=dataset
+ )

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -11,6 +11,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import log_once, raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction

  from ..data_models import DatasetConfig, GenerativeModelOutput
@@ -23,6 +24,7 @@ logger = logging.getLogger("euroeval")
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -32,6 +34,9 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -73,7 +78,9 @@ def compute_metrics(

  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
- score: float | None = metric(predictions=predictions, references=label_ids)
+ score: float | None = metric(
+ predictions=predictions, references=label_ids, dataset=dataset
+ )

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -11,6 +11,7 @@ from ..metrics import HuggingFaceMetric
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction

  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -24,6 +25,7 @@ def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -35,6 +37,9 @@ def compute_metrics(
  The configuration of the dataset.
  benchmark_config:
  The configuration of the benchmark.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -69,7 +74,9 @@ def compute_metrics(

  while True:
  try:
- score: float | None = metric(predictions=predictions, references=labels)
+ score: float | None = metric(
+ predictions=predictions, references=labels, dataset=dataset
+ )
  break
  except Exception as e:
  oom_error = [
euroeval/task_group_utils/token_classification.py CHANGED
@@ -12,6 +12,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.tokenization_utils_base import BatchEncoding
  from transformers.trainer_utils import EvalPrediction
@@ -27,6 +28,7 @@ def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  has_misc_tags: bool,
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -38,6 +40,9 @@ def compute_metrics(
  Whether the dataset has MISC tags.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -136,7 +141,9 @@ def compute_metrics(
  for metric in dataset_config.task.metrics
  if metric.name == "micro_f1"
  )
- micro_f1_score = metric(predictions=predictions, references=list(labels))
+ micro_f1_score = metric(
+ predictions=predictions, references=list(labels), dataset=dataset
+ )

  # Compute the metrics without MISC tags
  # We manually set the F1 metric to be 100% if both the labels and the models
@@ -158,7 +165,7 @@ def compute_metrics(
  if metric.name == "micro_f1_no_misc"
  )
  micro_f1_no_misc_score = metric(
- predictions=predictions_no_misc, references=labels_no_misc
+ predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
  )

  # Raise error if the metrics are invalid
euroeval/types.py CHANGED
@@ -5,6 +5,7 @@ import typing as t
  from transformers.trainer_utils import EvalPrediction

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from numpy.typing import NDArray

  from .data_models import GenerativeModelOutput
@@ -25,12 +26,16 @@ class ComputeMetricsFunction(t.Protocol):
  "NDArray | list[str] | list[list[str]]",
  "NDArray | list[str] | list[list[str]]",
  ],
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics.

  Args:
  model_outputs_and_labels:
  The model outputs and labels.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The computed metrics.
euroeval-15.14.0.dist-info/METADATA → euroeval-15.16.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.14.0
+ Version: 15.16.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -56,18 +56,16 @@ Requires-Dist: setuptools>=75.8.2
  Requires-Dist: tenacity>=9.0.0
  Requires-Dist: termcolor>=2.0.0
  Requires-Dist: torch>=2.6.0
- Requires-Dist: transformers>=4.51.0
+ Requires-Dist: transformers>=4.55.0
  Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
- Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -235,6 +233,7 @@ A huge thank you to all the contributors who have helped make this project a suc
  <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
  <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
  <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+ <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


  ### Contribute to EuroEval
euroeval-15.14.0.dist-info/RECORD → euroeval-15.16.0.dist-info/RECORD RENAMED
@@ -1,19 +1,19 @@
- euroeval/__init__.py,sha256=fZyR9R3C3vwGJS3CrCJ6ySr_FDnMu_Aqnz0FdadWEEs,3399
+ euroeval/__init__.py,sha256=ZZoVc6tKWz_h8Pw2n26PV-q_Gd4TM_02O235ZBRUNJw,3756
  euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
- euroeval/benchmarker.py,sha256=SDBzdCa4I8u1XDeN_1mKTFzfaaQbbY_oWcHt3niADxk,48497
+ euroeval/benchmarker.py,sha256=6qo0ytRnvZLxTQZvo2Fryox5DFHGrLsa0tVGquLHdTQ,48419
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
  euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
  euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
  euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
- euroeval/data_models.py,sha256=gPHyIoN2A5_O-cJgyb6jhn6enH8zsiIBI09W_wdHMQs,22031
+ euroeval/data_models.py,sha256=qSCNq3PV7qo--gibqEvvu4cXkEkhGGAb6UiZW8U_KiU,22031
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
- euroeval/finetuning.py,sha256=BrPZ-6qFY8K-dwfaRwNetVYfYburoQwLQty6pn6iP_s,11340
- euroeval/generation.py,sha256=1fqFEWwM2RzI3uPZem95VFWbN8EfrKZQTrHEP34ihHs,11622
+ euroeval/finetuning.py,sha256=Wzagme1n3lSZLWX0WbKMHtSUlAZr8t8_FJvggDZf72c,11393
+ euroeval/generation.py,sha256=lmvu__6w3cLxi0zBtXSlyZvV8CJpV3BdajUoIEA9ElA,11639
  euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
- euroeval/human_evaluation.py,sha256=Jtz3K5Lqne48wPZWf4EAd3d-n_wX27nGJHigjhV1D7s,27537
+ euroeval/human_evaluation.py,sha256=FLuTl1DHxCiWB_laVVQHIH86yXvA_ZeNNSrUmyExZXI,27579
  euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
- euroeval/metrics.py,sha256=nxosyoRjlk7TcoAOkjU7zx2TB43b9tA8M1m4V1s5eKU,15516
+ euroeval/metrics.py,sha256=m8nVnxUnwmIrlBfW8pkN4FCMjW3Sbg9Iq4oMZFAicEc,16227
  euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
  euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
@@ -21,43 +21,43 @@ euroeval/scores.py,sha256=TatSbjia7Zwj71gQFyV_gCHyppMbOgeaZgNCib8G86k,2849
  euroeval/speed_benchmark.py,sha256=6bFGeMmtdl_6owkxNQ3ZKiyQQS58k0NApzlsbDgBW5s,4037
  euroeval/tasks.py,sha256=btxf29M5rUP7JjBl6u9aQlHQAxrJNP4bRbdEQtDnmDA,3376
  euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
- euroeval/types.py,sha256=EIYMNOqqHqibnbNw-fvdst6HwTvq32gtxhr7jL7i-xM,2511
+ euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
  euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
  euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
  euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
- euroeval/benchmark_modules/litellm.py,sha256=qv-k2ntk48OF4ikevQ95k4zLbBkZYOZ2z-GAisA-tFY,53374
- euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
+ euroeval/benchmark_modules/litellm.py,sha256=ibdbOmxAO1VsuZX4uUs5MQ8pFPfqPJoleOOjAim3syY,55493
+ euroeval/benchmark_modules/vllm.py,sha256=7PhfqqeRGFdzOL-RBJbrHEAMGfwrVWngF14dSeq9IpI,39072
  euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
- euroeval/dataset_configs/danish.py,sha256=-y-n08hTApwTdSVdjRlZYa3gOX92cTGhg8xsuG-Lhww,3691
- euroeval/dataset_configs/dutch.py,sha256=siyFeEKYx2gBpyqQPtOZ0cD8FTsIMUqzRX5xrQfrNXI,3480
+ euroeval/dataset_configs/danish.py,sha256=0lDtvpgszXY1XaPjTU8yA3oNCU8W2OllvrBWgn6pkhk,4027
+ euroeval/dataset_configs/dutch.py,sha256=ekZxLL9d09BUMijCxy9EFa2heNQVvySPySOjhWdtJc8,3815
  euroeval/dataset_configs/english.py,sha256=uQAaGWpHk8xqFCeIhmmPXYTb1cZomeEdRaRe9qIZQrg,2858
  euroeval/dataset_configs/faroese.py,sha256=gkgxQTWGFbfg9Eo1z-NSLROgKDcaij9tAN2mfgtrt0M,1647
- euroeval/dataset_configs/finnish.py,sha256=OyveLgyii0hOlo6HZsqAq4rwDrj8tl2qstRfQKugURo,2342
- euroeval/dataset_configs/french.py,sha256=DKKZEtohWkw_ouBaxWcPzp-K6NhQNtvCKxj8NLbIpUc,2678
- euroeval/dataset_configs/german.py,sha256=3bfRgkqIGkAhcw4kwcJN9PKuJSmi1r6AFTJY-IWKgWM,2856
+ euroeval/dataset_configs/finnish.py,sha256=UZwy0_d17O2L-v2AKOu3OlDwFPcLGTZNAOt7ZKlr4K8,2679
+ euroeval/dataset_configs/french.py,sha256=Hei2M4bGIz8hVtaPKQlQATcmK-0bFBNEocEszR3gia0,3014
+ euroeval/dataset_configs/german.py,sha256=sRYtOl6CYf4kZkeINfff6xoKBG4OsDxb2b72lKwELGc,3192
  euroeval/dataset_configs/icelandic.py,sha256=g21IHjcwEZvf_yJ9PobeuBOqRiLOk0oCdEjY34g-UMk,4497
- euroeval/dataset_configs/italian.py,sha256=rHLMkSXT0kFoQlkwHODxO50WBRIfGtkAnW_C-sfIu74,2957
+ euroeval/dataset_configs/italian.py,sha256=4SEmdUyfGbbwMPhv_9nL3JNJtoDKHLAlWuvr7Ihmi9o,3294
  euroeval/dataset_configs/norwegian.py,sha256=-WvQM44xCwjrqBzlAy4rjf6v87fGera2JmZV_069TeQ,6003
  euroeval/dataset_configs/portuguese.py,sha256=3SqbwD0PNTILGALzh50pVoEwC-spRD75ZeE2NEj151E,2367
- euroeval/dataset_configs/spanish.py,sha256=VKfBIpBRR38ckuULw7Ftmc-0smsm6GshUAik2-Y1Npw,2855
- euroeval/dataset_configs/swedish.py,sha256=WpExi4TJqy_Ruwy4Kvde94jM605vT_88el_KKUzLV4E,3108
+ euroeval/dataset_configs/spanish.py,sha256=Bm0Z19Mh2qYXR0RIRlqEkzfVb5KiqJRectfuY7JLql4,3192
+ euroeval/dataset_configs/swedish.py,sha256=js4paNsuC0nQzPpf6_BzHBf7MT60XUpP1-qM2uxRtQs,3445
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
  euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
- euroeval/prompt_templates/multiple_choice.py,sha256=F9ItGQtnaaez15A8MQ1UCpKRDsLM-AZyRdYetGAofa0,5494
+ euroeval/prompt_templates/multiple_choice.py,sha256=wHnQCE5bv947L6hSK5zJitE37V-PbuNYAp156mWaIYA,5494
  euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
  euroeval/prompt_templates/reading_comprehension.py,sha256=3Nch-9zHfUDIwy-k5mP-TRhHQRQ9nad8HdhpJ1S8nGc,7072
  euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
  euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
- euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6zmoaPgYVyzMmOkNjr58,27284
- euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
- euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
- euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
- euroeval-15.14.0.dist-info/METADATA,sha256=uQY74VCgn3TRCTXJGCb8ilS-3U5UL69lbhNGQw2NGTM,13478
- euroeval-15.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.14.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.14.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
- euroeval-15.14.0.dist-info/RECORD,,
+ euroeval/task_group_utils/question_answering.py,sha256=6jpiHukzA7IrJh4vVYyZDDyvD5Xc2GsxoXzpm_PHpXw,27503
+ euroeval/task_group_utils/sequence_classification.py,sha256=ihJO55f3Dy565d3ByYGMuSINasnjAADaTrM59LwZzA0,12977
+ euroeval/task_group_utils/text_to_text.py,sha256=go0y6X9QAv5iywlLAclb8cqFX_3QlAT-1-VNZ9zMWFA,4832
+ euroeval/task_group_utils/token_classification.py,sha256=BDqOfopdH5Bbj67HTEbZd9KZtNCDNket8NrCTfxZFzQ,17773
+ euroeval-15.16.0.dist-info/METADATA,sha256=_oeIq0ZGzS0i7n51NdhNhuDX2A3_lDjYDD-6KgB1rW0,13536
+ euroeval-15.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.16.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.16.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+ euroeval-15.16.0.dist-info/RECORD,,