EuroEval 15.6.1__py3-none-any.whl → 15.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/benchmark_modules/litellm.py CHANGED
@@ -33,6 +33,7 @@ from litellm.exceptions import (
  )
  from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.types.utils import ChoiceLogprobs, ModelResponse
+ from pydantic import conlist, create_model
  from requests.exceptions import RequestException
  from tqdm.auto import tqdm
  from transformers.trainer import Trainer
@@ -104,6 +105,7 @@ MODEL_MAX_LENGTH_MAPPING = {
  r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
  r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
  r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+ r"gpt-4.1.*": 1_047_576,
  # Anthropic models
  r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
  # Gemini models
@@ -135,20 +137,23 @@ ALLOWED_PARAMS = {
  r"gpt-4.*": [],
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
  # Anthropic models
- r"(anthropic/)?claude-3-.*": [],
- r"(anthropic/)?claude-3.5-.*": [],
- r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
+ r"(anthropic/)?claude-3-(haiku|sonnet|opus).*": [],
+ r"(anthropic/)?claude-3-5-.*": [],
+ r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
  # Gemini models
  r"(gemini/)?gemini-.*": [],
  # xAI models
- r"(xai/)?grok.*": [],
+ r"(xai/)?grok-2.*": [],
+ r"(xai/)?grok-3(-fast)?(-beta)?": [],
+ r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
  }


  REASONING_MODELS = [
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
  r"(gemini/)?gemini.*thinking.*",
- r"(gemini/)?gemini-2.5-pro.*",
+ r"(gemini/)?gemini-2.5.*",
+ r"(xai/)?grok-3-mini.*",
  ]
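
The updated REASONING_MODELS patterns above feed the re.fullmatch check in the generative_type property further down in this file. A minimal sketch of that classification, using the pattern list copied from this diff and a few hypothetical model ids:

import re

# Pattern list copied from the REASONING_MODELS hunk above; the model ids used
# below are hypothetical examples, not taken from the package.
REASONING_MODELS = [
    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
    r"(gemini/)?gemini.*thinking.*",
    r"(gemini/)?gemini-2.5.*",
    r"(xai/)?grok-3-mini.*",
]

def is_reasoning_model(model_id: str) -> bool:
    # Same re.fullmatch call as in the generative_type property in the hunks below.
    return re.fullmatch(pattern="|".join(REASONING_MODELS), string=model_id) is not None

print(is_reasoning_model("gemini/gemini-2.5-flash"))  # True
print(is_reasoning_model("xai/grok-3-mini-beta"))     # True
print(is_reasoning_model("gpt-4.1-mini"))             # False
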

@@ -190,7 +195,10 @@ class LiteLLMModel(BenchmarkModule):
  )

  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=None
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=None,
+ generative_type=self.generative_type,
  )

  @property
@@ -201,13 +209,20 @@ class LiteLLMModel(BenchmarkModule):
  The generative type of the model, or None if it has not been set yet.
  """
  if self.model_config.revision == "thinking":
- return GenerativeType.REASONING
+ type_ = GenerativeType.REASONING
  elif re.fullmatch(
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
  ):
- return GenerativeType.REASONING
+ type_ = GenerativeType.REASONING
  else:
- return GenerativeType.INSTRUCTION_TUNED
+ type_ = GenerativeType.INSTRUCTION_TUNED
+
+ log_once(
+ f"Detected generative type {type_.name!r} for model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+ return type_

  def generate(self, inputs: dict) -> GenerativeModelOutput:
  """Generate outputs from the model.
@@ -243,7 +258,10 @@ class LiteLLMModel(BenchmarkModule):
  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=None
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=None,
+ generative_type=self.generative_type,
  )

  if self.buffer["first_label_token_mapping"]:
@@ -254,16 +272,41 @@ class LiteLLMModel(BenchmarkModule):
  assert "json" in messages[0]["content"].lower(), (
  "Prompt must contain 'json' for JSON tasks."
  )
- generation_kwargs["response_format"] = dict(type="json_object")
- log_once(
- "Enabling JSON response format for model "
- f"{self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ elif litellm.utils.supports_response_schema(
+ model=self.model_config.model_id
+ ):
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ generation_kwargs["response_format"] = pydantic_class
+ log_once(
+ "Enabling structured generation for model "
+ f"{self.model_config.model_id!r} with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )
+ else:
+ generation_kwargs["response_format"] = dict(type="json_object")
+ log_once(
+ "Enabling structured JSON generation for model "
+ f"{self.model_config.model_id!r} with no custom JSON schema, as "
+ "the model does not support schemas.",
+ level=logging.DEBUG,
+ )

  if self.model_config.revision == "thinking":
  generation_kwargs["thinking"] = dict(
- type="enabled", budget_tokens=REASONING_MAX_TOKENS
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
  )
  log_once(
  f"Enabling thinking mode for model {self.model_config.model_id!r}",
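
The structured-generation branch above builds a pydantic model on the fly and passes its JSON schema as the response format. A minimal sketch of that construction, assuming hypothetical NER tag names in place of dataset_config.prompt_label_mapping.values(); note that the max_length=5 constraint is what produces the "maxItems" schema keyword that the error handling further down has to strip for providers that reject it:

from pydantic import conlist, create_model

# Hypothetical tag names; in the package they come from the dataset's
# prompt_label_mapping.
ner_tag_names = ["person", "location", "organisation", "miscellaneous"]

# Each field is a list of strings with at most five items, mirroring the
# conlist(str, max_length=5) fields built in the hunk above.
keys_and_their_types = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# The schema that ends up in `response_format`; the list-length constraint is
# rendered as the JSON Schema keyword "maxItems".
print(AnswerFormat.model_json_schema())
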
@@ -280,28 +323,42 @@ class LiteLLMModel(BenchmarkModule):
  # This drops generation kwargs that are not supported by the model
  litellm.drop_params = True

+ # Error messages that we want to catch and handle
+ stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
+ logprobs_messages = [
+ "you are not allowed to request logprobs",
+ "you've reached the maximum number of requests with logprobs",
+ "logprobs is not supported",
+ "logprobs is not enabled",
+ ]
+ temperature_messages = [
+ "'temperature' is not supported with this model.",
+ "temperature is not supported with this model",
+ ]
+ temperature_must_be_one_messages = [
+ "`temperature` may only be set to 1",
+ "'temperature' does not support 0.0 with this model. Only the default "
+ "(1) value is supported",
+ ]
+ max_items_messages = ["'maxItems' is not permitted."]
+ no_json_schema_messages = ["Property keys should match pattern"]
+
  # Extract the generated sequences from the model response. Some APIs cannot
  # handle using newlines as stop sequences, so we try both.
  num_attempts = 10
  for _ in range(num_attempts):
- stop_messages = ["stop_sequences"]
- logprobs_messages = [
- "you are not allowed to request logprobs",
- "you've reached the maximum number of requests with logprobs",
- "logprobs is not supported",
- "logprobs is not enabled",
- ]
- temperature_messages = [
- "'temperature' is not supported with this model.",
- "temperature is not supported with this model",
- ]
  try:
- model_response = litellm.completion(
- messages=messages, max_retries=3, **generation_kwargs
+ model_response = litellm.completion_with_retries(
+ messages=messages, **generation_kwargs
  )
  break
  except (BadRequestError, RateLimitError) as e:
  if any(msg.lower() in str(e).lower() for msg in stop_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "stop sequences, so disabling them.",
+ level=logging.DEBUG,
+ )
  generation_kwargs["stop"] = None
  elif (
  any(msg.lower() in str(e).lower() for msg in logprobs_messages)
@@ -310,10 +367,55 @@ class LiteLLMModel(BenchmarkModule):
  # we ignore this since the rate limiting makes it unusable anyway.
  or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
  ):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "logprobs, so disabling it.",
+ level=logging.DEBUG,
+ )
  generation_kwargs.pop("logprobs")
  generation_kwargs.pop("top_logprobs")
  elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "temperature, so disabling it.",
+ level=logging.DEBUG,
+ )
  generation_kwargs.pop("temperature")
+ elif any(
+ msg.lower() in str(e).lower()
+ for msg in temperature_must_be_one_messages
+ ):
+ log_once(
+ f"The model {self.model_config.model_id!r} requires "
+ "temperature to be set to 1, so setting it.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["temperature"] = 1.0
+ elif any(msg.lower() in str(e).lower() for msg in max_items_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "maxItems in the JSON schema, so disabling it.",
+ level=logging.DEBUG,
+ )
+ ner_tag_names = list(
+ self.dataset_config.prompt_label_mapping.values()
+ )
+ keys_and_their_types = {
+ tag_name: (list[str], ...) for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model(
+ "AnswerFormat", **keys_and_their_types
+ )
+ generation_kwargs["response_format"] = pydantic_class
+ elif any(
+ msg.lower() in str(e).lower() for msg in no_json_schema_messages
+ ):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "JSON schemas, so using the vanilla JSON format.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["response_format"] = dict(type="json_object")
  elif isinstance(e, RateLimitError):
  raise InvalidModel(
  "You have encountered your rate limit for model "
@@ -332,6 +434,7 @@ class LiteLLMModel(BenchmarkModule):
  Timeout,
  ServiceUnavailableError,
  InternalServerError,
+ SystemError,
  ) as e:
  logger.debug(
  f"Service temporarily unavailable. The error message was: {e}. "
@@ -359,9 +462,11 @@ class LiteLLMModel(BenchmarkModule):
  "reasoning. Returning an empty string."
  )
  return GenerativeModelOutput(sequences=[""])
+
  model_response_choices = model_response.choices[0]
  assert isinstance(model_response_choices, litellm.Choices)
- generation_output = model_response_choices.message["content"] or ""
+ generated_message: litellm.Message = model_response_choices.message
+ generation_output = generated_message.content or ""
  generation_output = generation_output.strip()

  # Structure the model output as a GenerativeModelOutput object

euroeval/benchmark_modules/vllm.py CHANGED
@@ -132,7 +132,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  self._model: LLM = model
  self._tokenizer: PreTrainedTokenizer = tokenizer
  self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
- model=self._model, tokenizer=self._tokenizer
+ model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
  )

  # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +146,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  self.buffer |= dict(
  instruction_model=self._tokenizer.chat_template is not None,
  first_label_token_mapping=get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  ),
  )
  if self.model_config.adapter_base_model_id is not None:
@@ -332,30 +335,40 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

+ logits_processor = None
  if self.dataset_config.task in TASKS_USING_JSON:
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
- )
- log_once(
- "Using structured generation with the schema "
- f"{pydantic_class.model_json_schema()}",
- level=logging.DEBUG,
- )
- else:
- logits_processor = None
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ else:
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ logits_processor = JSONLogitsProcessor(
+ schema=pydantic_class,
+ tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
+ whitespace_pattern=r" ?",
+ )
+ log_once(
+ "Using structured generation with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )

  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  )

  # Define the parameters used for vLLM generation
@@ -391,7 +404,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  ) and should_prompts_be_stripped(
  labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
  ):
- log_once(message="Stripping prompts.", level=logging.DEBUG)
+ log_once(
+ f"Stripping prompts for model {self.model_config.model_id!r}.",
+ level=logging.DEBUG,
+ )
  prompts = [prompt.strip() for prompt in prompts]

  # Generate sequences using vLLM
@@ -411,18 +427,64 @@ class VLLMModel(HuggingFaceEncoderModel):
  f"Encountered error during vLLM generation: {str(e)}. Retrying..."
  )
  sleep(1)
+ except ValueError as e:
+ # Truncate the prompts if they are too long for the model
+ truncate_error_messages = [
+ r"prompt \(length [0-9]+\) is longer than the maximum model length"
+ ]
+ if any(
+ re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+ for pattern in truncate_error_messages
+ ):
+ logger.info(
+ "Prompts are too long, so truncating them and trying again..."
+ )
+ tokenized_prompts = self._tokenizer(
+ text=prompts,
+ truncation=True,
+ max_length=max(
+ self._tokenizer.model_max_length - max_tokens, 0
+ ),
+ )
+ prompts = self._tokenizer.batch_decode(
+ sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+ )
+ else:
+ raise InvalidBenchmark(
+ f"An error occurred during vLLM generation: {str(e)}"
+ )
  else:
  raise InvalidBenchmark(
  f"Could not generate sequences after {num_attempts} attempts."
  )

+ # When we shorten the prompts then some residual model outputs persist, so we
+ # need to filter these out
+ num_extra_outputs = len(raw_outputs) - len(prompts)
+ if num_extra_outputs > 0:
+ raw_outputs = raw_outputs[num_extra_outputs:]
+ if not all(
+ raw_output.prompt == prompt
+ for raw_output, prompt in zip(raw_outputs, prompts)
+ ):
+ raise InvalidBenchmark(
+ f"The prompts and the model outputs do not match. There were "
+ f"{num_extra_outputs!r} extra outputs."
+ )
+ else:
+ logger.debug(
+ f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+ "which occured as we interupted the generation when we truncated "
+ "the prompts."
+ )
+
  # Parse the raw model outputs
  completion_ids: list[list[int]] = [
  output.outputs[0].token_ids for output in raw_outputs
  ]
  if self.end_of_reasoning_token_id in completion_ids[0]:
  completion_ids = [
- token_ids[token_ids.index(self.end_of_reasoning_token_id) + 2 :]
+ token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
  if self.end_of_reasoning_token_id in token_ids
  else token_ids
  for token_ids in completion_ids
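
The new ValueError branch above re-tokenises over-long prompts with truncation and decodes them back to text before retrying. A minimal, self-contained sketch of that round-trip with a Hugging Face tokenizer; the "gpt2" checkpoint and the 32-token generation budget are arbitrary stand-ins, not values from the package:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
max_tokens = 32  # stand-in for the generation budget
prompts = ["lorem ipsum " * 1_000]  # far longer than the model's context window

# Tokenise with truncation so that prompt plus generation fits the context
# window, then decode back to plain text, as in the hunk above.
tokenized_prompts = tokenizer(
    text=prompts,
    truncation=True,
    max_length=max(tokenizer.model_max_length - max_tokens, 0),
)
prompts = tokenizer.batch_decode(
    sequences=tokenized_prompts.input_ids, skip_special_tokens=True
)

print(len(tokenizer(text=prompts[0]).input_ids))  # now fits within the budget
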
@@ -435,6 +497,13 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
  completions = [completion.strip() for completion in completions]

+ # Sanity check
+ if len(completions) != len(prompts):
+ breakpoint()
+ raise InvalidBenchmark(
+ f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+ )
+
  # Add logprobs scores to the output
  if self.buffer["first_label_token_mapping"]:
  scores: list[list[list[tuple[str, float]]]] = [
@@ -809,7 +878,8 @@ class VLLMModel(HuggingFaceEncoderModel):
  if name.lower() in language_codes:
  chat_template = candidate_template
  log_once(
- f"Using the {name!r} chat template for the tokenizer.",
+ f"Using the {name!r} chat template for the tokenizer for "
+ f"model {self.model_config.model_id!r}.",
  level=logging.DEBUG,
  )
  break
@@ -1169,7 +1239,7 @@ def clear_vllm() -> None:


  def get_end_of_reasoning_token_id(
- model: "LLM", tokenizer: "PreTrainedTokenizer"
+ model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
  ) -> int | None:
  """Get the end of reasoning token ID for a generative model.

@@ -1182,6 +1252,8 @@ def get_end_of_reasoning_token_id(
  The vLLM model.
  tokenizer:
  The tokenizer.
+ model_id:
+ The model ID.

  Returns:
  The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1292,8 @@ def get_end_of_reasoning_token_id(
  completion_match = re.search(pattern=r"<\w+>", string=completion)
  if completion_match is None and prompt_match is None:
  log_once(
- message=(
- "Could not find a reasoning token, so assuming the model is not a "
- "reasoning model."
- ),
+ f"Could not find a reasoning token for model {model_id!r}, so assuming "
+ "the model is not a reasoning model.",
  level=logging.DEBUG,
  )
  return None
@@ -1249,20 +1319,17 @@ def get_end_of_reasoning_token_id(
  or end_of_reasoning_token not in special_tokens
  ):
  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}, but one of them is not registered "
- "as a special token, so assuming it is not a real reasoning token."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+ "them is not registered as a special token, so assuming it is not a "
+ "real reasoning token.",
  level=logging.DEBUG,
  )
  return None

  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}.",
  level=logging.DEBUG,
  )

euroeval/benchmarker.py CHANGED
@@ -782,7 +782,11 @@ class Benchmarker:
  dataset_languages=[
  language.code for language in dataset_config.languages
  ],
- model=model_config.model_id,
+ model=(
+ f"{model_config.model_id}@{model_config.revision}"
+ if model_config.revision and model_config.revision != "main"
+ else model_config.model_id
+ ),
  results=results,
  num_model_parameters=model.num_params,
  max_sequence_length=model.model_max_length,
@@ -1076,6 +1080,10 @@ def initial_logging(
  benchmark_config:
  The general benchmark configuration.
  """
+ model_id = model_config.model_id
+ if model_config.revision and model_config.revision != "main":
+ model_id += f"@{model_config.revision}"
+
  split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
  if model_config.task in GENERATIVE_PIPELINE_TAGS:
  if benchmark_config.few_shot:
@@ -1084,8 +1092,9 @@ def initial_logging(
  eval_type = "Zero-shot benchmarking"
  else:
  eval_type = "Benchmarking"
+
  logger.info(
- f"{eval_type} {model_config.model_id} on the {split_type} split of "
+ f"{eval_type} {model_id} on the {split_type} split of "
  f"{dataset_config.pretty_name}"
  )

@@ -1095,6 +1104,7 @@
  "meaning that the resulting evaluation will not be included in the "
  "official leaderboard."
  )
+
  if benchmark_config.debug:
  logger.info(
  "Running in debug mode. This will output additional information, as "
euroeval/constants.py CHANGED
@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000

  # We need to raise the amount of tokens generated for reasoning models, to give them
  # time to think
- REASONING_MAX_TOKENS = 8_192
+ REASONING_MAX_TOKENS = 32_768


  # The Hugging Face Hub pipeline tags used to classify models as generative
euroeval/data_loading.py CHANGED
@@ -39,32 +39,9 @@ def load_data(
  HuggingFaceHubDown:
  If the Hugging Face Hub is down.
  """
- num_attempts = 5
- for _ in range(num_attempts):
- try:
- dataset = load_dataset(
- path=dataset_config.huggingface_id,
- cache_dir=benchmark_config.cache_dir,
- token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
- )
- break
- except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
- logger.warning(
- f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
- )
- time.sleep(1)
- continue
- except HfHubHTTPError:
- raise HuggingFaceHubDown()
- else:
- raise InvalidBenchmark(
- f"Failed to load dataset {dataset_config.huggingface_id!r} after "
- f"{num_attempts} attempts."
- )
-
- assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
-
- dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})
+ dataset = load_raw_data(
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+ )

  if not benchmark_config.evaluate_test_split:
  dataset["test"] = dataset["val"]
@@ -101,3 +78,48 @@ def load_data(
  for idx in range(benchmark_config.num_iterations)
  ]
  return datasets
+
+
+ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+ """Load the raw dataset.
+
+ Args:
+ dataset_config:
+ The configuration for the dataset.
+ cache_dir:
+ The directory to cache the dataset.
+
+ Returns:
+ The dataset.
+ """
+ num_attempts = 5
+ for _ in range(num_attempts):
+ try:
+ dataset = load_dataset(
+ path=dataset_config.huggingface_id,
+ cache_dir=cache_dir,
+ token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
+ )
+ break
+ except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
+ logger.warning(
+ f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
+ )
+ time.sleep(1)
+ continue
+ except HfHubHTTPError:
+ raise HuggingFaceHubDown()
+ else:
+ raise InvalidBenchmark(
+ f"Failed to load dataset {dataset_config.huggingface_id!r} after "
+ f"{num_attempts} attempts."
+ )
+ assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
+ required_keys = ["train", "val", "test"]
+ missing_keys = [key for key in required_keys if key not in dataset]
+ if missing_keys:
+ raise InvalidBenchmark(
+ "The dataset is missing the following required splits: "
+ f"{', '.join(missing_keys)}"
+ )
+ return DatasetDict({key: dataset[key] for key in required_keys})
euroeval/data_models.py CHANGED
@@ -521,14 +521,6 @@ class DatasetConfig:

  Returns:
  The natural string representation of the labels in specified language.
-
- Raises:
- NotImplementedError:
- If `and_separator` or `or_separator` are `None`, see `Language`.
-
- Example:
- >>> get_labels_str(language=DA)
- "'a', 'b', 'c' eller 'd'"
  """
  main_language = self.languages[0]

euroeval/dataset_configs/finnish.py ADDED
@@ -0,0 +1,60 @@
+ """All Finnish dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import FI
+ from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+
+ ### Official datasets ###
+
+ SCANDISENT_FI_CONFIG = DatasetConfig(
+ name="scandisent-fi",
+ pretty_name="the truncated version of the Finnish part of the binary sentiment "
+ "classification dataset ScandiSent",
+ huggingface_id="EuroEval/scandisent-fi-mini",
+ task=SENT,
+ languages=[FI],
+ _labels=["negative", "positive"],
+ )
+
+ TURKU_NER_FI_CONFIG = DatasetConfig(
+ name="turku-ner-fi",
+ pretty_name="the Finnish part of the named entity recognition dataset Turku NER",
+ huggingface_id="EuroEval/turku-ner-fi-mini",
+ task=NER,
+ languages=[FI],
+ )
+
+ TYDIQA_FI_CONFIG = DatasetConfig(
+ name="tydiqa-fi",
+ pretty_name="the Finnish part of the TydiQA reading comprehension dataset",
+ huggingface_id="EuroEval/tydiqa-fi-mini",
+ task=RC,
+ languages=[FI],
+ )
+
+ XLSUM_FI_CONFIG = DatasetConfig(
+ name="xlsum-fi",
+ pretty_name="the Finnish summarisation dataset XL-Sum",
+ huggingface_id="EuroEval/xlsum-fi-mini",
+ task=SUMM,
+ languages=[FI],
+ )
+
+ HELLASWAG_FI_CONFIG = DatasetConfig(
+ name="hellaswag-fi",
+ pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+ "HellaSwag-fi, translated from the English HellaSwag dataset",
+ huggingface_id="EuroEval/hellaswag-fi-mini",
+ task=COMMON_SENSE,
+ languages=[FI],
+ )
+
+ SCALA_FI_CONFIG = DatasetConfig(
+ name="scala-fi",
+ pretty_name="the Finnish part of the linguistic acceptability dataset ScaLA",
+ huggingface_id="EuroEval/scala-fi",
+ task=LA,
+ languages=[FI],
+ )
+
+ ### Unofficial datasets ###

euroeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Linguistic Acceptability task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  LA_TEMPLATES = {
  DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
  default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
  "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
  ),
+ FI: PromptConfig(
+ default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
+ default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
+ "kieliopillisesti oikein.",
+ default_prompt_template="Lause: {text}\nKieliopillisesti oikein: {label}",
+ default_instruction_prompt="Lause: {text}\n\nMääritä onko lause "
+ "oikein vai ei. Vastaa {labels_str}, ja ei mitään muuta.",
+ ),
  FO: PromptConfig(
  default_prompt_label_mapping=dict(correct="ja", incorrect="nei"),
  default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "

euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for all multiple choice tasks."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV

  # TODO: Missing Faroese
  MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,13 @@ MULTIPLE_CHOICE_TEMPLATES = {
  "usando solo {labels_str}, y nada más.",
  default_prompt_label_mapping="auto",
  ),
+ FI: PromptConfig(
+ default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
+ default_prompt_template="Kysymys: {text}\nVastaus: {label}",
+ default_instruction_prompt="Kysymys: {text}\n\nVastaa yllä olevaan kysymykseen "
+ "käyttämällä {labels_str}, äläkä mitään muuta.",
+ default_prompt_label_mapping="auto",
+ ),
  FR: PromptConfig(
  default_prompt_prefix="Les questions suivantes sont des questions à choix "
  "multiples (avec réponses).",

euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Named Entity Recognition task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  NER_TEMPLATES = {
  DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
  "claves {labels_str}. Los valores deben ser listas de las "
  "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
  ),
+ FI: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "henkilö",
+ "i-per": "henkilö",
+ "b-loc": "paikka",
+ "i-loc": "paikka",
+ "b-org": "organisaatio",
+ "i-org": "organisaatio",
+ "b-misc": "muut",
+ "i-misc": "muut",
+ },
+ default_prompt_prefix="Seuraavassa on lauseita ja JSON-sanakirjoja, jotka "
+ "sisältävät annetussa lauseessa esiintyvät nimetyt entiteetit.",
+ default_prompt_template="Lause: {text}\nNimetyt entiteetit: {label}",
+ default_instruction_prompt="Lause: {text}\n\nTunnista lauseessa olevat "
+ "entiteetit. Tulosta ne JSON-sanakirjana, jonka avaimet ovat {labels_str}. "
+ "Arvojen tulee olla listoja kyseisen tyypin nimetyistä entiteeteistä "
+ "täsmälleen siinä muodossa kuin ne esiintyvät lauseessa.",
+ ),
  FO: PromptConfig(
  default_prompt_label_mapping={
  "b-per": "persónur",

euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Reading Comprehension task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  RC_TEMPLATES = {
  DA: PromptConfig(
@@ -39,6 +39,16 @@ RC_TEMPLATES = {
  "sobre el texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
  default_prompt_label_mapping=dict(),
  ),
+ FI: PromptConfig(
+ default_prompt_prefix="Seuraavassa on tekstejä ja niihin liittyviä kysymyksiä "
+ "ja vastauksia.",
+ default_prompt_template="Teksti: {text}\nKysymys: {question} "
+ "\nVastaa enintään 3 sanalla: {label}",
+ default_instruction_prompt="Teksti: {text}\n\nVastaa seuraavaan "
+ "kysymykseen yllä olevasta tekstistä enintään 3 sanalla.\n\n"
+ "Kysymys: {question}",
+ default_prompt_label_mapping=dict(),
+ ),
  FO: PromptConfig(
  default_prompt_prefix="Hetta eru tekstir saman við spurningum og svar.",
  default_prompt_template="Tekstur: {text}\nSpurningur: {question}\nSvara við í "

euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Sentiment Analysis task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  SENT_TEMPLATES = {
  DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
  default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
  "documento. Responde con {labels_str}, y nada más.",
  ),
+ FI: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="positiivinen", neutral="neutrali", negative="negatiivinen"
+ ),
+ default_prompt_prefix="Seuraavassa on arvosteluja ja niiden tunnesävy, joka "
+ "voi olla {labels_str}.",
+ default_prompt_template="Teksti: {text}\nTunnesävy: {label}",
+ default_instruction_prompt="Teksti: {text}\n\nLuokittele arvostelun tunnesävy. "
+ "Vastaa vain {labels_str}, ei muuta.",
+ ),
  FO: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positivt", neutral="neutralt", negative="negativt"

euroeval/prompt_templates/summarization.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Summarization task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV

  # TODO: Missing Faroese
  SUMM_TEMPLATES = {
@@ -36,6 +36,14 @@ SUMM_TEMPLATES = {
  "documento anterior.",
  default_prompt_label_mapping=dict(),
  ),
+ FI: PromptConfig(
+ default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
+ "tiivistelmiä.",
+ default_prompt_template="Uutisartikkeli: {text}\nTiivistelmä: {target_text}",
+ default_instruction_prompt="Uutisartikkeli: {text}\n\nKirjoita tiivistelmä "
+ "yllä olevasta artikkelista.",
+ default_prompt_label_mapping=dict(),
+ ),
  FR: PromptConfig(
  default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
  default_prompt_template="Document: {text}\nRésumé: {target_text}",

euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -132,22 +132,23 @@ def extract_labels_from_generation(
  The predicted labels.
  """
  if model_output.scores is not None:
- return get_closest_logprobs_labels(
+ labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
  dataset_config=dataset_config,
  first_label_token_mapping=first_label_token_mapping,
  )
- else:
- return get_closest_word_edit_labels(
- generated_sequences=model_output.sequences, dataset_config=dataset_config
- )
+ if labels is not None:
+ return labels
+ return get_closest_word_edit_labels(
+ generated_sequences=model_output.sequences, dataset_config=dataset_config
+ )


  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
  dataset_config: "DatasetConfig",
  first_label_token_mapping: dict[str, str] | bool,
- ) -> list[str]:
+ ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.

  In case a candidate label is split into multiple tokens, we only use the first
@@ -167,7 +168,7 @@ def get_closest_logprobs_labels(
  mapping is outputted then the model will always output scores).

  Returns:
- The predicted labels.
+ The predicted labels, or None if labels could not be extracted.

  Raises:
  InvalidBenchmark:
@@ -193,10 +194,7 @@ def get_closest_logprobs_labels(
  # We want to use the first generated label which contains a unique candidate
  # label, as the output label
  output_label: str | None = None
- previously_generated_labels: list[str] = list()
- for label_idx, generated_label in enumerate(generated_labels):
- generated_label = "".join(previously_generated_labels) + generated_label
-
+ for generated_label in generated_labels:
  # Get the candidate labels that starts with the generated label
  if isinstance(first_label_token_mapping, dict):
  if any(
@@ -222,31 +220,28 @@ def get_closest_logprobs_labels(
  if candidate_label.startswith(generated_label)
  }

- # If we can uniquely determine the output label, we break the loop. If
- # there are multiple possible labels then we store the current one, and
- # concatenate it with the next generated label. We can only do this if
- # the current one is the first one, however, since we're using greedy
- # sampling. In case this happens for a label that is not the first one,
- # we warn the user.
+ # If we can uniquely determine the output label, we break the loop.
  if len(candidate_output_labels) == 1:
  output_label = candidate_output_labels.pop()
  break
+
+ # If we have multiple candidate labels, we cannot uniquely determine the
+ # output label, so we abandon extracting the labels using logprobs and
+ # fall back to using word edit distance.
  elif len(candidate_output_labels) > 1:
- if label_idx == 0:
- previously_generated_labels.append(generated_label)
- else:
- output_label = candidate_output_labels.pop()
- candidate_output_labels.add(output_label)
- raise InvalidBenchmark(
- "Multiple candidate labels found for the generated label "
- f"{generated_label!r}: {candidate_output_labels}. Since "
- "this is not the first generated label, we cannot "
- "concatenate it with the next generated label. We are thus "
- f"forced to use the arbitrary {output_label!r} as the "
- "output label, potentially resulting in worse performance. "
- "Please report this issue to the EuroEval team at "
- "github.com/EuroEval/EuroEval/issues."
- )
+ log_once(
+ "Multiple candidate labels found for the generated label "
+ f"{generated_label!r}: {candidate_output_labels}. This means "
+ "that using logprobs to extract the labels is not reliable, "
+ "and we will instead fall back to extracting the labels "
+ "using word edit distance.",
+ level=logging.DEBUG,
+ )
+ return None
+
+ # If no candidate label is found, we ignore the generated label, as it
+ # basically means that the model is just really bad at generating
+ # labels.
  elif len(candidate_output_labels) == 0:
  logger.debug(
  f"No candidate label found for the generated label "

euroeval/task_group_utils/text_to_text.py CHANGED
@@ -10,11 +10,7 @@ from evaluate import EvaluationModule
  from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..exceptions import InvalidBenchmark
- from ..utils import (
- HiddenPrints,
- clear_memory,
- raise_if_model_output_contains_nan_values,
- )
+ from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
  from transformers.trainer_utils import EvalPrediction
@@ -89,20 +85,8 @@ def compute_metrics(
  score_dict: dict[str, float] | None = metric.compute(
  predictions=predictions, references=labels, **cfg.compute_kwargs
  )
-
- # Clear the cache of the BERTScorer to avoid memory leaks
- for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
- if hasattr(metric, attribute):
- delattr(metric, attribute)
-
- clear_memory()
  break
  except Exception as e:
- # Clear the cache of the BERTScorer to avoid memory leaks
- if hasattr(metric, "cached_bertscorer"):
- del metric.cached_bertscorer
- clear_memory()
-
  oom_error = [
  "CUDA out of memory",
  "CUDA error",
@@ -111,16 +95,7 @@ def compute_metrics(
  if not any(error in str(e) for error in oom_error):
  raise InvalidBenchmark(str(e))

- if cfg.compute_kwargs.get("batch_size", 1) > 1:
- batch_size = cfg.compute_kwargs["batch_size"]
- cfg.compute_kwargs["batch_size"] = batch_size // 2
- logger.debug(
- "Out of memory error occurred during the computation of "
- f"the metric {cfg.pretty_name}. Reducing the batch size to "
- f"{cfg.compute_kwargs['batch_size']}."
- )
- elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
- cfg.compute_kwargs["batch_size"] = 32
+ if cfg.compute_kwargs.get("device", "cpu") != "cpu":
  cfg.compute_kwargs["device"] = "cpu"
  logger.debug(
  "Out of memory error occurred during the computation of "
@@ -129,6 +104,14 @@ def compute_metrics(
  )
  else:
  raise InvalidBenchmark(str(e))
+ finally:
+ for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
+ if hasattr(metric, attribute):
+ logger.debug(
+ f"Deleting the {attribute!r} attribute of the metric "
+ f"{cfg.pretty_name} to free up memory."
+ )
+ delattr(metric, attribute)

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
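
The hunk above replaces ad-hoc cache deletion with a single finally block, so attributes holding large scorer objects are released whether metric.compute succeeds or raises. A minimal sketch of the pattern with a dummy metric; the attribute name comes from the removed code above, while the metric class and the contents of METRIC_ATTRIBUTES_TAKING_UP_MEMORY are assumptions for illustration:

METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]  # assumed contents

class DummyMetric:
    """Stand-in for an evaluate metric that caches a large scorer object."""

    def compute(self) -> dict[str, float]:
        self.cached_bertscorer = object()  # stands in for a BERTScorer instance
        return {"f1": 1.0}

metric = DummyMetric()
try:
    score_dict = metric.compute()
finally:
    # Drop the cached attributes regardless of whether compute() succeeded.
    for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
        if hasattr(metric, attribute):
            delattr(metric, attribute)

assert not hasattr(metric, "cached_bertscorer")
print(score_dict)
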
euroeval/tasks.py CHANGED
@@ -142,7 +142,7 @@ SUMM = Task(
  huggingface_id="bertscore",
  results_key="f1",
  compute_kwargs=dict(
- model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=32
+ model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
  ),
  ),
  MetricConfig(

euroeval/tokenization_utils.py CHANGED
@@ -7,6 +7,7 @@ import typing as t
  import torch

  from .constants import TASK_GROUPS_USING_LOGPROBS
+ from .enums import GenerativeType
  from .exceptions import InvalidModel
  from .utils import log_once

@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
  from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.tokenization_utils_base import PreTrainedTokenizerBase

- from .data_models import DatasetConfig
+ from .data_models import DatasetConfig, ModelConfig


  logger = logging.getLogger("euroeval")
@@ -254,35 +255,50 @@ def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | N


  def get_first_label_token_mapping(
- dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+ dataset_config: "DatasetConfig",
+ model_config: "ModelConfig",
+ tokenizer: "PreTrainedTokenizer | None",
+ generative_type: "GenerativeType | None",
  ) -> dict[str, str] | bool:
  """Check if the model should output scores.

  Args:
  dataset_config:
  The dataset configuration.
+ model_config:
+ The model configuration.
  tokenizer:
  The tokenizer, or None if not available.
+ generative_type:
+ The generative type, or None if not available.

  Returns:
  A mapping from labels to the first token in each label, or alternatively a
  Boolean value indicating whether the model should output scores (if the mapping
  is outputted then the model will always output scores).
  """
+ if generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {model_config.model_id!r} is a reasoning model and "
+ "thus does not support logprobs, so we do not enable it.",
+ level=logging.DEBUG,
+ )
+ return False
+
  # If we do not have any tokenizer, then we cannot check if the model should output
  # scores and we just assume it should if the dataset supports it
  output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
  if tokenizer is None:
  if output_scores:
  log_once(
- "The model will output scores, since the dataset supports it and no "
- "tokenizer is available.",
+ f"The model {model_config.model_id!r} will output scores, since the "
+ "dataset supports it and no tokenizer is available.",
  level=logging.DEBUG,
  )
  else:
  log_once(
- "The model will not output scores, since the dataset does not support "
- "it and no tokenizer is available.",
+ f"The model {model_config.model_id!r} will not output scores, since "
+ "the dataset does not support it and no tokenizer is available.",
  level=logging.DEBUG,
  )
  return output_scores

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.6.1
+ Version: 15.7.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -32,7 +32,7 @@ Requires-Python: <4.0,>=3.10
  Requires-Dist: accelerate>=0.34.2
  Requires-Dist: bert-score>=0.3.13
  Requires-Dist: click>=8.1.3
- Requires-Dist: datasets>=2.15.0
+ Requires-Dist: datasets>=3.5.0
  Requires-Dist: demjson3>=3.0.6
  Requires-Dist: evaluate>=0.4.1
  Requires-Dist: huggingface-hub>=0.30.1
@@ -239,6 +239,18 @@ A huge thank you to all the contributors who have helped make this project a suc
  <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
  <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>

+
+ ### Contribute to EuroEval
+
+ We welcome contributions to EuroEval! Whether you're fixing bugs, adding features, or
+ contributing new datasets, your help makes this project better for everyone.
+
+ - **General contributions**: Check out our [contribution guidelines](CONTRIBUTING.md)
+ for information on how to get started.
+ - **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
+ a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
+
+
  ### Special Thanks
  - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).

@@ -1,11 +1,11 @@
  euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
  euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
- euroeval/benchmarker.py,sha256=7LVFr7zL7OeJPs7WVYwekNnEmiIKPXHydcbAkW99MUk,48080
+ euroeval/benchmarker.py,sha256=gOLNpW11cBX_8AvotnlGNbejtOM4acmXS3aovNREqhA,48434
  euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
- euroeval/constants.py,sha256=t2mAT8tE3Dn2lXWHTnaFoaOIaUcdiBjJTASCt7nSdkg,1984
- euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
- euroeval/data_models.py,sha256=oZLrGg1dhIIwbgtEzq4U_fu_ZbBsz35mrqsyizuZNPw,23138
+ euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
+ euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
+ euroeval/data_models.py,sha256=Nlb2s26u5OvQ2AITAt25NMpeI1IHM2_qqbpyU_bZhiY,22907
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
  euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
@@ -17,21 +17,22 @@ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
  euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
  euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
- euroeval/tasks.py,sha256=VVXFDcEM250KTGXd1pxQb8vwdia4ZJxgTUY5Kdsa-ik,7070
- euroeval/tokenization_utils.py,sha256=PNuS-FTdVrL9TWNDGlq42MvUggKwmyYM0BnC5I37IO0,11876
+ euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
+ euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487
  euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
  euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
  euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
  euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
- euroeval/benchmark_modules/litellm.py,sha256=wohdi1WoeJ-JEdQLgg2q3JbZJA77XO7yGZaTRvbRU4o,47575
- euroeval/benchmark_modules/vllm.py,sha256=FTpwal5WdrVsOpkjm_RXwf6-2PrNrrP1LO6BVGYb6GE,48086
+ euroeval/benchmark_modules/litellm.py,sha256=9Fhh7Zyn6F4JBlRoQkST1wIeb8z0YliRRrcmD5pONs4,52551
+ euroeval/benchmark_modules/vllm.py,sha256=vwAE7SGRhePqkzAt1S-FKPelEqe8VMGwah9Nj2J1hLs,51295
  euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
  euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
  euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
  euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
+ euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730
  euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
  euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
  euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
@@ -40,20 +41,20 @@ euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada
  euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
  euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=sx_WqLm7N6Thll6COUCCA0lXe9RMZ7WhoH6X498pixM,6232
- euroeval/prompt_templates/multiple_choice.py,sha256=H0CDQPs_WzgSJ7oI_FBzHs0TOF0Na2qZYJLhDC7S8tk,4710
- euroeval/prompt_templates/named_entity_recognition.py,sha256=T65oFEtVT8JRF9c7bq2nPm233rftPdEAGic0DU-toko,11835
- euroeval/prompt_templates/reading_comprehension.py,sha256=WbQoal_tjoTt7qsmSZXEWwlI77vgiANcZoZC1l1AZjc,6090
- euroeval/prompt_templates/sentiment_classification.py,sha256=LcFD89e5nMOv4u-Unj8_jHpNjKMmgKPEfz0-e39VbsM,6639
- euroeval/prompt_templates/summarization.py,sha256=eX0uUTf_5Xorm6f_TlBBNwLC9zKvR7YJkP0RSaLWgIw,4585
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
+ euroeval/prompt_templates/multiple_choice.py,sha256=6iEqiPpT-3WJN_gsyhyapnwsrcsYGdVkSkzwn-VKKxw,5101
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=Xd6gBJD2e1l8-We2Ujor7crRUBcbgnNeeVknBIrTMJo,12737
+ euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLOS7ZrDUeasuLFt-dtqCnYk,6585
+ euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
+ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
  euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
- euroeval/task_group_utils/sequence_classification.py,sha256=gqd0-l5o7vAY5QIpGSkSqwJwez3Y0r5SqOiywfPNW8A,12239
- euroeval/task_group_utils/text_to_text.py,sha256=QECnGdZ0YLjsbMc6LwXqVi4KMuITdiOjmJUNQtAAOW0,5712
+ euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
+ euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
  euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
- euroeval-15.6.1.dist-info/METADATA,sha256=4i98IBxn6yWh4ugBW-SnljmDfKEXBSfRGjZyf_dlOUs,13183
- euroeval-15.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.6.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.6.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
- euroeval-15.6.1.dist-info/RECORD,,
+ euroeval-15.7.0.dist-info/METADATA,sha256=8oMsbhHWeO7j4KQdn4lpt-O94Nw0erwRoD_Ogk6CX2U,13669
+ euroeval-15.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.7.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.7.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.7.0.dist-info/RECORD,,