EuroEval 15.6.0__py3-none-any.whl → 15.7.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in the supported public registries, and is provided for informational purposes only.

@@ -33,6 +33,7 @@ from litellm.exceptions import (
  )
  from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.types.utils import ChoiceLogprobs, ModelResponse
+ from pydantic import conlist, create_model
  from requests.exceptions import RequestException
  from tqdm.auto import tqdm
  from transformers.trainer import Trainer
@@ -104,6 +105,7 @@ MODEL_MAX_LENGTH_MAPPING = {
  r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
  r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
  r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+ r"gpt-4.1.*": 1_047_576,
  # Anthropic models
  r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
  # Gemini models
@@ -135,20 +137,23 @@ ALLOWED_PARAMS = {
  r"gpt-4.*": [],
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
  # Anthropic models
- r"(anthropic/)?claude-3-.*": [],
- r"(anthropic/)?claude-3.5-.*": [],
- r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
+ r"(anthropic/)?claude-3-(haiku|sonnet|opus).*": [],
+ r"(anthropic/)?claude-3-5-.*": [],
+ r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
  # Gemini models
  r"(gemini/)?gemini-.*": [],
  # xAI models
- r"(xai/)?grok.*": [],
+ r"(xai/)?grok-2.*": [],
+ r"(xai/)?grok-3(-fast)?(-beta)?": [],
+ r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
  }


  REASONING_MODELS = [
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
  r"(gemini/)?gemini.*thinking.*",
- r"(gemini/)?gemini-2.5-pro.*",
+ r"(gemini/)?gemini-2.5.*",
+ r"(xai/)?grok-3-mini.*",
  ]


@@ -190,7 +195,10 @@ class LiteLLMModel(BenchmarkModule):
  )

  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=None
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=None,
+ generative_type=self.generative_type,
  )

  @property
@@ -201,13 +209,20 @@ class LiteLLMModel(BenchmarkModule):
  The generative type of the model, or None if it has not been set yet.
  """
  if self.model_config.revision == "thinking":
- return GenerativeType.REASONING
+ type_ = GenerativeType.REASONING
  elif re.fullmatch(
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
  ):
- return GenerativeType.REASONING
+ type_ = GenerativeType.REASONING
  else:
- return GenerativeType.INSTRUCTION_TUNED
+ type_ = GenerativeType.INSTRUCTION_TUNED
+
+ log_once(
+ f"Detected generative type {type_.name!r} for model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+ return type_

  def generate(self, inputs: dict) -> GenerativeModelOutput:
  """Generate outputs from the model.
@@ -243,7 +258,10 @@ class LiteLLMModel(BenchmarkModule):
  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=None
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=None,
+ generative_type=self.generative_type,
  )

  if self.buffer["first_label_token_mapping"]:
@@ -254,16 +272,41 @@ class LiteLLMModel(BenchmarkModule):
  assert "json" in messages[0]["content"].lower(), (
  "Prompt must contain 'json' for JSON tasks."
  )
- generation_kwargs["response_format"] = dict(type="json_object")
- log_once(
- "Enabling JSON response format for model "
- f"{self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ elif litellm.utils.supports_response_schema(
+ model=self.model_config.model_id
+ ):
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ generation_kwargs["response_format"] = pydantic_class
+ log_once(
+ "Enabling structured generation for model "
+ f"{self.model_config.model_id!r} with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )
+ else:
+ generation_kwargs["response_format"] = dict(type="json_object")
+ log_once(
+ "Enabling structured JSON generation for model "
+ f"{self.model_config.model_id!r} with no custom JSON schema, as "
+ "the model does not support schemas.",
+ level=logging.DEBUG,
+ )

  if self.model_config.revision == "thinking":
  generation_kwargs["thinking"] = dict(
- type="enabled", budget_tokens=REASONING_MAX_TOKENS
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
  )
  log_once(
  f"Enabling thinking mode for model {self.model_config.model_id!r}",
@@ -280,28 +323,42 @@ class LiteLLMModel(BenchmarkModule):
  # This drops generation kwargs that are not supported by the model
  litellm.drop_params = True

+ # Error messages that we want to catch and handle
+ stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
+ logprobs_messages = [
+ "you are not allowed to request logprobs",
+ "you've reached the maximum number of requests with logprobs",
+ "logprobs is not supported",
+ "logprobs is not enabled",
+ ]
+ temperature_messages = [
+ "'temperature' is not supported with this model.",
+ "temperature is not supported with this model",
+ ]
+ temperature_must_be_one_messages = [
+ "`temperature` may only be set to 1",
+ "'temperature' does not support 0.0 with this model. Only the default "
+ "(1) value is supported",
+ ]
+ max_items_messages = ["'maxItems' is not permitted."]
+ no_json_schema_messages = ["Property keys should match pattern"]
+
  # Extract the generated sequences from the model response. Some APIs cannot
  # handle using newlines as stop sequences, so we try both.
  num_attempts = 10
  for _ in range(num_attempts):
- stop_messages = ["stop_sequences"]
- logprobs_messages = [
- "you are not allowed to request logprobs",
- "you've reached the maximum number of requests with logprobs",
- "logprobs is not supported",
- "logprobs is not enabled",
- ]
- temperature_messages = [
- "'temperature' is not supported with this model.",
- "temperature is not supported with this model",
- ]
  try:
- model_response = litellm.completion(
- messages=messages, max_retries=3, **generation_kwargs
+ model_response = litellm.completion_with_retries(
+ messages=messages, **generation_kwargs
  )
  break
  except (BadRequestError, RateLimitError) as e:
  if any(msg.lower() in str(e).lower() for msg in stop_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "stop sequences, so disabling them.",
+ level=logging.DEBUG,
+ )
  generation_kwargs["stop"] = None
  elif (
  any(msg.lower() in str(e).lower() for msg in logprobs_messages)
@@ -310,10 +367,55 @@ class LiteLLMModel(BenchmarkModule):
  # we ignore this since the rate limiting makes it unusable anyway.
  or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
  ):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "logprobs, so disabling it.",
+ level=logging.DEBUG,
+ )
  generation_kwargs.pop("logprobs")
  generation_kwargs.pop("top_logprobs")
  elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "temperature, so disabling it.",
+ level=logging.DEBUG,
+ )
  generation_kwargs.pop("temperature")
+ elif any(
+ msg.lower() in str(e).lower()
+ for msg in temperature_must_be_one_messages
+ ):
+ log_once(
+ f"The model {self.model_config.model_id!r} requires "
+ "temperature to be set to 1, so setting it.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["temperature"] = 1.0
+ elif any(msg.lower() in str(e).lower() for msg in max_items_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "maxItems in the JSON schema, so disabling it.",
+ level=logging.DEBUG,
+ )
+ ner_tag_names = list(
+ self.dataset_config.prompt_label_mapping.values()
+ )
+ keys_and_their_types = {
+ tag_name: (list[str], ...) for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model(
+ "AnswerFormat", **keys_and_their_types
+ )
+ generation_kwargs["response_format"] = pydantic_class
+ elif any(
+ msg.lower() in str(e).lower() for msg in no_json_schema_messages
+ ):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "JSON schemas, so using the vanilla JSON format.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["response_format"] = dict(type="json_object")
  elif isinstance(e, RateLimitError):
  raise InvalidModel(
  "You have encountered your rate limit for model "
@@ -332,6 +434,7 @@ class LiteLLMModel(BenchmarkModule):
  Timeout,
  ServiceUnavailableError,
  InternalServerError,
+ SystemError,
  ) as e:
  logger.debug(
  f"Service temporarily unavailable. The error message was: {e}. "
@@ -359,9 +462,11 @@ class LiteLLMModel(BenchmarkModule):
  "reasoning. Returning an empty string."
  )
  return GenerativeModelOutput(sequences=[""])
+
  model_response_choices = model_response.choices[0]
  assert isinstance(model_response_choices, litellm.Choices)
- generation_output = model_response_choices.message["content"] or ""
+ generated_message: litellm.Message = model_response_choices.message
+ generation_output = generated_message.content or ""
  generation_output = generation_output.strip()

  # Structure the model output as a GenerativeModelOutput object
@@ -132,7 +132,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  self._model: LLM = model
  self._tokenizer: PreTrainedTokenizer = tokenizer
  self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
- model=self._model, tokenizer=self._tokenizer
+ model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
  )

  # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +146,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  self.buffer |= dict(
  instruction_model=self._tokenizer.chat_template is not None,
  first_label_token_mapping=get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  ),
  )
  if self.model_config.adapter_base_model_id is not None:
@@ -332,30 +335,40 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

+ logits_processor = None
  if self.dataset_config.task in TASKS_USING_JSON:
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
- )
- log_once(
- "Using structured generation with the schema "
- f"{pydantic_class.model_json_schema()}",
- level=logging.DEBUG,
- )
- else:
- logits_processor = None
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ else:
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ logits_processor = JSONLogitsProcessor(
+ schema=pydantic_class,
+ tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
+ whitespace_pattern=r" ?",
+ )
+ log_once(
+ "Using structured generation with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )

  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  )

  # Define the parameters used for vLLM generation
@@ -391,7 +404,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  ) and should_prompts_be_stripped(
  labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
  ):
- log_once(message="Stripping prompts.", level=logging.DEBUG)
+ log_once(
+ f"Stripping prompts for model {self.model_config.model_id!r}.",
+ level=logging.DEBUG,
+ )
  prompts = [prompt.strip() for prompt in prompts]

  # Generate sequences using vLLM
@@ -411,18 +427,64 @@ class VLLMModel(HuggingFaceEncoderModel):
  f"Encountered error during vLLM generation: {str(e)}. Retrying..."
  )
  sleep(1)
+ except ValueError as e:
+ # Truncate the prompts if they are too long for the model
+ truncate_error_messages = [
+ r"prompt \(length [0-9]+\) is longer than the maximum model length"
+ ]
+ if any(
+ re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+ for pattern in truncate_error_messages
+ ):
+ logger.info(
+ "Prompts are too long, so truncating them and trying again..."
+ )
+ tokenized_prompts = self._tokenizer(
+ text=prompts,
+ truncation=True,
+ max_length=max(
+ self._tokenizer.model_max_length - max_tokens, 0
+ ),
+ )
+ prompts = self._tokenizer.batch_decode(
+ sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+ )
+ else:
+ raise InvalidBenchmark(
+ f"An error occurred during vLLM generation: {str(e)}"
+ )
  else:
  raise InvalidBenchmark(
  f"Could not generate sequences after {num_attempts} attempts."
  )

+ # When we shorten the prompts then some residual model outputs persist, so we
+ # need to filter these out
+ num_extra_outputs = len(raw_outputs) - len(prompts)
+ if num_extra_outputs > 0:
+ raw_outputs = raw_outputs[num_extra_outputs:]
+ if not all(
+ raw_output.prompt == prompt
+ for raw_output, prompt in zip(raw_outputs, prompts)
+ ):
+ raise InvalidBenchmark(
+ f"The prompts and the model outputs do not match. There were "
+ f"{num_extra_outputs!r} extra outputs."
+ )
+ else:
+ logger.debug(
+ f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+ "which occured as we interupted the generation when we truncated "
+ "the prompts."
+ )
+
  # Parse the raw model outputs
  completion_ids: list[list[int]] = [
  output.outputs[0].token_ids for output in raw_outputs
  ]
  if self.end_of_reasoning_token_id in completion_ids[0]:
  completion_ids = [
- token_ids[token_ids.index(self.end_of_reasoning_token_id) + 2 :]
+ token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
  if self.end_of_reasoning_token_id in token_ids
  else token_ids
  for token_ids in completion_ids
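
The new ValueError branch above truncates over-long prompts with the tokenizer and retries. A minimal standalone sketch of that truncation step, using a Hugging Face tokenizer directly (the model name and lengths are made up; downloading the tokenizer needs network access):

    from transformers import AutoTokenizer

    # Illustrative values; in the hunk above these come from the benchmark setup.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.model_max_length = 1024
    max_tokens = 256
    prompts = ["word " * 5_000]  # deliberately longer than the model context

    # Truncate each prompt so that prompt plus generation budget fits in the context.
    tokenized_prompts = tokenizer(
        text=prompts,
        truncation=True,
        max_length=max(tokenizer.model_max_length - max_tokens, 0),
    )
    prompts = tokenizer.batch_decode(
        sequences=tokenized_prompts.input_ids, skip_special_tokens=True
    )
    print(len(tokenized_prompts.input_ids[0]))  # 768 == 1024 - 256
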
@@ -435,6 +497,13 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
  completions = [completion.strip() for completion in completions]

+ # Sanity check
+ if len(completions) != len(prompts):
+ breakpoint()
+ raise InvalidBenchmark(
+ f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+ )
+
  # Add logprobs scores to the output
  if self.buffer["first_label_token_mapping"]:
  scores: list[list[list[tuple[str, float]]]] = [
@@ -809,7 +878,8 @@ class VLLMModel(HuggingFaceEncoderModel):
  if name.lower() in language_codes:
  chat_template = candidate_template
  log_once(
- f"Using the {name!r} chat template for the tokenizer.",
+ f"Using the {name!r} chat template for the tokenizer for "
+ f"model {self.model_config.model_id!r}.",
  level=logging.DEBUG,
  )
  break
@@ -1169,7 +1239,7 @@ def clear_vllm() -> None:


  def get_end_of_reasoning_token_id(
- model: "LLM", tokenizer: "PreTrainedTokenizer"
+ model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
  ) -> int | None:
  """Get the end of reasoning token ID for a generative model.

@@ -1182,6 +1252,8 @@ def get_end_of_reasoning_token_id(
  The vLLM model.
  tokenizer:
  The tokenizer.
+ model_id:
+ The model ID.

  Returns:
  The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1292,8 @@ def get_end_of_reasoning_token_id(
  completion_match = re.search(pattern=r"<\w+>", string=completion)
  if completion_match is None and prompt_match is None:
  log_once(
- message=(
- "Could not find a reasoning token, so assuming the model is not a "
- "reasoning model."
- ),
+ f"Could not find a reasoning token for model {model_id!r}, so assuming "
+ "the model is not a reasoning model.",
  level=logging.DEBUG,
  )
  return None
@@ -1249,20 +1319,17 @@ def get_end_of_reasoning_token_id(
  or end_of_reasoning_token not in special_tokens
  ):
  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}, but one of them is not registered "
- "as a special token, so assuming it is not a real reasoning token."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+ "them is not registered as a special token, so assuming it is not a "
+ "real reasoning token.",
  level=logging.DEBUG,
  )
  return None

  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}.",
  level=logging.DEBUG,
  )

euroeval/benchmarker.py CHANGED
@@ -782,7 +782,11 @@ class Benchmarker:
  dataset_languages=[
  language.code for language in dataset_config.languages
  ],
- model=model_config.model_id,
+ model=(
+ f"{model_config.model_id}@{model_config.revision}"
+ if model_config.revision and model_config.revision != "main"
+ else model_config.model_id
+ ),
  results=results,
  num_model_parameters=model.num_params,
  max_sequence_length=model.model_max_length,
@@ -1076,6 +1080,10 @@ def initial_logging(
  benchmark_config:
  The general benchmark configuration.
  """
+ model_id = model_config.model_id
+ if model_config.revision and model_config.revision != "main":
+ model_id += f"@{model_config.revision}"
+
  split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
  if model_config.task in GENERATIVE_PIPELINE_TAGS:
  if benchmark_config.few_shot:
@@ -1084,8 +1092,9 @@ def initial_logging(
  eval_type = "Zero-shot benchmarking"
  else:
  eval_type = "Benchmarking"
+
  logger.info(
- f"{eval_type} {model_config.model_id} on the {split_type} split of "
+ f"{eval_type} {model_id} on the {split_type} split of "
  f"{dataset_config.pretty_name}"
  )

@@ -1095,6 +1104,7 @@ def initial_logging(
  "meaning that the resulting evaluation will not be included in the "
  "official leaderboard."
  )
+
  if benchmark_config.debug:
  logger.info(
  "Running in debug mode. This will output additional information, as "
euroeval/constants.py CHANGED
@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000

  # We need to raise the amount of tokens generated for reasoning models, to give them
  # time to think
- REASONING_MAX_TOKENS = 8_192
+ REASONING_MAX_TOKENS = 32_768


  # The Hugging Face Hub pipeline tags used to classify models as generative
euroeval/data_loading.py CHANGED
@@ -39,32 +39,9 @@ def load_data(
  HuggingFaceHubDown:
  If the Hugging Face Hub is down.
  """
- num_attempts = 5
- for _ in range(num_attempts):
- try:
- dataset = load_dataset(
- path=dataset_config.huggingface_id,
- cache_dir=benchmark_config.cache_dir,
- token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
- )
- break
- except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
- logger.warning(
- f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
- )
- time.sleep(1)
- continue
- except HfHubHTTPError:
- raise HuggingFaceHubDown()
- else:
- raise InvalidBenchmark(
- f"Failed to load dataset {dataset_config.huggingface_id!r} after "
- f"{num_attempts} attempts."
- )
-
- assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
-
- dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})
+ dataset = load_raw_data(
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+ )

  if not benchmark_config.evaluate_test_split:
  dataset["test"] = dataset["val"]
@@ -101,3 +78,48 @@ def load_data(
  for idx in range(benchmark_config.num_iterations)
  ]
  return datasets
+
+
+ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+ """Load the raw dataset.
+
+ Args:
+ dataset_config:
+ The configuration for the dataset.
+ cache_dir:
+ The directory to cache the dataset.
+
+ Returns:
+ The dataset.
+ """
+ num_attempts = 5
+ for _ in range(num_attempts):
+ try:
+ dataset = load_dataset(
+ path=dataset_config.huggingface_id,
+ cache_dir=cache_dir,
+ token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
+ )
+ break
+ except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
+ logger.warning(
+ f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
+ )
+ time.sleep(1)
+ continue
+ except HfHubHTTPError:
+ raise HuggingFaceHubDown()
+ else:
+ raise InvalidBenchmark(
+ f"Failed to load dataset {dataset_config.huggingface_id!r} after "
+ f"{num_attempts} attempts."
+ )
+ assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
+ required_keys = ["train", "val", "test"]
+ missing_keys = [key for key in required_keys if key not in dataset]
+ if missing_keys:
+ raise InvalidBenchmark(
+ "The dataset is missing the following required splits: "
+ f"{', '.join(missing_keys)}"
+ )
+ return DatasetDict({key: dataset[key] for key in required_keys})
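
The new load_raw_data keeps the original for/else retry loop: the else branch runs only when the loop finishes without a break, i.e. when every attempt failed. A minimal standalone demonstration of that idiom (the flaky operation is invented for illustration):

    import random

    # Minimal demonstration of the for/else retry idiom used in load_raw_data above.
    def flaky_operation() -> str:
        if random.random() < 0.5:
            raise ConnectionError("transient failure")
        return "loaded"

    num_attempts = 5
    for attempt in range(num_attempts):
        try:
            result = flaky_operation()
            break  # success: the else branch below is skipped
        except ConnectionError as e:
            print(f"Attempt {attempt + 1} failed: {e}. Retrying...")
    else:
        # Runs only if the loop completed without a break, i.e. every attempt failed.
        raise RuntimeError(f"Failed after {num_attempts} attempts.")

    print(result)
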