EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,8 +2,8 @@
 
 import asyncio
 import collections.abc as c
+import json
 import logging
-import os
 import re
 import typing as t
 from functools import cache, cached_property, partial
@@ -38,7 +38,12 @@ from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 
-from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
+from ..constants import (
+    JSON_STRIP_CHARACTERS,
+    LITELLM_CLASSIFICATION_OUTPUT_KEY,
+    MAX_LITELLM_LOGPROBS,
+    REASONING_MAX_TOKENS,
+)
 from ..data_models import (
     BenchmarkConfig,
     DatasetConfig,
@@ -67,16 +72,18 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
+from ..tasks import NER
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
+    get_hf_token,
     log_once,
     safe_run,
 )
 from .base import BenchmarkModule
-from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
+from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -155,7 +162,7 @@ ALLOWED_PARAMS = {
     r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
     r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash-[0-9].*": ["no-thinking", "thinking"],
+    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
     # xAI models
     r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
@@ -181,6 +188,8 @@ class LiteLLMModel(BenchmarkModule):
         model_config: ModelConfig,
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
+        log_metadata: bool = True,
+        **generation_kwargs: dict[str, t.Any],
     ) -> None:
         """Initialise the model.
 
@@ -191,6 +200,11 @@ class LiteLLMModel(BenchmarkModule):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
+            generation_kwargs:
+                The generation kwargs to pass to the model. If None, default values will
+                be used.
         """
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
@@ -209,13 +223,16 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
+        self.generation_kwargs = generation_kwargs
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=None,
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
     @property
@@ -245,11 +262,12 @@ class LiteLLMModel(BenchmarkModule):
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
-        log_once(
-            f"Detected generative type {type_.name!r} for model "
-            f"{self.model_config.model_id!r}",
-            level=logging.DEBUG,
-        )
+        if self.log_metadata:
+            log_once(
+                f"Detected generative type {type_.name!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
         return type_
 
     def generate(self, inputs: dict) -> GenerativeModelOutput:
@@ -270,32 +288,11 @@ class LiteLLMModel(BenchmarkModule):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=None,
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
-        # Sanity check that "JSON" is included in the prompt, as some models require
-        # this
-        if self.dataset_config.task in TASKS_USING_JSON:
-            for conversation in conversations:
-                if not conversation:
-                    raise InvalidBenchmark(
-                        "Encountered an empty conversation in 'messages'."
-                    )
-                last_message = conversation[-1]
-                assert isinstance(last_message, dict), (
-                    f"Expected dict message, got {type(last_message)}"
-                )
-                assert "content" in last_message, (
-                    "Expected 'content' key in the last message of the conversation."
-                )
-                assert isinstance(last_message["content"], str), (
-                    "Expected 'content' to be a string."
-                )
-                assert "json" in last_message["content"].lower(), (
-                    "Prompt must contain 'json' for JSON tasks."
-                )
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -304,12 +301,16 @@ class LiteLLMModel(BenchmarkModule):
             if not conversations_to_run:
                 break
 
+            generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
+                dataset_config=self.dataset_config
+            )
+
             batch_indices, batch_conversations = zip(*conversations_to_run)
             successes, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
                     conversations=list(batch_conversations),
-                    **self.get_generation_kwargs(dataset_config=self.dataset_config),
+                    **generation_kwargs,
                 )
             )
 
@@ -336,11 +337,8 @@ class LiteLLMModel(BenchmarkModule):
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
            for _, error in failures:
-                self._handle_exception(
-                    error=error,
-                    generation_kwargs=self.get_generation_kwargs(
-                        dataset_config=self.dataset_config
-                    ),
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
                )
 
             # Sleep for a second to avoid pinging the API server too quickly
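The two hunks above rework the retry loop: `_handle_exception` now returns the adjusted generation kwargs instead of mutating a throwaway dict, and only the failed conversations are retried with whatever came back. For illustration, a minimal self-contained sketch of that pattern, assuming hypothetical `call_api` and `adjust_kwargs` helpers that are not part of EuroEval:

    import time
    import typing as t


    def call_api(conversation: list[dict[str, str]], **kwargs: t.Any) -> str:
        """Stand-in for the real completion call (router.acompletion in the diff)."""
        return "dummy response"


    def adjust_kwargs(error: Exception, **kwargs: t.Any) -> dict[str, t.Any]:
        """Drop parameters the provider rejected, loosely mirroring _handle_exception."""
        msg = str(error).lower()
        if "logprobs" in msg:
            kwargs.pop("logprobs", None)
            kwargs.pop("top_logprobs", None)
        elif "temperature" in msg:
            kwargs.pop("temperature", None)
        return kwargs


    def generate_with_retries(
        conversations: list[list[dict[str, str]]], **kwargs: t.Any
    ) -> dict[int, str]:
        responses: dict[int, str] = {}
        pending = list(enumerate(conversations))
        for _ in range(5):  # bounded number of retry rounds
            if not pending:
                break
            failures: list[tuple[int, Exception]] = []
            for idx, conversation in pending:
                try:
                    responses[idx] = call_api(conversation, **kwargs)
                except Exception as error:  # noqa: BLE001
                    failures.append((idx, error))
            # Each failure updates the kwargs; only the failed items are retried
            for _, error in failures:
                kwargs = adjust_kwargs(error, **kwargs)
            pending = failures
            if pending:
                time.sleep(1)  # avoid pinging the API too quickly
        return responses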
@@ -364,9 +362,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return model_output
 
-    def _handle_exception(
-        self, error: Exception, generation_kwargs: dict[str, t.Any]
-    ) -> None:
+    def _handle_exception(self, error: Exception, **generation_kwargs) -> dict:
         """Handle an exception from the model.
 
         Args:
@@ -374,6 +370,9 @@ class LiteLLMModel(BenchmarkModule):
                 The exception to handle.
             generation_kwargs:
                 The generation kwargs to pass to the model.
+
+        Returns:
+            The updated generation kwargs to pass to the model.
         """
         error_msg = str(error).lower()
         model_id = self.model_config.model_id
@@ -386,6 +385,9 @@ class LiteLLMModel(BenchmarkModule):
             "logprobs is not supported",
             "logprobs is not enabled",
         ]
+        logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'top_logprobs'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -403,6 +405,7 @@ class LiteLLMModel(BenchmarkModule):
             r"[0-9]+ and ([0-9]+)\."
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
+        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -411,9 +414,10 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["stop"] = None
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
+            or logprobs_pattern.search(string=error_msg)
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -425,7 +429,7 @@
             )
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -433,7 +437,7 @@
                 level=logging.DEBUG,
             )
             generation_kwargs.pop("temperature", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
             log_once(
                 f"The model {model_id!r} requires "
@@ -441,8 +445,11 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["temperature"] = 1.0
-            return
-        elif any(msg.lower() in error_msg for msg in max_items_messages):
+            return generation_kwargs
+        elif (
+            any(msg.lower() in error_msg for msg in max_items_messages)
+            and self.dataset_config.task == NER
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "maxItems in the JSON schema, so disabling it.",
@@ -454,7 +461,7 @@
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
             log_once(
                 f"The model {self.model_config.model_id!r} does not support "
@@ -462,7 +469,7 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["response_format"] = dict(type="json_object")
-            return
+            return generation_kwargs
         elif thinking_match := thinking_budget_pattern.search(string=error_msg):
             thinking_budget = int(thinking_match.group(1))
             if thinking_budget >= REASONING_MAX_TOKENS:
@@ -471,7 +478,7 @@
                     f"{thinking_budget:,} tokens, which is within the limit of "
                    f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
                    f"error message was: {error_msg}."
-                )
+                ) from error
             log_once(
                 f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
                 "for reasoning, which is less than the default of "
@@ -482,7 +489,7 @@
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=thinking_budget - 1
             )
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
             and self.generative_type != GenerativeType.REASONING
@@ -494,59 +501,73 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs["thinking"] = dict(type="disabled")
-            return
+            return generation_kwargs
+        elif re.search(pattern=seed_pattern, string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support the `seed` parameter, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("seed", None)
+            return generation_kwargs
+        # If there are too many I/O connections, we increase the number of allowed file
+        # descriptors
+        elif "too many open files" in error_msg:
+            raise InvalidBenchmark(
+                "There are too many file descriptors running. See the current "
+                "value by running `ulimit -n`. Try increasing it by running "
+                "`ulimit -n <new-value>` and try again."
+            ) from error
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
             logger.debug(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                f"Retrying in 5 seconds..."
+                "Retrying in 10 seconds..."
             )
-            sleep(5)
-            return
+            sleep(10)
+            return generation_kwargs
         elif isinstance(error, UnsupportedParamsError):
             unsupported_param_match = re.search(
                 pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
                 string=error.message,
             )
             if unsupported_param_match is None:
-                raise InvalidModel(error.message)
+                raise InvalidModel(error.message) from error
             else:
                 unsupported_param = unsupported_param_match.group(0)
                 raise InvalidModel(
                     f"The model {model_id!r} does not support the parameter "
                     f"{unsupported_param!r}. Try again without this parameter. "
                     "Skipping this model."
-                )
+                ) from error
         elif isinstance(error, (APIConnectionError, OSError)):
-            # If there are too many I/O connections, we increase the number of allowed
-            # file descriptors
-            if "too many open files" in error_msg:
-                raise InvalidBenchmark(
-                    "There are too many file descriptors running. See the current "
-                    "value by running `ulimit -n`. Try increasing it by running "
-                    "`ulimit -n <new-value>` and try again."
-                )
             raise InvalidBenchmark(
                 f"Encountered {type(error)} during generation: {error}."
-            )
+            ) from error
+
+        if isinstance(error, NotFoundError):
+            raise InvalidModel(
+                f"The model {model_id!r} was not found. Please check the model ID "
+                "and try again."
+            ) from error
 
         if isinstance(error, RateLimitError):
             raise InvalidModel(
                 f"You have encountered your rate limit for model {model_id!r}. "
                 "Skipping."
-            )
+            ) from error
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
                 cli_argument="--api-key",
                 script_argument="api_key=<your-api-key>",
                 run_with_cli=self.benchmark_config.run_with_cli,
-            )
+            ) from error
 
         raise InvalidBenchmark(
             f"Failed to generate text. The error message was: {error}"
-        )
+        ) from error
 
     async def _generate_async(
         self,
@@ -573,9 +594,9 @@ class LiteLLMModel(BenchmarkModule):
         # for all the requests, preventing "too many open files" errors
         router = Router(
             model_list=[
-                dict(
+                litellm.DeploymentTypedDict(
                     model_name=self.model_config.model_id,
-                    litellm_params=generation_kwargs,
+                    litellm_params=litellm.LiteLLMParamsTypedDict(model=model_id),
                 )
             ]
         )
@@ -585,7 +606,9 @@
         semaphore = asyncio.Semaphore(max_concurrent_calls)
         requests = [
             add_semaphore_and_catch_exception(
-                router.acompletion(model=model_id, messages=conversation),
+                router.acompletion(
+                    model=model_id, messages=conversation, **generation_kwargs
+                ),
                 semaphore=semaphore,
             )
             for conversation in conversations
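In the hunks above every `acompletion` call is wrapped in a shared `asyncio.Semaphore` via `add_semaphore_and_catch_exception`, so only a bounded number of requests are in flight and exceptions come back as values rather than aborting the gather. A rough sketch of that concurrency pattern with a stand-in coroutine (the names below are illustrative, not EuroEval's API):

    import asyncio
    import typing as t


    async def with_semaphore(
        coro: t.Awaitable[str], semaphore: asyncio.Semaphore
    ) -> str | Exception:
        """Run a coroutine under the shared concurrency limit, returning errors as values."""
        async with semaphore:
            try:
                return await coro
            except Exception as error:  # noqa: BLE001
                return error


    async def fake_completion(message: str) -> str:
        """Stand-in for the router.acompletion(...) call above."""
        await asyncio.sleep(0.01)
        return f"response to {message!r}"


    async def main() -> None:
        semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight at a time
        requests = [
            with_semaphore(fake_completion(msg), semaphore)
            for msg in ("first prompt", "second prompt", "third prompt")
        ]
        responses = await asyncio.gather(*requests)
        successes = [r for r in responses if not isinstance(r, Exception)]
        failures = [r for r in responses if isinstance(r, Exception)]
        print(f"{len(successes)} succeeded, {len(failures)} failed")


    asyncio.run(main())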
@@ -645,6 +668,23 @@
             generation_output = generated_message.content or ""
             generation_output = generation_output.strip()
 
+            # In the case where we're dealing with a classification task, the model is
+            # outputting a JSON dictionary, so we will extract the generated text from
+            # within the dictionary
+            generation_dct: dict[str, t.Any] | None = None
+            if LITELLM_CLASSIFICATION_OUTPUT_KEY in generation_output:
+                try:
+                    generation_dct = json.loads(generation_output)
+                    assert isinstance(generation_dct, dict)
+                    if set(generation_dct.keys()) == {
+                        LITELLM_CLASSIFICATION_OUTPUT_KEY
+                    }:
+                        generation_output = str(
+                            generation_dct[LITELLM_CLASSIFICATION_OUTPUT_KEY]
+                        ).strip()
+                except json.JSONDecodeError:
+                    pass
+
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
             if hasattr(model_response_choices, "logprobs"):
@@ -657,6 +697,23 @@
                     ]
                     for content in model_response_choices.logprobs.content or list()
                 ]
+
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
+                            lst
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
+
                 scores.append(logprobs_list)
             else:
                 log_once(
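With the new classification `response_format`, the model returns a single-key JSON dictionary; the two hunks above unwrap the label from that dictionary and drop the logprob entries that merely spell out the JSON key and punctuation, so the first remaining entry scores the label itself. A standalone sketch of the same unwrapping, assuming the output key is "label" (the real value comes from `LITELLM_CLASSIFICATION_OUTPUT_KEY` in `euroeval.constants`, and `JSON_STRIP_CHARACTERS` is only approximated here):

    import json

    OUTPUT_KEY = "label"  # assumed stand-in for LITELLM_CLASSIFICATION_OUTPUT_KEY
    STRIP_CHARACTERS = ' {}"\':,\n'  # assumed stand-in for JSON_STRIP_CHARACTERS

    raw_output = '{"label": "positive"}'

    generation_dct = None
    generation_output = raw_output.strip()
    if OUTPUT_KEY in generation_output:
        try:
            candidate = json.loads(generation_output)
            if isinstance(candidate, dict) and set(candidate) == {OUTPUT_KEY}:
                generation_dct = candidate
                generation_output = str(candidate[OUTPUT_KEY]).strip()
        except json.JSONDecodeError:
            pass

    # Logprobs arrive per generated token; drop the entries that only spell out the
    # JSON key and punctuation, so the first remaining entry scores the label itself.
    logprobs_list = [
        [('{"', -0.1)],
        [("label", -0.2)],
        [('":"', -0.3)],
        [("positive", -0.4)],
    ]
    if generation_dct:
        key_name = next(iter(generation_dct))
        logprobs_list = [
            lst
            for lst in logprobs_list
            if (
                lst
                and lst[0]
                and (token := lst[0][0].strip(STRIP_CHARACTERS))
                and not key_name.startswith(token)
            )
        ]

    print(generation_output)  # positive
    print(logprobs_list)      # [[('positive', -0.4)]]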
@@ -730,9 +787,7 @@
             repo_info = hf_api.model_info(
                 repo_id=model_id,
                 revision="main",
-                token=os.getenv("HUGGINGFACE_API_KEY")
-                or self.benchmark_config.api_key
-                or True,
+                token=get_hf_token(api_key=self.benchmark_config.api_key),
             )
         except (
             RepositoryNotFoundError,
@@ -789,7 +844,7 @@
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-        tokenizer = load_tokenizer(
+        tokeniser = load_tokeniser(
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -801,10 +856,10 @@
         ):
             vocab_size = hf_config.vocab_size
         elif (
-            hasattr(tokenizer, "vocab_size")
-            and tokenizer.vocab_size is not None
+            hasattr(tokeniser, "vocab_size")
+            and tokeniser.vocab_size is not None
         ):
-            vocab_size = tokenizer.vocab_size
+            vocab_size = tokeniser.vocab_size
         else:
             vocab_size = -1
         return vocab_size
@@ -835,13 +890,15 @@
             if context_length_keys:
                 context_length = model_info[context_length_keys[0]]
                 if context_length is not None:
-                    log_once(
-                        f"Detected context length key {context_length_keys[0]!r} "
-                        f"for Ollama model {ollama_model_id!r}",
-                        level=logging.DEBUG,
-                    )
+                    if self.log_metadata:
+                        log_once(
+                            f"Detected context length key "
+                            f"{context_length_keys[0]!r} for Ollama model "
+                            f"{ollama_model_id!r}",
+                            level=logging.DEBUG,
+                        )
                     return int(context_length)
-            else:
+            elif self.log_metadata:
                 log_once(
                     f"Tried to get the maximum length of the Ollama model "
                     f"{ollama_model_id!r}, but could not find a context length. "
@@ -869,7 +926,7 @@
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-        tokenizer = load_tokenizer(
+        tokeniser = load_tokeniser(
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -877,18 +934,18 @@
 
         all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the tokenizer
+        # Add the registered max length of the tokeniser
         if hasattr(
-            tokenizer, "model_max_length"
-        ) and tokenizer.model_max_length < int(1e30):
-            all_max_lengths.append(tokenizer.model_max_length)
+            tokeniser, "model_max_length"
+        ) and tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(tokeniser.model_max_length)
 
         # Add the max length derived from the model's input sizes
-        if hasattr(tokenizer, "max_model_input_sizes"):
+        if hasattr(tokeniser, "max_model_input_sizes"):
             all_max_lengths.extend(
                 [
                     size
-                    for size in tokenizer.max_model_input_sizes.values()
+                    for size in tokeniser.max_model_input_sizes.values()
                     if size is not None
                 ]
             )
@@ -1026,7 +1083,7 @@
                     f"Service temporarily unavailable. The error message was: {e}. "
                     "Retrying in 10 seconds..."
                 )
-                sleep(5)
+                sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
@@ -1136,7 +1193,10 @@
 
         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -1149,7 +1209,7 @@
                 dataset_config=self.dataset_config,
                 instruction_model=True,
                 always_populate_text_field=False,
-                tokenizer=None,
+                tokeniser=None,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -1174,7 +1234,6 @@
         """
         # Set the core generation arguments
         generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
             max_completion_tokens=(
                 REASONING_MAX_TOKENS
                 if self.generative_type == GenerativeType.REASONING
@@ -1191,7 +1250,7 @@
 
         # Set up the `response_format` generation argument if we are dealing with a task
         # using structured generation
-        if dataset_config.task in TASKS_USING_JSON:
+        if dataset_config.task.uses_structured_output:
             if self.generative_type == GenerativeType.REASONING:
                 log_once(
                     f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -1200,12 +1259,21 @@
                     level=logging.DEBUG,
                 )
             elif supports_response_schema(model=self.model_config.model_id):
-                ner_tag_names = list(dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                if dataset_config.task == NER:
+                    ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                    keys_and_their_types: dict[str, t.Any] = {
+                        tag_name: (conlist(str, max_length=5), ...)
+                        for tag_name in ner_tag_names
+                    }
+                    pydantic_class = create_model(
+                        "AnswerFormat", **keys_and_their_types
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "This task requires structured generation, but it has not "
+                        "been implemented for this task yet. Please open an issue "
+                        "at https://github.com/EuroEval/EuroEval/issues."
+                    )
                 generation_kwargs["response_format"] = pydantic_class
                 log_once(
                     "Enabling structured generation for model "
@@ -1221,6 +1289,16 @@
                     "the model does not support schemas.",
                     level=logging.DEBUG,
                 )
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            localised_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            keys_and_their_types = {
+                LITELLM_CLASSIFICATION_OUTPUT_KEY: (t.Literal[*localised_labels], ...)
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
 
         # If the model is an Ollama reasoning model, we ensure that thinking is enabled
         if self.is_ollama and self.generative_type == GenerativeType.REASONING:
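The `response_format` schemas above are built dynamically with pydantic: a `conlist`-per-tag model for NER, and a single `Literal`-typed key over the localised labels for classification tasks that use logprobs. A small sketch of both constructions, where the "label" key name stands in for `LITELLM_CLASSIFICATION_OUTPUT_KEY` and the tag and label values are made up for illustration:

    import typing as t

    from pydantic import conlist, create_model

    # NER: one key per localised entity tag, each holding at most five strings.
    ner_tag_names = ["person", "location", "organisation", "miscellaneous"]
    ner_format = create_model(
        "NERAnswerFormat",
        **{tag: (conlist(str, max_length=5), ...) for tag in ner_tag_names},
    )

    # Classification: a single key whose value must be one of the localised labels.
    # The Literal unpacking matches the diff and needs Python 3.11+.
    localised_labels = ["positive", "neutral", "negative"]
    classification_format = create_model(
        "ClassificationAnswerFormat", label=(t.Literal[*localised_labels], ...)
    )

    print(ner_format.model_json_schema())
    print(classification_format.model_json_schema())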
@@ -1234,7 +1312,7 @@
         # Handle manually set parameters
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+            generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
         if self.model_config.revision == "thinking":
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
@@ -1260,7 +1338,7 @@
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        test_conversation = [
+        test_conversation: list[litellm.AllMessageValues] = [
             litellm.ChatCompletionUserMessage(role="user", content="Test message")
         ]
         for _ in range(5):
@@ -1274,7 +1352,9 @@
             if not failures:
                 break
             for _, error in failures:
-                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
 
         return generation_kwargs
 
@@ -1350,11 +1430,11 @@ def try_download_ollama_model(model_id: str) -> bool:
            for model_obj in ollama.list().models
            if model_obj.model is not None
        ]
-    except ConnectionError:
+    except ConnectionError as e:
        raise InvalidModel(
            "Ollama does not seem to be running, so we cannot evaluate the model "
            f"{model_id!r}. Please make sure that Ollama is running and try again."
-        )
+        ) from e
 
    ollama_model_id = "/".join(model_id.split("/")[1:])
    if ollama_model_id not in downloaded_ollama_models:
@@ -1384,12 +1464,12 @@ def try_download_ollama_model(model_id: str) -> bool:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {inner_e}"
-                )
+                ) from inner_e
            else:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {e}"
-                )
+                ) from e
 
        # Download the model
        with tqdm(