EuroEval 16.0.1-py3-none-any.whl → 16.1.0-py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (48)
  1. euroeval/benchmark_config_factory.py +6 -1
  2. euroeval/benchmark_modules/base.py +2 -0
  3. euroeval/benchmark_modules/fresh.py +7 -1
  4. euroeval/benchmark_modules/hf.py +26 -21
  5. euroeval/benchmark_modules/litellm.py +258 -131
  6. euroeval/benchmark_modules/vllm.py +79 -40
  7. euroeval/benchmarker.py +11 -2
  8. euroeval/cli.py +14 -1
  9. euroeval/constants.py +1 -1
  10. euroeval/data_models.py +77 -6
  11. euroeval/dataset_configs/__init__.py +1 -0
  12. euroeval/dataset_configs/danish.py +14 -0
  13. euroeval/dataset_configs/dutch.py +14 -0
  14. euroeval/dataset_configs/english.py +22 -0
  15. euroeval/dataset_configs/estonian.py +15 -7
  16. euroeval/dataset_configs/finnish.py +14 -0
  17. euroeval/dataset_configs/french.py +14 -0
  18. euroeval/dataset_configs/german.py +23 -0
  19. euroeval/dataset_configs/italian.py +14 -0
  20. euroeval/dataset_configs/latvian.py +14 -0
  21. euroeval/dataset_configs/norwegian.py +14 -0
  22. euroeval/dataset_configs/polish.py +126 -0
  23. euroeval/dataset_configs/portuguese.py +14 -0
  24. euroeval/dataset_configs/spanish.py +14 -0
  25. euroeval/dataset_configs/swedish.py +25 -0
  26. euroeval/enums.py +12 -0
  27. euroeval/generation.py +17 -8
  28. euroeval/generation_utils.py +58 -10
  29. euroeval/metrics/pipeline.py +1 -1
  30. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  31. euroeval/prompt_templates/multiple_choice.py +27 -1
  32. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  33. euroeval/prompt_templates/reading_comprehension.py +11 -0
  34. euroeval/prompt_templates/sentiment_classification.py +15 -0
  35. euroeval/prompt_templates/summarization.py +27 -1
  36. euroeval/scores.py +5 -0
  37. euroeval/task_group_utils/question_answering.py +29 -29
  38. euroeval/task_group_utils/sequence_classification.py +10 -33
  39. euroeval/task_group_utils/token_classification.py +3 -3
  40. euroeval/tasks.py +4 -4
  41. euroeval/{tokenization_utils.py → tokenisation_utils.py} +40 -23
  42. euroeval/utils.py +36 -3
  43. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/METADATA +1 -1
  44. euroeval-16.1.0.dist-info/RECORD +70 -0
  45. euroeval-16.0.1.dist-info/RECORD +0 -69
  46. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
  47. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
  48. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py
@@ -31,7 +31,7 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
@@ -65,7 +65,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import apply_prompt, extract_few_shot_examples
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -73,7 +77,7 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tasks import NER
-from ..tokenization_utils import get_first_label_token_mapping
+from ..tokenisation_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
@@ -81,6 +85,7 @@ from ..utils import (
     get_hf_token,
     log_once,
     safe_run,
+    split_model_id,
 )
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
@@ -153,21 +158,6 @@ NUM_PARAMS_MAPPING = {
 }
 
 
-ALLOWED_PARAMS = {
-    # OpenAI models
-    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
-    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
-    # Anthropic models
-    r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
-    r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
-    # Gemini models
-    r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
-    # xAI models
-    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
-}
-
-
 REASONING_MODELS = [
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
     r"(gemini/)?gemini.*thinking.*",
@@ -175,6 +165,15 @@ REASONING_MODELS = [
     r"(xai/)?grok-3-mini.*",
 ]
 
+BASE_DECODER_MODELS = [
+    r"gpt-3.5-turbo-instruct.*",
+    r"ada-[0-9]{3}",
+    r"babbage-[0-9]{3}",
+    r"curie-[0-9]{3}",
+    r"davinci-[0-9]{3}",
+    r"text-davinci-[0-9]{3}",
+]
+
 
 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -182,6 +181,26 @@ class LiteLLMModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False
+    allowed_params = {
+        # OpenAI models
+        re.compile(r"gpt-5-.*"): ["minimal", "low", "medium", "high"],
+        re.compile(r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?"): [
+            "low",
+            "medium",
+            "high",
+        ],
+        # Anthropic models
+        re.compile(r"(anthropic/)?claude-3-7-sonnet.*"): ["no-thinking", "thinking"],
+        re.compile(r"(anthropic/)?claude-(sonnet|opus)-4.*"): [
+            "no-thinking",
+            "thinking",
+        ],
+        # Gemini models
+        re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+        # xAI models
+        re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
+    }
 
     def __init__(
         self,
@@ -206,6 +225,10 @@ class LiteLLMModel(BenchmarkModule):
                 The generation kwargs to pass to the model. If None, default values will
                 be used.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -217,8 +240,6 @@ class LiteLLMModel(BenchmarkModule):
             else ollama.ShowResponse(model_info=None)
         )
 
-        raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
-
         super().__init__(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -242,21 +263,27 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if self.is_ollama:
+        if self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.is_ollama:
             reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
             type_ = (
                 GenerativeType.REASONING
                 if reasoning_model
                 else GenerativeType.INSTRUCTION_TUNED
             )
-        elif self.model_config.revision in {"thinking"}:
+        elif self.model_config.param in {"thinking"}:
             type_ = GenerativeType.REASONING
-        elif self.model_config.revision in {"no-thinking"}:
+        elif self.model_config.param in {"no-thinking"}:
             type_ = GenerativeType.INSTRUCTION_TUNED
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
             type_ = GenerativeType.REASONING
+        elif re.fullmatch(
+            pattern="|".join(BASE_DECODER_MODELS), string=self.model_config.model_id
+        ):
+            type_ = GenerativeType.BASE
         elif supports_reasoning(model=self.model_config.model_id):
             type_ = GenerativeType.REASONING
         else:
@@ -279,9 +306,20 @@ class LiteLLMModel(BenchmarkModule):
 
         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the inputs do not contain either 'messages' or 'text' keys.
         """
-        assert "messages" in inputs, "The input must contain a 'messages' key."
-        conversations: list[list[litellm.AllMessageValues]] = inputs["messages"]
+        model_inputs: list[list[litellm.AllMessageValues] | str]
+        if "messages" in inputs:
+            model_inputs = inputs["messages"]
+        elif "text" in inputs:
+            model_inputs = inputs["text"]
+        else:
+            raise InvalidBenchmark(
+                "The inputs must contain either 'messages' or 'text' keys."
+            )
 
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
@@ -294,22 +332,22 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         all_responses: dict[int, "ModelResponse"] = {}
-        conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
-            enumerate(conversations)
+        inputs_to_run: list[tuple[int, list[litellm.AllMessageValues] | str]] = list(
+            enumerate(model_inputs)
         )
         for attempt in range(num_attempts := 10):
-            if not conversations_to_run:
+            if not inputs_to_run:
                 break
 
             generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
                 dataset_config=self.dataset_config
             )
 
-            batch_indices, batch_conversations = zip(*conversations_to_run)
+            batch_indices, batch_inputs = zip(*inputs_to_run)
             successes, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
-                    conversations=list(batch_conversations),
+                    inputs=list(batch_inputs),
                     **generation_kwargs,
                 )
             )
@@ -321,17 +359,17 @@ class LiteLLMModel(BenchmarkModule):
 
             # If all requests were successful, break
             if not failures:
-                conversations_to_run = []
+                inputs_to_run = []
                 break
 
             # Put the failed requests back in the queue to try again
-            conversations_to_run = [
-                (batch_indices[idx], conversations[batch_indices[idx]])
+            inputs_to_run = [
+                (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
             logger.debug(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(conversations_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s)"
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
@@ -349,14 +387,14 @@ class LiteLLMModel(BenchmarkModule):
             )
 
         # Extract the generations from the model output
-        ordered_responses = [all_responses[i] for i in range(len(conversations))]
+        ordered_responses = [all_responses[i] for i in range(len(model_inputs))]
         model_output = self._create_model_output(
             model_responses=ordered_responses, model_id=self.model_config.model_id
         )
 
-        if len(conversations) != len(model_output.sequences):
+        if len(model_inputs) != len(model_output.sequences):
             raise InvalidBenchmark(
-                f"Number of model inputs ({len(conversations):,}) does not match the "
+                f"Number of model inputs ({len(model_inputs):,}) does not match the "
                 f"number of model outputs ({len(model_output.sequences):,})."
             )
 
@@ -378,16 +416,24 @@ class LiteLLMModel(BenchmarkModule):
         model_id = self.model_config.model_id
 
         # Error messages that we want to catch and handle
-        stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
+        stop_messages = [
+            "stop_sequences",
+            "'stop' is not supported with this model",
+            "'$.stop' is invalid",
+        ]
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
         ]
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
+        max_completion_tokens_pattern = re.compile(
+            r"does not support parameters: \[.*'max_completion_tokens'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -406,6 +452,10 @@ class LiteLLMModel(BenchmarkModule):
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
+        response_format_messages = [
+            "got an unexpected keyword argument 'response_format'",
+            "The model outputs empty dictionaries.",
+        ]
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -430,6 +480,24 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
             return generation_kwargs
+        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+            log_once(
+                f"The model {model_id!r} does not support the `top_logprobs` argument, "
+                "so moving the value to `logprobs`.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = generation_kwargs.pop("top_logprobs", None)
+            return generation_kwargs
+        elif max_completion_tokens_pattern.search(string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support max_completion_tokens, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["max_tokens"] = generation_kwargs.pop(
+                "max_completion_tokens", None
+            )
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -510,6 +578,14 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs.pop("seed", None)
             return generation_kwargs
+        elif any(msg.lower() in error_msg for msg in response_format_messages):
+            log_once(
+                f"The model {model_id!r} does not support the `response_format` "
+                "parameter, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("response_format", None)
+            return generation_kwargs
         # If there are too many I/O connections, we increase the number of allowed file
         # descriptors
         elif "too many open files" in error_msg:
@@ -572,7 +648,7 @@ class LiteLLMModel(BenchmarkModule):
     async def _generate_async(
         self,
         model_id: str,
-        conversations: list[list[litellm.AllMessageValues]],
+        inputs: list[list[litellm.AllMessageValues] | str],
         **generation_kwargs,
     ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.
@@ -580,8 +656,8 @@ class LiteLLMModel(BenchmarkModule):
         Args:
             model_id:
                 The ID of the model to use for generation.
-            conversations:
-                The conversations to pass to the model.
+            inputs:
+                The inputs to pass to the model.
             **generation_kwargs:
                 Additional generation arguments to pass to the model.
 
@@ -604,17 +680,51 @@ class LiteLLMModel(BenchmarkModule):
         # Get the LLM generations asynchronously
         max_concurrent_calls = 20
         semaphore = asyncio.Semaphore(max_concurrent_calls)
-        requests = [
-            add_semaphore_and_catch_exception(
-                router.acompletion(
-                    model=model_id, messages=conversation, **generation_kwargs
-                ),
-                semaphore=semaphore,
-            )
-            for conversation in conversations
-        ]
+        if self.generative_type == GenerativeType.BASE:
+            if not all(isinstance(input_, str) for input_ in inputs):
+                raise InvalidBenchmark(
+                    "For base generative models, all inputs must be strings."
+                )
+            requests = [
+                add_semaphore_and_catch_exception(
+                    router.atext_completion(
+                        model=model_id, prompt=input_, **generation_kwargs
+                    ),
+                    semaphore=semaphore,
+                )
+                for input_ in inputs
+                if isinstance(input_, str)
+            ]
+        else:
+            if not all(isinstance(input_, list) for input_ in inputs):
+                raise InvalidBenchmark(
+                    "For instruction-tuned and reasoning generative models, all "
+                    "inputs must be lists of messages."
+                )
+            requests = [
+                add_semaphore_and_catch_exception(
+                    router.acompletion(
+                        model=model_id, messages=input_, **generation_kwargs
+                    ),
+                    semaphore=semaphore,
+                )
+                for input_ in inputs
+                if isinstance(input_, list)
+            ]
         responses = await tqdm_async.gather(*requests, leave=False)
 
+        # If we are performing structured generation and the model just outputs an empty
+        # dictionary, then we convert those to exceptions, to disable structured
+        # generation
+        if "response_format" in generation_kwargs:
+            responses = [
+                RuntimeError("The model outputs empty dictionaries.")
+                if not isinstance(response, Exception)
+                and any(choice.message.content == "{}" for choice in response.choices)
+                else response
+                for response in responses
+            ]
+
         # Separate the successful responses from the failed ones
         successes = [
             (idx, response)
@@ -630,7 +740,10 @@ class LiteLLMModel(BenchmarkModule):
         # Close connections
         for request in requests:
             if hasattr(request, "close"):
-                request.close()
+                try:
+                    request.close()
+                except RuntimeError as e:
+                    logger.debug(f"RuntimeError during request.close(): {e}")
 
         return successes, failures
 
@@ -663,10 +776,18 @@ class LiteLLMModel(BenchmarkModule):
                 continue
 
             model_response_choices = model_response.choices[0]
-            assert isinstance(model_response_choices, litellm.Choices)
-            generated_message: litellm.Message = model_response_choices.message
-            generation_output = generated_message.content or ""
-            generation_output = generation_output.strip()
+
+            if isinstance(model_response_choices, litellm.Choices):
+                generated_message: litellm.Message = model_response_choices.message
+                generation_output = generated_message.content or ""
+                generation_output = generation_output.strip()
+            elif isinstance(model_response_choices, litellm.litellm.TextChoices):
+                generation_output = model_response_choices.text or ""
+            else:
+                raise InvalidBenchmark(
+                    "The model response choices must be of type Choices or "
+                    f"TextChoices. Got {type(model_response_choices)}."
+                )
 
             # In the case where we're dealing with a classification task, the model is
             # outputting a JSON dictionary, so we will extract the generated text from
@@ -687,40 +808,55 @@ class LiteLLMModel(BenchmarkModule):
 
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
-            if hasattr(model_response_choices, "logprobs"):
+            if (
+                hasattr(model_response_choices, "logprobs")
+                and model_response_choices.logprobs is not None
+            ):
                 logprobs_obj = model_response_choices.logprobs
+
+                if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
+                    log_once(
+                        "The logprobs object is malformed, so we won't use logprobs to "
+                        "determine the labels.",
+                        level=logging.WARNING,
+                    )
+                    continue
+
+                logprobs_list: list[list[tuple[str, float]]]
                 if isinstance(logprobs_obj, ChoiceLogprobs):
-                    logprobs_list: list[list[tuple[str, float]]] = [
+                    logprobs_list = [
                         [
                             (top_logprob.token, top_logprob.logprob)
                             for top_logprob in content.top_logprobs
                         ]
-                        for content in model_response_choices.logprobs.content or list()
+                        for content in logprobs_obj.content or list()
+                    ]
+                else:
+                    logprobs_list = [
+                        [
+                            (token, logprob)
+                            for token, logprob in (top_logprobs_dct or dict()).items()
+                        ]
+                        for top_logprobs_dct in logprobs_obj.top_logprobs or list()
                     ]
 
-                    # If the model outputted a JSON dictionary, we need to find the
-                    # token index of the value within the dictionary, rather than the
-                    # first token of the entire output
-                    if generation_dct:
-                        key_name = next(iter(generation_dct.keys()))
-                        logprobs_list = [
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
                             lst
-                            for lst in logprobs_list
-                            if (
-                                lst
-                                and lst[0]
-                                and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
-                                and not key_name.startswith(token)
-                            )
-                        ]
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
 
-                    scores.append(logprobs_list)
-                else:
-                    log_once(
-                        "The logprobs object is malformed, so we won't use logprobs to "
-                        "determine the labels.",
-                        level=logging.WARNING,
-                    )
+                scores.append(logprobs_list)
 
         if not sequences:
             logger.warning(
@@ -1047,7 +1183,7 @@ class LiteLLMModel(BenchmarkModule):
             Whether the model exists, or an error describing why we cannot check
             whether the model exists.
         """
-        model_id, _ = model_id.split("@") if "@" in model_id else (model_id, "main")
+        model_id = split_model_id(model_id=model_id).model_id
         if model_id in litellm.model_list:
             return True
 
@@ -1135,10 +1271,29 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The model configuration.
         """
-        model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
+        model_id_components = split_model_id(model_id=model_id)
+
+        # Backwards compatibility: If the revision is set but not the parameter, we
+        # assume that the revision is actually the parameter and log this as a warning.
+        if model_id_components.revision != "main" and model_id_components.param is None:
+            proper_model_id = (
+                f"{model_id_components.model_id}#{model_id_components.revision}"
+            )
+            log_once(
+                f"The model ID {model_id!r} specifies a revision "
+                f"{model_id_components.revision!r} but not a parameter. We assume "
+                "that the revision is actually the parameter and set the revision "
+                "to 'main'. In the future, use the new '#' syntax to specify the "
+                f"parameter (in this case, this would be {proper_model_id!r}), as this "
+                "will be an error in future versions of EuroEval."
+            )
+            model_id_components.param = model_id_components.revision
+            model_id_components.revision = "main"
+
         return ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task="text-generation",
             languages=list(),
             merge=False,
@@ -1207,7 +1362,7 @@ class LiteLLMModel(BenchmarkModule):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                instruction_model=True,
+                generative_type=self.generative_type,
                 always_populate_text_field=False,
                 tokeniser=None,
             ),
@@ -1313,7 +1468,7 @@ class LiteLLMModel(BenchmarkModule):
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
             generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
-        if self.model_config.revision == "thinking":
+        if self.model_config.param == "thinking":
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
             )
@@ -1321,16 +1476,16 @@ class LiteLLMModel(BenchmarkModule):
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.revision == "no-thinking":
+        elif self.model_config.param == "no-thinking":
             generation_kwargs["thinking"] = dict(budget_tokens=0)
             log_once(
                 f"Disabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
-            generation_kwargs["reasoning_effort"] = self.model_config.revision
+        elif self.model_config.param in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.param
             log_once(
-                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"Enabling reasoning effort {self.model_config.param!r} for model "
                 f"{self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
@@ -1338,14 +1493,18 @@ class LiteLLMModel(BenchmarkModule):
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        test_conversation: list[litellm.AllMessageValues] = [
-            litellm.ChatCompletionUserMessage(role="user", content="Test message")
-        ]
-        for _ in range(5):
+        test_input: list[litellm.AllMessageValues] | str
+        if self.generative_type == GenerativeType.BASE:
+            test_input = "Test message"
+        else:
+            test_input = [
+                litellm.ChatCompletionUserMessage(role="user", content="Test message")
+            ]
+        for _ in range(num_attempts := 10):
             _, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
-                    conversations=[test_conversation],
+                    inputs=[test_input],
                     **generation_kwargs,
                 )
             )
@@ -1355,47 +1514,15 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs = self._handle_exception(
                 error=error, **generation_kwargs
             )
+        else:
+            raise InvalidModel(
+                "Failed to get a successful response from the model "
+                f"{self.model_config.model_id!r} after {num_attempts} attempts."
+            )
 
         return generation_kwargs
 
 
-def raise_if_wrong_params(
-    model_config: ModelConfig, allowed_params: dict[str, list[str]]
-) -> None:
-    """Raise an error if the model configuration has invalid parameters.
-
-    Args:
-        model_config:
-            The model configuration.
-        allowed_params:
-            The allowed parameters for the model.
-
-    Raises:
-        InvalidModel:
-            If the model configuration has invalid parameters.
-    """
-    param = model_config.revision
-    if param == "":
-        return
-    for model_regex, allowed_params_list in allowed_params.items():
-        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
-            if param not in allowed_params_list:
-                msg = (
-                    f"Invalid parameter {param!r} for model {model_config.model_id!r}."
-                )
-                if allowed_params_list:
-                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
-                else:
-                    msg += " No parameters are allowed."
-                raise InvalidModel(msg)
-            return
-    else:
-        raise InvalidModel(
-            f"The parameter {param!r} is not supported for the model "
-            f"{model_config.model_id!r}."
-        )
-
-
 def try_download_ollama_model(model_id: str) -> bool:
     """Try to download an Ollama model.
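The else: branch added to the test-run loop relies on Python's for-else semantics: the else block runs only if the loop finishes without a break, i.e. when none of the attempts produced a successful response. A minimal standalone illustration of that control flow:

def probe(num_attempts: int, succeeds_on: int | None) -> str:
    for attempt in range(num_attempts):
        if succeeds_on is not None and attempt == succeeds_on:
            result = f"success on attempt {attempt + 1}"
            break  # skips the else-block below
    else:
        # Only reached when the loop was never broken, i.e. every attempt failed.
        raise RuntimeError(f"no successful response after {num_attempts} attempts")
    return result


print(probe(10, succeeds_on=2))
# probe(10, succeeds_on=None) would raise RuntimeError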