EuroEval 15.15.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

This version of EuroEval has been flagged as potentially problematic.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +323 -193
  7. euroeval/benchmark_modules/vllm.py +166 -112
  8. euroeval/benchmarker.py +59 -33
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +8 -7
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -468
  61. euroeval-15.15.0.dist-info/RECORD +0 -63
  62. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
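The diff body that follows covers euroeval/benchmark_modules/litellm.py (+323 -193). One change that recurs throughout the hunks is exception chaining: re-raises that were bare in 15.15.0 now use "raise ... from <original error>", so the root cause stays attached to the new exception. Below is a minimal, self-contained sketch of that pattern; the InvalidModel class and the check_ollama_running function are stand-ins written for illustration, not imports from EuroEval.

    # Sketch of the exception-chaining pattern adopted in 16.0.0 (illustration only).
    class InvalidModel(Exception):
        """Stand-in for EuroEval's InvalidModel exception."""

    def check_ollama_running() -> None:
        try:
            raise ConnectionError("daemon not running")  # simulated failure
        except ConnectionError as e:
            # 15.15.0 raised the new error bare; 16.0.0 chains it with "from e",
            # which preserves the original error as __cause__.
            raise InvalidModel("Ollama does not seem to be running.") from e

    try:
        check_ollama_running()
    except InvalidModel as err:
        print(err, "| caused by:", repr(err.__cause__))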
@@ -2,11 +2,11 @@
 
 import asyncio
 import collections.abc as c
+import json
 import logging
-import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep
 
 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
     RateLimitError,
     ServiceUnavailableError,
     Timeout,
+    UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -37,7 +38,12 @@ from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 
-from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
+from ..constants import (
+    JSON_STRIP_CHARACTERS,
+    LITELLM_CLASSIFICATION_OUTPUT_KEY,
+    MAX_LITELLM_LOGPROBS,
+    REASONING_MAX_TOKENS,
+)
 from ..data_models import (
     BenchmarkConfig,
     DatasetConfig,
@@ -66,16 +72,18 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
+from ..tasks import NER
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
+    get_hf_token,
     log_once,
     safe_run,
 )
 from .base import BenchmarkModule
-from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
+from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -87,6 +95,7 @@ logger = logging.getLogger("euroeval")
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +114,7 @@ VOCAB_SIZE_MAPPING = {
 
 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +139,7 @@ MODEL_MAX_LENGTH_MAPPING = {
 
 NUM_PARAMS_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -144,13 +155,14 @@ NUM_PARAMS_MAPPING = {
 
 ALLOWED_PARAMS = {
     # OpenAI models
+    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
     r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
     r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
     r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash-[0-9].*": ["no-thinking", "thinking"],
+    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
     # xAI models
     r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
@@ -176,6 +188,8 @@ class LiteLLMModel(BenchmarkModule):
         model_config: ModelConfig,
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
+        log_metadata: bool = True,
+        **generation_kwargs: dict[str, t.Any],
     ) -> None:
         """Initialise the model.
 
@@ -186,6 +200,11 @@
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
+            generation_kwargs:
+                The generation kwargs to pass to the model. If None, default values
+                will be used.
         """
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
@@ -204,13 +223,16 @@
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
+        self.generation_kwargs = generation_kwargs
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=None,
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
     @property
@@ -240,11 +262,12 @@
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
-        log_once(
-            f"Detected generative type {type_.name!r} for model "
-            f"{self.model_config.model_id!r}",
-            level=logging.DEBUG,
-        )
+        if self.log_metadata:
+            log_once(
+                f"Detected generative type {type_.name!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
         return type_
 
     def generate(self, inputs: dict) -> GenerativeModelOutput:
@@ -265,132 +288,11 @@
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=None,
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
-        # Set the core generation arguments
-        generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
-            max_completion_tokens=(
-                REASONING_MAX_TOKENS
-                if self.generative_type == GenerativeType.REASONING
-                else self.dataset_config.max_generated_tokens
-            ),
-            stop=[],
-            temperature=0.0,
-            seed=4242,
-            api_key=self.benchmark_config.api_key,
-            api_base=self.benchmark_config.api_base,
-            api_version=self.benchmark_config.api_version,
-            max_retries=3,
-        )
-
-        # Set up the `response_format` generation argument if we are dealing with a task
-        # using structured generation
-        if self.dataset_config.task in TASKS_USING_JSON:
-            # Sanity check that "JSON" is included in the prompt, as some models require
-            # this
-            for conversation in conversations:
-                if not conversation:
-                    raise InvalidBenchmark(
-                        "Encountered an empty conversation in 'messages'."
-                    )
-                last_message = conversation[-1]
-                assert isinstance(last_message, dict), (
-                    f"Expected dict message, got {type(last_message)}"
-                )
-                assert "content" in last_message, (
-                    "Expected 'content' key in the last message of the conversation."
-                )
-                assert isinstance(last_message["content"], str), (
-                    "Expected 'content' to be a string."
-                )
-                assert "json" in last_message["content"].lower(), (
-                    "Prompt must contain 'json' for JSON tasks."
-                )
-
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            elif supports_response_schema(model=self.model_config.model_id):
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-                generation_kwargs["response_format"] = pydantic_class
-                log_once(
-                    "Enabling structured generation for model "
-                    f"{self.model_config.model_id!r} with the JSON schema "
-                    f"{pydantic_class.model_json_schema()}",
-                    level=logging.DEBUG,
-                )
-            else:
-                generation_kwargs["response_format"] = dict(type="json_object")
-                log_once(
-                    "Enabling structured JSON generation for model "
-                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
-                    "the model does not support schemas.",
-                    level=logging.DEBUG,
-                )
-
-        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
-        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-            generation_kwargs["think"] = True
-            log_once(
-                "Enabling thinking mode for Ollama model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Handle manually set parameters
-        if self.buffer["first_label_token_mapping"]:
-            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-        if self.model_config.revision == "thinking":
-            generation_kwargs["thinking"] = dict(
-                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-            )
-            log_once(
-                f"Enabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(budget_tokens=0)
-            log_once(
-                f"Disabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision in {"low", "medium", "high"}:
-            generation_kwargs["reasoning_effort"] = self.model_config.revision
-            log_once(
-                f"Enabling reasoning effort {self.model_config.revision!r} for model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Drop generation kwargs that are not supported by the model
-        litellm.drop_params = True
-
-        # First attempt is a test run with a single conversation to handle errors
-        # quickly
-        test_conversation = conversations[0]
-        _, failures = safe_run(
-            self._generate_async(
-                model_id=self.model_config.model_id,
-                conversations=[test_conversation],
-                **generation_kwargs,
-            )
-        )
-        for _, error in failures:
-            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -399,6 +301,10 @@
             if not conversations_to_run:
                 break
 
+            generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
+                dataset_config=self.dataset_config
+            )
+
             batch_indices, batch_conversations = zip(*conversations_to_run)
             successes, failures = safe_run(
                 self._generate_async(
@@ -431,7 +337,9 @@
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
             for _, error in failures:
-                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
 
             # Sleep for a second to avoid pinging the API server too quickly
             sleep(1)
@@ -454,9 +362,7 @@
 
         return model_output
 
-    def _handle_exception(
-        self, error: Exception, generation_kwargs: dict[str, t.Any]
-    ) -> None:
+    def _handle_exception(self, error: Exception, **generation_kwargs) -> dict:
         """Handle an exception from the model.
 
         Args:
@@ -464,6 +370,9 @@
                 The exception to handle.
             generation_kwargs:
                 The generation kwargs to pass to the model.
+
+        Returns:
+            The updated generation kwargs to pass to the model.
         """
         error_msg = str(error).lower()
         model_id = self.model_config.model_id
@@ -476,6 +385,9 @@
             "logprobs is not supported",
             "logprobs is not enabled",
         ]
+        logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'top_logprobs'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -484,6 +396,7 @@
             "`temperature` may only be set to 1",
             "'temperature' does not support 0.0 with this model. Only the default "
             "(1) value is supported",
+            "Only temperature=1 is supported",
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
@@ -492,6 +405,7 @@
             r"[0-9]+ and ([0-9]+)\."
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
+        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -500,9 +414,10 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["stop"] = None
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
+            or logprobs_pattern.search(string=error_msg)
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -514,7 +429,7 @@
             )
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -522,7 +437,7 @@
                 level=logging.DEBUG,
             )
             generation_kwargs.pop("temperature", None)
-            return
+            return generation_kwargs
        elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
             log_once(
                 f"The model {model_id!r} requires "
@@ -530,8 +445,11 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["temperature"] = 1.0
-            return
-        elif any(msg.lower() in error_msg for msg in max_items_messages):
+            return generation_kwargs
+        elif (
+            any(msg.lower() in error_msg for msg in max_items_messages)
+            and self.dataset_config.task == NER
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "maxItems in the JSON schema, so disabling it.",
@@ -543,7 +461,7 @@
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
             log_once(
                 f"The model {self.model_config.model_id!r} does not support "
@@ -551,7 +469,7 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["response_format"] = dict(type="json_object")
-            return
+            return generation_kwargs
         elif thinking_match := thinking_budget_pattern.search(string=error_msg):
             thinking_budget = int(thinking_match.group(1))
             if thinking_budget >= REASONING_MAX_TOKENS:
@@ -560,7 +478,7 @@
                     f"{thinking_budget:,} tokens, which is within the limit of "
                     f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
                     f"error message was: {error_msg}."
-                )
+                ) from error
             log_once(
                 f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
                 "for reasoning, which is less than the default of "
@@ -571,7 +489,7 @@
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=thinking_budget - 1
             )
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
             and self.generative_type != GenerativeType.REASONING
@@ -583,45 +501,73 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["thinking"] = dict(type="disabled")
-            return
+            return generation_kwargs
+        elif re.search(pattern=seed_pattern, string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support the `seed` parameter, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("seed", None)
+            return generation_kwargs
+        # If there are too many I/O connections, we increase the number of allowed file
+        # descriptors
+        elif "too many open files" in error_msg:
+            raise InvalidBenchmark(
+                "There are too many file descriptors running. See the current "
+                "value by running `ulimit -n`. Try increasing it by running "
+                "`ulimit -n <new-value>` and try again."
+            ) from error
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
             logger.debug(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                f"Retrying in 5 seconds..."
+                "Retrying in 10 seconds..."
             )
-            sleep(5)
-            return
+            sleep(10)
+            return generation_kwargs
+        elif isinstance(error, UnsupportedParamsError):
+            unsupported_param_match = re.search(
+                pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+                string=error.message,
+            )
+            if unsupported_param_match is None:
+                raise InvalidModel(error.message) from error
+            else:
+                unsupported_param = unsupported_param_match.group(0)
+                raise InvalidModel(
+                    f"The model {model_id!r} does not support the parameter "
+                    f"{unsupported_param!r}. Try again without this parameter. "
+                    "Skipping this model."
+                ) from error
         elif isinstance(error, (APIConnectionError, OSError)):
-            # If there are too many I/O connections, we increase the number of allowed
-            # file descriptors
-            if "too many open files" in error_msg:
-                raise InvalidBenchmark(
-                    "There are too many file descriptors running. See the current "
-                    "value by running `ulimit -n`. Try increasing it by running "
-                    "`ulimit -n <new-value>` and try again."
-                )
             raise InvalidBenchmark(
                 f"Encountered {type(error)} during generation: {error}."
-            )
+            ) from error
+
+        if isinstance(error, NotFoundError):
+            raise InvalidModel(
+                f"The model {model_id!r} was not found. Please check the model ID "
+                "and try again."
+            ) from error
 
         if isinstance(error, RateLimitError):
             raise InvalidModel(
                 f"You have encountered your rate limit for model {model_id!r}. "
                 "Skipping."
-            )
+            ) from error
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
                 cli_argument="--api-key",
                 script_argument="api_key=<your-api-key>",
                 run_with_cli=self.benchmark_config.run_with_cli,
-            )
+            ) from error
 
         raise InvalidBenchmark(
             f"Failed to generate text. The error message was: {error}"
-        )
+        ) from error
 
     async def _generate_async(
         self,
@@ -648,9 +594,9 @@
         # for all the requests, preventing "too many open files" errors
         router = Router(
             model_list=[
-                dict(
+                litellm.DeploymentTypedDict(
                     model_name=self.model_config.model_id,
-                    litellm_params=generation_kwargs,
+                    litellm_params=litellm.LiteLLMParamsTypedDict(model=model_id),
                 )
             ]
         )
@@ -660,7 +606,9 @@
         semaphore = asyncio.Semaphore(max_concurrent_calls)
         requests = [
             add_semaphore_and_catch_exception(
-                router.acompletion(model=model_id, messages=conversation),
+                router.acompletion(
+                    model=model_id, messages=conversation, **generation_kwargs
+                ),
                 semaphore=semaphore,
             )
             for conversation in conversations
@@ -720,6 +668,23 @@
             generation_output = generated_message.content or ""
             generation_output = generation_output.strip()
 
+            # In the case where we're dealing with a classification task, the model is
+            # outputting a JSON dictionary, so we will extract the generated text from
+            # within the dictionary
+            generation_dct: dict[str, t.Any] | None = None
+            if LITELLM_CLASSIFICATION_OUTPUT_KEY in generation_output:
+                try:
+                    generation_dct = json.loads(generation_output)
+                    assert isinstance(generation_dct, dict)
+                    if set(generation_dct.keys()) == {
+                        LITELLM_CLASSIFICATION_OUTPUT_KEY
+                    }:
+                        generation_output = str(
+                            generation_dct[LITELLM_CLASSIFICATION_OUTPUT_KEY]
+                        ).strip()
+                except json.JSONDecodeError:
+                    pass
+
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
             if hasattr(model_response_choices, "logprobs"):
@@ -732,6 +697,23 @@
                     ]
                     for content in model_response_choices.logprobs.content or list()
                 ]
+
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
+                            lst
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
+
                 scores.append(logprobs_list)
             else:
                 log_once(
@@ -805,9 +787,7 @@
             repo_info = hf_api.model_info(
                 repo_id=model_id,
                 revision="main",
-                token=os.getenv("HUGGINGFACE_API_KEY")
-                or self.benchmark_config.api_key
-                or True,
+                token=get_hf_token(api_key=self.benchmark_config.api_key),
             )
         except (
             RepositoryNotFoundError,
@@ -864,7 +844,7 @@
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-        tokenizer = load_tokenizer(
+        tokeniser = load_tokeniser(
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -876,10 +856,10 @@
         ):
             vocab_size = hf_config.vocab_size
         elif (
-            hasattr(tokenizer, "vocab_size")
-            and tokenizer.vocab_size is not None
+            hasattr(tokeniser, "vocab_size")
+            and tokeniser.vocab_size is not None
         ):
-            vocab_size = tokenizer.vocab_size
+            vocab_size = tokeniser.vocab_size
         else:
             vocab_size = -1
         return vocab_size
@@ -910,13 +890,15 @@
             if context_length_keys:
                 context_length = model_info[context_length_keys[0]]
                 if context_length is not None:
-                    log_once(
-                        f"Detected context length key {context_length_keys[0]!r} "
-                        f"for Ollama model {ollama_model_id!r}",
-                        level=logging.DEBUG,
-                    )
+                    if self.log_metadata:
+                        log_once(
+                            f"Detected context length key "
+                            f"{context_length_keys[0]!r} for Ollama model "
+                            f"{ollama_model_id!r}",
+                            level=logging.DEBUG,
+                        )
                     return int(context_length)
-            else:
+            elif self.log_metadata:
                 log_once(
                     f"Tried to get the maximum length of the Ollama model "
                     f"{ollama_model_id!r}, but could not find a context length. "
@@ -944,7 +926,7 @@
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-        tokenizer = load_tokenizer(
+        tokeniser = load_tokeniser(
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -952,18 +934,18 @@
 
         all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the tokenizer
+        # Add the registered max length of the tokeniser
         if hasattr(
-            tokenizer, "model_max_length"
-        ) and tokenizer.model_max_length < int(1e30):
-            all_max_lengths.append(tokenizer.model_max_length)
+            tokeniser, "model_max_length"
+        ) and tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(tokeniser.model_max_length)
 
         # Add the max length derived from the model's input sizes
-        if hasattr(tokenizer, "max_model_input_sizes"):
+        if hasattr(tokeniser, "max_model_input_sizes"):
            all_max_lengths.extend(
                 [
                     size
-                    for size in tokenizer.max_model_input_sizes.values()
+                    for size in tokeniser.max_model_input_sizes.values()
                     if size is not None
                 ]
             )
@@ -1101,7 +1083,7 @@
                     f"Service temporarily unavailable. The error message was: {e}. "
                     "Retrying in 10 seconds..."
                 )
-                sleep(5)
+                sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
@@ -1211,7 +1193,10 @@
 
         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -1224,7 +1209,7 @@
                 dataset_config=self.dataset_config,
                 instruction_model=True,
                 always_populate_text_field=False,
-                tokenizer=None,
+                tokeniser=None,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -1233,6 +1218,146 @@
 
         return dataset
 
+    @cache
+    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+        """Get the generation arguments for the model.
+
+        Args:
+            dataset_config:
+                The dataset configuration, which is used to determine the generative
+                type of the model. We use this as an argument here rather than using
+                `self.dataset_config` to ensure that the cache is updated when the
+                dataset configuration changes.
+
+        Returns:
+            The generation arguments for the model.
+        """
+        # Set the core generation arguments
+        generation_kwargs: dict[str, t.Any] = dict(
+            max_completion_tokens=(
+                REASONING_MAX_TOKENS
+                if self.generative_type == GenerativeType.REASONING
+                else dataset_config.max_generated_tokens
+            ),
+            stop=[],
+            temperature=0.0,
+            seed=4242,
+            api_key=self.benchmark_config.api_key,
+            api_base=self.benchmark_config.api_base,
+            api_version=self.benchmark_config.api_version,
+            max_retries=3,
+        )
+
+        # Set up the `response_format` generation argument if we are dealing with a task
+        # using structured generation
+        if dataset_config.task.uses_structured_output:
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            elif supports_response_schema(model=self.model_config.model_id):
+                if dataset_config.task == NER:
+                    ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                    keys_and_their_types: dict[str, t.Any] = {
+                        tag_name: (conlist(str, max_length=5), ...)
+                        for tag_name in ner_tag_names
+                    }
+                    pydantic_class = create_model(
+                        "AnswerFormat", **keys_and_their_types
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "This task requires structured generation, but it has not "
+                        "been implemented for this task yet. Please open an issue "
+                        "at https://github.com/EuroEval/EuroEval/issues."
+                    )
+                generation_kwargs["response_format"] = pydantic_class
+                log_once(
+                    "Enabling structured generation for model "
+                    f"{self.model_config.model_id!r} with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
+            else:
+                generation_kwargs["response_format"] = dict(type="json_object")
+                log_once(
+                    "Enabling structured JSON generation for model "
+                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
+                    "the model does not support schemas.",
+                    level=logging.DEBUG,
+                )
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            localised_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            keys_and_their_types = {
+                LITELLM_CLASSIFICATION_OUTPUT_KEY: (t.Literal[*localised_labels], ...)
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
+
+        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+            generation_kwargs["think"] = True
+            log_once(
+                "Enabling thinking mode for Ollama model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # Handle manually set parameters
+        if self.buffer["first_label_token_mapping"]:
+            generation_kwargs["logprobs"] = True
+            generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
+        if self.model_config.revision == "thinking":
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+            )
+            log_once(
+                f"Enabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.revision
+            log_once(
+                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly. We repeat this multiple times to deal with different types of
+        # errors, and stop if we get a successful response.
+        test_conversation: list[litellm.AllMessageValues] = [
+            litellm.ChatCompletionUserMessage(role="user", content="Test message")
+        ]
+        for _ in range(5):
+            _, failures = safe_run(
+                self._generate_async(
+                    model_id=self.model_config.model_id,
+                    conversations=[test_conversation],
+                    **generation_kwargs,
+                )
+            )
+            if not failures:
+                break
+            for _, error in failures:
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
+
+        return generation_kwargs
+
 
 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
@@ -1264,6 +1389,11 @@ def raise_if_wrong_params(
                 msg += " No parameters are allowed."
             raise InvalidModel(msg)
         return
+    else:
+        raise InvalidModel(
+            f"The parameter {param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )
 
 
 def try_download_ollama_model(model_id: str) -> bool:
@@ -1300,11 +1430,11 @@ def try_download_ollama_model(model_id: str) -> bool:
             for model_obj in ollama.list().models
             if model_obj.model is not None
         ]
-    except ConnectionError:
+    except ConnectionError as e:
         raise InvalidModel(
             "Ollama does not seem to be running, so we cannot evaluate the model "
             f"{model_id!r}. Please make sure that Ollama is running and try again."
-        )
+        ) from e
 
     ollama_model_id = "/".join(model_id.split("/")[1:])
     if ollama_model_id not in downloaded_ollama_models:
@@ -1334,12 +1464,12 @@ def try_download_ollama_model(model_id: str) -> bool:
                 raise InvalidModel(
                     f"Failed to download Ollama model {ollama_model_id}. "
                     f"The error message was: {inner_e}"
-                )
+                ) from inner_e
             else:
                 raise InvalidModel(
                     f"Failed to download Ollama model {ollama_model_id}. "
                     f"The error message was: {e}"
-                )
+                ) from e
 
         # Download the model
         with tqdm(