EuroEval 15.4.2__py3-none-any.whl → 15.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/__init__.py CHANGED
@@ -4,6 +4,7 @@
4
4
  ### Block unwanted terminal output that happens on importing external modules ###
5
5
 
6
6
  import logging
7
+ import os
7
8
  import sys
8
9
  import warnings
9
10
 
@@ -14,7 +15,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
14
15
  logging.getLogger("httpx").setLevel(logging.CRITICAL)
15
16
  logging.getLogger("datasets").setLevel(logging.CRITICAL)
16
17
  logging.getLogger("vllm").setLevel(logging.CRITICAL)
17
- logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
18
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
18
19
 
19
20
  # Set up logging
20
21
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -29,7 +30,6 @@ logging.basicConfig(
29
30
  ### Set the rest up ###
30
31
 
31
32
  import importlib.metadata # noqa: E402
32
- import os # noqa: E402
33
33
 
34
34
  from dotenv import load_dotenv # noqa: E402
35
35
 
euroeval/benchmark_modules/hf.py CHANGED
@@ -44,6 +44,7 @@ from ..constants import (
44
44
  DUMMY_FILL_VALUE,
45
45
  GENERATIVE_PIPELINE_TAGS,
46
46
  LOCAL_MODELS_REQUIRED_FILES,
47
+ MAX_CONTEXT_LENGTH,
47
48
  MERGE_TAGS,
48
49
  )
49
50
  from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
@@ -245,6 +246,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
245
246
  max_length for max_length in all_max_lengths if max_length >= 128
246
247
  ]
247
248
 
249
+ # We remove the upper cap of maximum context length for the model, as it is
250
+ # highly unlikely that this is the model's actual maximum context length - we
251
+ # would rather not report a value than report an incorrect one.
252
+ all_max_lengths = [
253
+ max_length
254
+ for max_length in all_max_lengths
255
+ if max_length != MAX_CONTEXT_LENGTH
256
+ ]
257
+
248
258
  if len(list(all_max_lengths)) > 0:
249
259
  model_max_length = min(list(all_max_lengths))
250
260
  else:
@@ -1140,8 +1150,7 @@ def align_model_and_tokenizer(
1140
1150
  Returns:
1141
1151
  The fixed model and tokenizer.
1142
1152
  """
1143
- # Ensure that the model max length is at most 5,000, to avoid OOM errors
1144
- model_max_length = min(model_max_length, 5_000)
1153
+ model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
1145
1154
 
1146
1155
  if model_max_length > 0:
1147
1156
  tokenizer.model_max_length = model_max_length
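
For context on the two MAX_CONTEXT_LENGTH changes above: candidate lengths equal to the benchmark cap are now discarded before the minimum is reported, while align_model_and_tokenizer still clamps the working length to the cap. A minimal sketch of that selection logic (the sample values are made up):

```python
MAX_CONTEXT_LENGTH = 5_000  # benchmark-wide cap, mirroring euroeval/constants.py

def report_model_max_length(all_max_lengths: list[int]) -> int:
    """Pick the context length to report, mirroring the hunks above."""
    # Keep only plausible values, as in the surrounding hf.py code
    candidates = [length for length in all_max_lengths if length >= 128]
    # Drop values equal to the benchmark cap: they are almost certainly the cap
    # leaking in from a tokenizer setting, not the model's real maximum length
    candidates = [length for length in candidates if length != MAX_CONTEXT_LENGTH]
    return min(candidates) if candidates else -1  # -1 meaning "unknown" (assumption)

# Made-up values: 5_000 is discarded and 8_192 is reported, but evaluation still
# runs with min(8_192, MAX_CONTEXT_LENGTH) == 5_000 tokens.
print(report_model_max_length([5_000, 8_192, 131_072]))  # -> 8192
```
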
euroeval/benchmark_modules/litellm.py CHANGED
@@ -27,20 +27,17 @@ from litellm.exceptions import (
27
27
  BadRequestError,
28
28
  InternalServerError,
29
29
  NotFoundError,
30
+ RateLimitError,
30
31
  ServiceUnavailableError,
31
32
  Timeout,
32
33
  )
34
+ from litellm.llms.vertex_ai.common_utils import VertexAIError
33
35
  from litellm.types.utils import ModelResponse
34
36
  from requests.exceptions import RequestException
35
37
  from tqdm.auto import tqdm
36
38
  from transformers import Trainer
37
39
 
38
- from ..constants import (
39
- MAX_LOGPROBS,
40
- REASONING_MAX_TOKENS,
41
- TASK_GROUPS_USING_LOGPROBS,
42
- TASKS_USING_JSON,
43
- )
40
+ from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
44
41
  from ..data_models import (
45
42
  BenchmarkConfig,
46
43
  DatasetConfig,
@@ -69,7 +66,7 @@ from ..task_utils import (
69
66
  token_classification,
70
67
  )
71
68
  from ..types import ExtractLabelsFunction
72
- from ..utils import create_model_cache_dir, log_once
69
+ from ..utils import create_model_cache_dir, get_first_label_token_mapping, log_once
73
70
  from .base import BenchmarkModule
74
71
  from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
75
72
 
@@ -78,64 +75,80 @@ logger = logging.getLogger("euroeval")
78
75
 
79
76
  VOCAB_SIZE_MAPPING = {
80
77
  # OpenAI models
81
- "(text-)?(ada|babbage|curie|davinci)(-001)?": 50_257,
82
- "(code|text)-davinci-00[2-9]": 50_281,
83
- "gpt-3.5-turbo(-16k)?(-[0-9]{4})?": 100_256,
84
- "gpt-4-(32k)?(-[0-9]{4})?": 100_256,
85
- "gpt-4-[0-9]{4}-preview": 100_256,
86
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
87
- "gpt-4-(vision|turbo)(-preview)?": 100_256,
88
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
89
- "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
90
- "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
78
+ r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
79
+ r"gpt-4-[0-9]{4}-preview": 100_256,
80
+ r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
81
+ r"gpt-4-(vision|turbo)(-preview)?": 100_256,
82
+ r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
83
+ r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
84
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
91
85
  # Anthropic models
92
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
86
+ r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
87
+ # Gemini models
88
+ r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
89
+ # xAI models
90
+ r"(xai/)?grok.*": -1,
93
91
  }
94
92
 
95
93
 
96
94
  MODEL_MAX_LENGTH_MAPPING = {
97
95
  # OpenAI models
98
- "(text-)?(ada|babbage|curie|davinci)(-001)?": 2_050,
99
- "text-davinci-00[2-9]": 4_098,
100
- "code-davinci-00[1-9]": 8_002,
101
- "gpt-3.5-turbo-0613": 4_096,
102
- "gpt-3.5-turbo(-[0-9]{4})?": 16_385,
103
- "gpt-3.5-turbo-16k(-[0-9]{4})?": 16_384,
104
- "gpt-4(-[0-9]{4})?": 8_191,
105
- "gpt-4-32k(-[0-9]{4})?": 32_767,
106
- "gpt-4-[0-9]{4}-preview": 128_000,
107
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
108
- "gpt-4-(vision|turbo)(-preview)?": 128_000,
109
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
110
- "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
111
- "o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
112
- "o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
113
- "o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
96
+ r"gpt-4(-[0-9]{4})?": 8_191,
97
+ r"gpt-4-32k(-[0-9]{4})?": 32_767,
98
+ r"gpt-4-[0-9]{4}-preview": 128_000,
99
+ r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
100
+ r"gpt-4-(vision|turbo)(-preview)?": 128_000,
101
+ r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
102
+ r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
103
+ r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
104
+ r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
105
+ r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
114
106
  # Anthropic models
115
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
107
+ r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
108
+ # Gemini models
109
+ r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
110
+ r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
111
+ r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
112
+ # xAI models
113
+ r"(xai/)?grok.*": 131_072,
116
114
  }
117
115
 
118
116
 
119
117
  NUM_PARAMS_MAPPING = {
120
118
  # OpenAI models
121
- "(text-)?ada(-001)?": 350_000_000,
122
- "(text-)?babbage(-001)?": 3_000_000_000,
123
- "(text-)?curie(-001)?": 13_000_000_000,
124
- "((text|code)-)?davinci(-00[1-9])?": 175_000_000_000,
125
- "gpt-(3.5|4)-turbo-((16|32)k)?(-[0-9]{4})?": -1,
126
- "gpt-4-[0-9]{4}-preview": -1,
127
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
128
- "gpt-4-(vision|turbo)(-preview)?": -1,
129
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": -1,
130
- "gpt-4o(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
131
- "gpt-4o-mini(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
132
- "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
119
+ r"gpt-4.*": -1,
120
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
133
121
  # Anthropic models
134
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
122
+ r"(anthropic/)?claude-*": -1,
123
+ # Gemini models
124
+ r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
125
+ r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
126
+ r"(gemini/)?gemini-2.(0|5).*": -1,
127
+ # xAI models
128
+ r"(xai/)?grok.*": -1,
135
129
  }
136
130
 
137
131
 
138
- REASONING_MODELS = ["o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?"]
132
+ ALLOWED_PARAMS = {
133
+ # OpenAI models
134
+ r"gpt-4.*": [],
135
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
136
+ # Anthropic models
137
+ r"(anthropic/)?claude-3-.*": [],
138
+ r"(anthropic/)?claude-3.5-.*": [],
139
+ r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
140
+ # Gemini models
141
+ r"(gemini/)?gemini-.*": [],
142
+ # xAI models
143
+ r"(xai/)?grok.*": [],
144
+ }
145
+
146
+
147
+ REASONING_MODELS = [
148
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
149
+ r"(gemini/)?gemini.*thinking.*",
150
+ r"(gemini/)?gemini-2.5-pro.*",
151
+ ]
139
152
 
140
153
 
141
154
  class LiteLLMModel(BenchmarkModule):
@@ -167,12 +180,18 @@ class LiteLLMModel(BenchmarkModule):
167
180
  "ollama/"
168
181
  ) or model_config.model_id.startswith("ollama_chat/")
169
182
 
183
+ raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
184
+
170
185
  super().__init__(
171
186
  model_config=model_config,
172
187
  dataset_config=dataset_config,
173
188
  benchmark_config=benchmark_config,
174
189
  )
175
190
 
191
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
192
+ dataset_config=self.dataset_config, tokenizer=None
193
+ )
194
+
176
195
  @property
177
196
  def generative_type(self) -> GenerativeType | None:
178
197
  """Get the generative type of the model.
@@ -180,7 +199,9 @@ class LiteLLMModel(BenchmarkModule):
180
199
  Returns:
181
200
  The generative type of the model, or None if it has not been set yet.
182
201
  """
183
- if re.fullmatch(
202
+ if self.model_config.revision == "thinking":
203
+ return GenerativeType.REASONING
204
+ elif re.fullmatch(
184
205
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
185
206
  ):
186
207
  return GenerativeType.REASONING
@@ -218,7 +239,13 @@ class LiteLLMModel(BenchmarkModule):
218
239
  api_version=self.benchmark_config.api_version,
219
240
  )
220
241
 
221
- if self.dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS:
242
+ # Get the mapping from labels to the first token in the label. We call this each
243
+ # time we generate a new dataset since the dataset config can change
244
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
245
+ dataset_config=self.dataset_config, tokenizer=None
246
+ )
247
+
248
+ if self.buffer["first_label_token_mapping"]:
222
249
  generation_kwargs["logprobs"] = True
223
250
  generation_kwargs["top_logprobs"] = MAX_LOGPROBS
224
251
 
@@ -227,6 +254,27 @@ class LiteLLMModel(BenchmarkModule):
227
254
  "Prompt must contain 'json' for JSON tasks."
228
255
  )
229
256
  generation_kwargs["response_format"] = dict(type="json_object")
257
+ log_once(
258
+ "Enabling JSON response format for model "
259
+ f"{self.model_config.model_id!r}",
260
+ level=logging.DEBUG,
261
+ )
262
+
263
+ if self.model_config.revision == "thinking":
264
+ generation_kwargs["thinking"] = dict(
265
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS
266
+ )
267
+ log_once(
268
+ f"Enabling thinking mode for model {self.model_config.model_id!r}",
269
+ level=logging.DEBUG,
270
+ )
271
+ elif self.model_config.revision in {"low", "high"}:
272
+ generation_kwargs["reasoning_effort"] = self.model_config.revision
273
+ log_once(
274
+ f"Enabling reasoning effort {self.model_config.revision!r} for model "
275
+ f"{self.model_config.model_id!r}",
276
+ level=logging.DEBUG,
277
+ )
230
278
 
231
279
  # This drops generation kwargs that are not supported by the model
232
280
  litellm.drop_params = True
@@ -235,39 +283,60 @@ class LiteLLMModel(BenchmarkModule):
235
283
  # handle using newlines as stop sequences, so we try both.
236
284
  num_attempts = 10
237
285
  for _ in range(num_attempts):
286
+ stop_messages = ["stop_sequences"]
287
+ logprobs_messages = [
288
+ "you are not allowed to request logprobs",
289
+ "you've reached the maximum number of requests with logprobs",
290
+ "logprobs is not supported",
291
+ "logprobs is not enabled",
292
+ ]
293
+ temperature_messages = [
294
+ "'temperature' is not supported with this model.",
295
+ "temperature is not supported with this model",
296
+ ]
238
297
  try:
239
298
  model_response = litellm.completion(
240
299
  messages=messages, max_retries=3, **generation_kwargs
241
300
  )
242
301
  break
243
- except BadRequestError as e:
244
- if "stop_sequences" in str(e).lower():
302
+ except (BadRequestError, RateLimitError) as e:
303
+ if any(msg.lower() in str(e).lower() for msg in stop_messages):
245
304
  generation_kwargs["stop"] = None
246
- elif "you are not allowed to request logprobs" in str(e).lower():
247
- generation_kwargs.pop("logprobs")
248
- generation_kwargs.pop("top_logprobs")
249
305
  elif (
250
- "'temperature' is not supported with this model." in str(e).lower()
306
+ any(msg.lower() in str(e).lower() for msg in logprobs_messages)
307
+ # Special case for Vertex AI models, since they have strict rate
308
+ # limits on using logprobs. They also have a cap of 5 logprobs, but
309
+ # we ignore this since the rate limiting makes it unusable anyway.
310
+ or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
251
311
  ):
312
+ generation_kwargs.pop("logprobs")
313
+ generation_kwargs.pop("top_logprobs")
314
+ elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
252
315
  generation_kwargs.pop("temperature")
316
+ elif isinstance(e, RateLimitError):
317
+ raise InvalidModel(
318
+ "You have encountered your rate limit for model "
319
+ f"{self.model_config.model_id!r}. The error message was: {e}"
320
+ )
253
321
  else:
254
322
  raise InvalidBenchmark(
255
323
  f"Failed to generate text. The error message was: {e}"
256
324
  )
325
+ except APIError as e:
326
+ raise InvalidBenchmark(
327
+ f"Failed to generate text. The error message was: {e}"
328
+ )
257
329
  except (
330
+ APIConnectionError,
258
331
  Timeout,
259
332
  ServiceUnavailableError,
260
- APIConnectionError,
261
333
  InternalServerError,
262
- ):
334
+ ) as e:
263
335
  logger.debug(
264
- "Service temporarily unavailable. Retrying in 5 seconds..."
336
+ f"Service temporarily unavailable. The error message was: {e}. "
337
+ f"Retrying in 5 seconds..."
265
338
  )
266
339
  sleep(5)
267
- except APIError as e:
268
- raise InvalidBenchmark(
269
- f"Failed to generate text. The error message was: {e}"
270
- )
271
340
  except AuthenticationError:
272
341
  raise NeedsAdditionalArgument(
273
342
  cli_argument="--api-key",
@@ -280,6 +349,15 @@ class LiteLLMModel(BenchmarkModule):
280
349
  )
281
350
 
282
351
  assert isinstance(model_response, ModelResponse)
352
+ if not model_response.choices:
353
+ # This happens for reasoning models, when they don't finish thinking and run
354
+ # out of tokens. Happens quite rarely, but we need to handle it.
355
+ logger.warning(
356
+ f"The model {self.model_config.model_id!r} did not end up generating "
357
+ "any text. This is likely because the model ran out of tokens while "
358
+ "reasoning. Returning an empty string."
359
+ )
360
+ return GenerativeModelOutput(sequences=[""])
283
361
  model_response_choices = model_response.choices[0]
284
362
  assert isinstance(model_response_choices, litellm.Choices)
285
363
  generation_output = model_response_choices.message["content"] or ""
@@ -314,7 +392,7 @@ class LiteLLMModel(BenchmarkModule):
314
392
  # If it is an Ollama model then we can get the number of parameters from the
315
393
  # Ollama Python SDK
316
394
  if self.is_ollama:
317
- ollama_model_id = self.model_config.model_id.split("/")[-1]
395
+ ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
318
396
  model_info = ollama.show(ollama_model_id).modelinfo
319
397
  if model_info is not None:
320
398
  num_params = model_info.get("general.parameter_count")
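
The Ollama model-ID handling above (repeated further down in this file) now strips only the ollama/ or ollama_chat/ prefix, so namespaced model IDs keep their namespace. A tiny illustration with hypothetical IDs:

```python
def ollama_model_id(model_id: str) -> str:
    # Keep everything after the first "/", as in the updated lines above
    return "/".join(model_id.split("/")[1:])

# The old split("/")[-1] would return "llama3.1:8b" in both cases; the new
# version preserves namespaced Ollama IDs (examples are hypothetical).
print(ollama_model_id("ollama_chat/llama3.1:8b"))         # -> llama3.1:8b
print(ollama_model_id("ollama_chat/myuser/llama3.1:8b"))  # -> myuser/llama3.1:8b
```
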
@@ -334,7 +412,7 @@ class LiteLLMModel(BenchmarkModule):
334
412
  num_labels=self.dataset_config.num_labels,
335
413
  id2label=self.dataset_config.id2label,
336
414
  label2id=self.dataset_config.label2id,
337
- revision=self.model_config.revision,
415
+ revision="main",
338
416
  model_cache_dir=self.model_config.model_cache_dir,
339
417
  api_key=self.benchmark_config.api_key,
340
418
  trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -345,7 +423,7 @@ class LiteLLMModel(BenchmarkModule):
345
423
  try:
346
424
  repo_info = hf_api.model_info(
347
425
  repo_id=model_id,
348
- revision=self.model_config.revision,
426
+ revision="main",
349
427
  token=os.getenv("HUGGINGFACE_API_KEY")
350
428
  or self.benchmark_config.api_key
351
429
  or True,
@@ -398,7 +476,7 @@ class LiteLLMModel(BenchmarkModule):
398
476
  num_labels=self.dataset_config.num_labels,
399
477
  id2label=self.dataset_config.id2label,
400
478
  label2id=self.dataset_config.label2id,
401
- revision=self.model_config.revision,
479
+ revision="main",
402
480
  model_cache_dir=self.model_config.model_cache_dir,
403
481
  api_key=self.benchmark_config.api_key,
404
482
  trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -442,7 +520,7 @@ class LiteLLMModel(BenchmarkModule):
442
520
  # If it is an Ollama model then we can get the maximum length from the Ollama
443
521
  # Python SDK
444
522
  if self.is_ollama:
445
- ollama_model_id = self.model_config.model_id.split("/")[-1]
523
+ ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
446
524
  model_info = ollama.show(ollama_model_id).modelinfo
447
525
  if model_info is not None:
448
526
  context_length_keys = [
@@ -478,7 +556,7 @@ class LiteLLMModel(BenchmarkModule):
478
556
  num_labels=self.dataset_config.num_labels,
479
557
  id2label=self.dataset_config.id2label,
480
558
  label2id=self.dataset_config.label2id,
481
- revision=self.model_config.revision,
559
+ revision="main",
482
560
  model_cache_dir=self.model_config.model_cache_dir,
483
561
  api_key=self.benchmark_config.api_key,
484
562
  trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -563,6 +641,7 @@ class LiteLLMModel(BenchmarkModule):
563
641
  return partial(
564
642
  sequence_classification.extract_labels_from_generation,
565
643
  dataset_config=self.dataset_config,
644
+ first_label_token_mapping=self.buffer["first_label_token_mapping"],
566
645
  )
567
646
  case TaskGroup.TEXT_TO_TEXT:
568
647
  return text_to_text.extract_labels_from_generation
@@ -605,12 +684,13 @@ class LiteLLMModel(BenchmarkModule):
605
684
  Whether the model exists, or an error describing why we cannot check
606
685
  whether the model exists.
607
686
  """
687
+ model_id, _ = model_id.split("@") if "@" in model_id else (model_id, "main")
608
688
  if model_id in litellm.model_list:
609
689
  return True
610
690
 
611
691
  # If it is an Ollama model then try to download it
612
692
  if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
613
- ollama_model_id = model_id.split("/")[-1]
693
+ ollama_model_id = "/".join(model_id.split("/")[1:])
614
694
  downloaded_ollama_models: list[str] = [
615
695
  model_obj.model
616
696
  for model_obj in ollama.list().models
@@ -657,12 +737,29 @@ class LiteLLMModel(BenchmarkModule):
657
737
  api_version=benchmark_config.api_version,
658
738
  )
659
739
  return True
740
+ except (
741
+ APIConnectionError,
742
+ Timeout,
743
+ ServiceUnavailableError,
744
+ InternalServerError,
745
+ ) as e:
746
+ logger.debug(
747
+ f"Service temporarily unavailable. The error message was: {e}. "
748
+ "Retrying in 10 seconds..."
749
+ )
750
+ sleep(5)
751
+ except RateLimitError:
752
+ logger.warning(
753
+ f"Rate limit exceeded for model {model_id!r}. Retrying in 10 "
754
+ "seconds..."
755
+ )
756
+ sleep(10)
660
757
  except APIError as e:
661
758
  if "'503 Service Unavailable" not in str(e):
662
759
  raise e
663
760
  logger.warning(
664
- f"Failed to check if model {model_id!r} exists. Retrying in "
665
- f"{num_attempts} seconds..."
761
+ f"Failed to check if model {model_id!r} exists. Retrying in 10 "
762
+ "seconds..."
666
763
  )
667
764
  sleep(10)
668
765
  except (BadRequestError, NotFoundError):
@@ -708,9 +805,10 @@ class LiteLLMModel(BenchmarkModule):
708
805
  Returns:
709
806
  The model configuration.
710
807
  """
808
+ model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
711
809
  return ModelConfig(
712
810
  model_id=model_id,
713
- revision="main",
811
+ revision=revision,
714
812
  task="text-generation",
715
813
  languages=list(),
716
814
  merge=False,
@@ -1025,3 +1123,35 @@ class LiteLLMModel(BenchmarkModule):
1025
1123
 
1026
1124
  examples["messages"] = messages_list
1027
1125
  return examples
1126
+
1127
+
1128
+ def raise_if_wrong_params(
1129
+ model_config: ModelConfig, allowed_params: dict[str, list[str]]
1130
+ ) -> None:
1131
+ """Raise an error if the model configuration has invalid parameters.
1132
+
1133
+ Args:
1134
+ model_config:
1135
+ The model configuration.
1136
+ allowed_params:
1137
+ The allowed parameters for the model.
1138
+
1139
+ Raises:
1140
+ InvalidModel:
1141
+ If the model configuration has invalid parameters.
1142
+ """
1143
+ param = model_config.revision
1144
+ if param == "":
1145
+ return
1146
+ for model_regex, allowed_params_list in allowed_params.items():
1147
+ if re.fullmatch(pattern=model_regex, string=model_config.model_id):
1148
+ if param not in allowed_params_list:
1149
+ msg = (
1150
+ f"Invalid parameter {param!r} for model {model_config.model_id!r}."
1151
+ )
1152
+ if allowed_params_list:
1153
+ msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
1154
+ else:
1155
+ msg += " No parameters are allowed."
1156
+ raise InvalidModel(msg)
1157
+ return
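
Taken together with the get_model_config hunk above, raise_if_wrong_params lets a benchmark parameter be appended to an API model ID with "@" (stored in ModelConfig.revision) and validated against ALLOWED_PARAMS. A rough usage sketch with simplified stand-ins for ModelConfig and InvalidModel (the real classes live in euroeval.data_models and euroeval.exceptions):

```python
import re
from dataclasses import dataclass

@dataclass
class ModelConfig:  # simplified stand-in for euroeval.data_models.ModelConfig
    model_id: str
    revision: str

class InvalidModel(Exception):  # stand-in for euroeval.exceptions.InvalidModel
    pass

ALLOWED_PARAMS = {  # trimmed copy of the mapping added in this release
    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
    r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
}

def get_model_config(model_id: str) -> ModelConfig:
    # "model@param" -> ("model", "param"); no "@" -> empty revision
    model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
    return ModelConfig(model_id=model_id, revision=revision)

def raise_if_wrong_params(model_config: ModelConfig, allowed: dict[str, list[str]]) -> None:
    param = model_config.revision
    if param == "":
        return
    for model_regex, allowed_params_list in allowed.items():
        if re.fullmatch(model_regex, model_config.model_id) and param not in allowed_params_list:
            raise InvalidModel(f"Invalid parameter {param!r} for {model_config.model_id!r}")

raise_if_wrong_params(get_model_config("o1-mini@high"), ALLOWED_PARAMS)      # OK
raise_if_wrong_params(get_model_config("o1-mini@thinking"), ALLOWED_PARAMS)  # raises InvalidModel
```
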
euroeval/benchmark_modules/vllm.py CHANGED
@@ -25,10 +25,10 @@ from urllib3.exceptions import RequestError
25
25
 
26
26
  from ..constants import (
27
27
  GENERATIVE_PIPELINE_TAGS,
28
+ MAX_CONTEXT_LENGTH,
28
29
  MAX_LOGPROBS,
29
30
  MERGE_TAGS,
30
31
  REASONING_MAX_TOKENS,
31
- TASK_GROUPS_USING_LOGPROBS,
32
32
  TASKS_USING_JSON,
33
33
  VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
34
34
  )
@@ -66,6 +66,7 @@ from ..utils import (
66
66
  get_bos_token,
67
67
  get_end_of_chat_token_ids,
68
68
  get_eos_token,
69
+ get_first_label_token_mapping,
69
70
  get_min_cuda_compute_capability,
70
71
  log_once,
71
72
  should_prompts_be_stripped,
@@ -122,11 +123,8 @@ class VLLMModel(HuggingFaceEncoderModel):
122
123
  ):
123
124
  raise NeedsExtraInstalled(extra="generative")
124
125
 
125
- output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
126
126
  model, tokenizer = load_model_and_tokenizer(
127
- model_config=model_config,
128
- benchmark_config=benchmark_config,
129
- output_scores=output_scores,
127
+ model_config=model_config, benchmark_config=benchmark_config
130
128
  )
131
129
  self._model: LLM = model
132
130
  self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -142,8 +140,12 @@ class VLLMModel(HuggingFaceEncoderModel):
142
140
  benchmark_config=benchmark_config,
143
141
  )
144
142
 
145
- self.buffer["output_scores"] = output_scores
146
- self.buffer["instruction_model"] = self._tokenizer.chat_template is not None
143
+ self.buffer |= dict(
144
+ instruction_model=self._tokenizer.chat_template is not None,
145
+ first_label_token_mapping=get_first_label_token_mapping(
146
+ dataset_config=self.dataset_config, tokenizer=self._tokenizer
147
+ ),
148
+ )
147
149
  if self.model_config.adapter_base_model_id is not None:
148
150
  adapter_path = snapshot_download(
149
151
  repo_id=self.model_config.model_id,
@@ -185,6 +187,7 @@ class VLLMModel(HuggingFaceEncoderModel):
185
187
  return partial(
186
188
  sequence_classification.extract_labels_from_generation,
187
189
  dataset_config=self.dataset_config,
190
+ first_label_token_mapping=self.buffer["first_label_token_mapping"],
188
191
  )
189
192
  case TaskGroup.TEXT_TO_TEXT:
190
193
  return text_to_text.extract_labels_from_generation
@@ -338,6 +341,12 @@ class VLLMModel(HuggingFaceEncoderModel):
338
341
  else:
339
342
  logits_processor = None
340
343
 
344
+ # Get the mapping from labels to the first token in the label. We call this each
345
+ # time we generate a new dataset since the dataset config can change
346
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
347
+ dataset_config=self.dataset_config, tokenizer=self._tokenizer
348
+ )
349
+
341
350
  # Define the parameters used for vLLM generation
342
351
  max_tokens: int = (
343
352
  REASONING_MAX_TOKENS
@@ -346,7 +355,7 @@ class VLLMModel(HuggingFaceEncoderModel):
346
355
  )
347
356
  sampling_params = SamplingParams(
348
357
  max_tokens=max_tokens,
349
- logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
358
+ logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
350
359
  temperature=0.0,
351
360
  stop=[stop_token for stop_token in stop_tokens if stop_token],
352
361
  logits_processors=[logits_processor] if logits_processor else None,
@@ -416,7 +425,7 @@ class VLLMModel(HuggingFaceEncoderModel):
416
425
  completions = [completion.strip() for completion in completions]
417
426
 
418
427
  # Add logprobs scores to the output
419
- if self.buffer["output_scores"]:
428
+ if self.buffer["first_label_token_mapping"]:
420
429
  scores: list[list[list[tuple[str, float]]]] = [
421
430
  [
422
431
  [
@@ -846,7 +855,7 @@ class VLLMModel(HuggingFaceEncoderModel):
846
855
 
847
856
 
848
857
  def load_model_and_tokenizer(
849
- model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
858
+ model_config: ModelConfig, benchmark_config: BenchmarkConfig
850
859
  ) -> "tuple[LLM, PreTrainedTokenizer]":
851
860
  """Load the model and tokenizer.
852
861
 
@@ -855,11 +864,9 @@ def load_model_and_tokenizer(
855
864
  The model configuration.
856
865
  benchmark_config:
857
866
  The benchmark configuration.
858
- output_scores:
859
- Whether to output scores.
860
867
 
861
868
  Returns:
862
- The loaded model and tokenizer.
869
+ A pair (model, tokenizer), with the loaded model and tokenizer
863
870
  """
864
871
  # Prefer base model ID if the model is an adapter - the adapter will be added on
865
872
  # during inference in this case
@@ -893,7 +900,27 @@ def load_model_and_tokenizer(
893
900
  if quantization == "awq" and importlib.util.find_spec("awq") is None:
894
901
  raise NeedsExtraInstalled(extra="quantization")
895
902
 
903
+ # Start with dtype being the "auto" vLLM dtype
896
904
  dtype: str | torch.dtype = "auto"
905
+
906
+ # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
907
+ if hf_model_config.torch_dtype == torch.float32:
908
+ if torch.cuda.is_bf16_supported():
909
+ logger.info(
910
+ "You are loading a model with dtype FP32, which we will convert to "
911
+ "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
912
+ "GPU."
913
+ )
914
+ dtype = torch.bfloat16
915
+ else:
916
+ logger.info(
917
+ "You are loading a model with dtype FP32, which we will convert to "
918
+ "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
919
+ "your GPU."
920
+ )
921
+ dtype = torch.float16
922
+
923
+ # If the model is a quantized model, we need to set the dtype to float16
897
924
  if quantization is not None and hf_model_config.torch_dtype != torch.float16:
898
925
  logger.info(
899
926
  "You are loading a quantized model with dtype "
@@ -902,6 +929,7 @@ def load_model_and_tokenizer(
902
929
  )
903
930
  dtype = torch.float16
904
931
 
932
+ # If the model is a bf16 model, we need to check the CUDA compute capability
905
933
  if hf_model_config.torch_dtype == torch.bfloat16:
906
934
  min_cuda_compute_capability = get_min_cuda_compute_capability()
907
935
  required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
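
The dtype handling above prefers BF16 over FP16 when an FP32 checkpoint has to be down-cast for vLLM, and still forces FP16 for quantized models. Condensed into a small helper for illustration only (the real code also logs its choice and, for BF16 models, checks the CUDA compute capability):

```python
import torch

def pick_vllm_dtype(config_dtype: torch.dtype, quantized: bool, bf16_supported: bool):
    """Roughly mirror the dtype choice made in the hunks above."""
    dtype: str | torch.dtype = "auto"
    if config_dtype == torch.float32:
        # vLLM does not run FP32 checkpoints, so down-cast: BF16 if the GPU
        # supports it, otherwise FP16
        dtype = torch.bfloat16 if bf16_supported else torch.float16
    if quantized and config_dtype != torch.float16:
        # Quantized models are always loaded in FP16
        dtype = torch.float16
    return dtype

print(pick_vllm_dtype(torch.float32, quantized=False, bf16_supported=True))  # torch.bfloat16
print(pick_vllm_dtype(torch.bfloat16, quantized=True, bf16_supported=True))  # torch.float16
```
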
@@ -940,7 +968,17 @@ def load_model_and_tokenizer(
940
968
  if len(true_max_model_len_candidates) > 0:
941
969
  true_max_model_len = min(true_max_model_len_candidates)
942
970
  else:
943
- true_max_model_len = 5_000
971
+ true_max_model_len = MAX_CONTEXT_LENGTH
972
+
973
+ tokenizer = load_tokenizer(
974
+ model_id=model_config.model_id,
975
+ revision=model_config.revision,
976
+ adapter_base_model_id=model_config.adapter_base_model_id,
977
+ trust_remote_code=benchmark_config.trust_remote_code,
978
+ model_max_length=true_max_model_len,
979
+ model_cache_dir=model_config.model_cache_dir,
980
+ token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
981
+ )
944
982
 
945
983
  clear_vllm()
946
984
 
@@ -951,7 +989,7 @@ def load_model_and_tokenizer(
951
989
  model=model_id,
952
990
  tokenizer=model_id,
953
991
  gpu_memory_utilization=0.95,
954
- max_model_len=min(true_max_model_len, 5_000),
992
+ max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
955
993
  download_dir=download_dir,
956
994
  trust_remote_code=benchmark_config.trust_remote_code,
957
995
  revision=revision,
@@ -962,7 +1000,6 @@ def load_model_and_tokenizer(
962
1000
  quantization=quantization,
963
1001
  dtype=dtype,
964
1002
  enforce_eager=True,
965
- max_logprobs=MAX_LOGPROBS if output_scores else None,
966
1003
  # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
967
1004
  # so we disable it for now
968
1005
  enable_prefix_caching=False,
@@ -988,16 +1025,6 @@ def load_model_and_tokenizer(
988
1025
  model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
989
1026
  model.config = hf_model_config
990
1027
 
991
- tokenizer = load_tokenizer(
992
- model_id=model_config.model_id,
993
- revision=model_config.revision,
994
- adapter_base_model_id=model_config.adapter_base_model_id,
995
- trust_remote_code=benchmark_config.trust_remote_code,
996
- model_max_length=true_max_model_len,
997
- model_cache_dir=model_config.model_cache_dir,
998
- token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
999
- )
1000
-
1001
1028
  return model, tokenizer
1002
1029
 
1003
1030
 
@@ -1157,15 +1184,13 @@ def get_end_of_reasoning_token_id(
1157
1184
 
1158
1185
  # Generate a completion and remove the BOS token from it, to not confuse it with the
1159
1186
  # potential reasoning token
1160
- completion = (
1161
- model.generate(
1162
- prompts=[prompt],
1163
- sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
1164
- use_tqdm=False,
1165
- )[0]
1166
- .outputs[0]
1167
- .text
1187
+ model_output = model.generate(
1188
+ prompts=[prompt],
1189
+ sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
1190
+ use_tqdm=False,
1168
1191
  )
1192
+ completion = model_output[0].outputs[0].text
1193
+
1169
1194
  if tokenizer.bos_token is not None:
1170
1195
  if isinstance(tokenizer.bos_token, str):
1171
1196
  prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py CHANGED
@@ -366,14 +366,18 @@ class Benchmarker:
366
366
  dataset_names=benchmark_config.datasets
367
367
  )
368
368
 
369
+ total_benchmarks = len(model_ids) * len(dataset_configs)
370
+ num_finished_benchmarks = 0
371
+
369
372
  current_benchmark_results: list[BenchmarkResult] = list()
370
- for m_id in model_ids:
373
+ for model_id in model_ids:
371
374
  try:
372
375
  model_config = get_model_config(
373
- model_id=m_id, benchmark_config=benchmark_config
376
+ model_id=model_id, benchmark_config=benchmark_config
374
377
  )
375
378
  except InvalidModel as e:
376
379
  logger.info(e.message)
380
+ num_finished_benchmarks += len(dataset_configs)
377
381
  continue
378
382
 
379
383
  loaded_model: BenchmarkModule | None = None
@@ -381,16 +385,18 @@ class Benchmarker:
381
385
  # Skip if we have already benchmarked this model on this dataset and
382
386
  # we are not forcing the benchmark
383
387
  if not benchmark_config.force and model_has_been_benchmarked(
384
- model_id=m_id,
388
+ model_id=model_id,
385
389
  dataset=dataset_config.name,
386
390
  few_shot=benchmark_config.few_shot,
387
391
  validation_split=not benchmark_config.evaluate_test_split,
388
392
  benchmark_results=self.benchmark_results,
389
393
  ):
390
394
  logger.debug(
391
- f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
392
- " as it has already been benchmarked."
395
+ f"Skipping benchmarking {model_id} on "
396
+ f"{dataset_config.pretty_name}, as it "
397
+ "has already been benchmarked."
393
398
  )
399
+ num_finished_benchmarks += 1
394
400
  continue
395
401
 
396
402
  # We do not re-initialise generative models as their architecture is not
@@ -413,6 +419,15 @@ class Benchmarker:
413
419
  if benchmark_config.raise_errors:
414
420
  raise e
415
421
  logger.info(e.message)
422
+
423
+ # Add the remaining number of benchmarks for the model to
424
+ # our benchmark counter, since we're skipping the
425
+ # rest of them
426
+ num_finished_benchmarks += (
427
+ len(dataset_configs)
428
+ - dataset_configs.index(dataset_config)
429
+ - 1
430
+ )
416
431
  break
417
432
  else:
418
433
  loaded_model.dataset_config = dataset_config
@@ -435,16 +450,24 @@ class Benchmarker:
435
450
  if benchmark_config.raise_errors:
436
451
  raise benchmark_output_or_err
437
452
  logger.info(
438
- f"{m_id} could not be benchmarked on "
453
+ f"{model_id} could not be benchmarked on "
439
454
  f"{dataset_config.pretty_name}. Skipping. The error message "
440
455
  f"raised was {benchmark_output_or_err.message!r}."
441
456
  )
457
+ num_finished_benchmarks += 1
442
458
  continue
443
459
 
444
460
  elif isinstance(benchmark_output_or_err, InvalidModel):
445
461
  if benchmark_config.raise_errors:
446
462
  raise benchmark_output_or_err
447
463
  logger.info(benchmark_output_or_err.message)
464
+
465
+ # Add the remaining number of benchmarks for the model to
466
+ # our benchmark counter, since we're skipping the
467
+ # rest of them
468
+ num_finished_benchmarks += (
469
+ len(dataset_configs) - dataset_configs.index(dataset_config) - 1
470
+ )
448
471
  break
449
472
 
450
473
  else:
@@ -453,6 +476,12 @@ class Benchmarker:
453
476
  if benchmark_config.save_results:
454
477
  record.append_to_results(results_path=self.results_path)
455
478
 
479
+ num_finished_benchmarks += 1
480
+ logger.info(
481
+ f"Finished {num_finished_benchmarks} out of "
482
+ f"{total_benchmarks} benchmarks."
483
+ )
484
+
456
485
  if benchmark_config.clear_model_cache:
457
486
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
458
487
 
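
The new progress reporting treats the run as a flat model × dataset grid and counts skipped or failed combinations as finished, so the "Finished X out of Y benchmarks" log converges on the total. A toy illustration with made-up model names and illustrative dataset names:

```python
model_ids = ["model-a", "model-b", "model-c"]  # hypothetical
dataset_configs = ["angry-tweets", "scala-da", "scandiqa-da", "nordjylland-news"]

total_benchmarks = len(model_ids) * len(dataset_configs)  # 3 * 4 = 12
num_finished_benchmarks = 0

# model-a cannot even be configured (InvalidModel): all of its dataset slots
# are counted as finished in one go, as in the diff above
num_finished_benchmarks += len(dataset_configs)  # -> 4

# model-b and model-c run (or are skipped) dataset by dataset
for _model in ("model-b", "model-c"):
    for _dataset in dataset_configs:
        num_finished_benchmarks += 1
        print(f"Finished {num_finished_benchmarks} out of {total_benchmarks} benchmarks.")
```
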
euroeval/constants.py CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
7
7
  DUMMY_FILL_VALUE = 100
8
8
 
9
9
 
10
+ # This is the maximum allowed context length for models for the purpose of this
11
+ # benchmark. We will still report the models' true maximum context length in the
12
+ # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
13
+ # all tokens in the context.
14
+ MAX_CONTEXT_LENGTH = 5_000
15
+
16
+
10
17
  # We need to raise the amount of tokens generated for reasoning models, to give them
11
18
  # time to think
12
19
  REASONING_MAX_TOKENS = 8_192
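
MAX_CONTEXT_LENGTH centralises the hard-coded 5,000-token cap that was previously inlined in hf.py and vllm.py: evaluation always clamps to it because vLLM allocates memory for every token in the context, while the reported metadata now avoids mistaking the cap for a model's true limit. A small sketch of the clamping (values are illustrative):

```python
MAX_CONTEXT_LENGTH = 5_000  # as added above

def evaluation_context_length(model_max_length: int) -> int:
    # vLLM reserves KV-cache memory for every token in the context window, so
    # the benchmark never evaluates with more than MAX_CONTEXT_LENGTH tokens
    return min(model_max_length, MAX_CONTEXT_LENGTH)

print(evaluation_context_length(131_072))  # -> 5000: long-context models are capped
print(evaluation_context_length(2_048))    # -> 2048: short contexts are unaffected
```
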
@@ -47,7 +54,7 @@ TASK_GROUPS_USING_LOGPROBS = [
47
54
  MAX_LOGPROBS = 10
48
55
 
49
56
 
50
- # We make sure to remove these metric attributed after each iteration, to avoid memory
57
+ # We make sure to remove these metric attributes after each iteration, to avoid memory
51
58
  # leaks
52
59
  METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
53
60
 
euroeval/data_models.py CHANGED
@@ -10,10 +10,9 @@ from dataclasses import dataclass, field
10
10
  import pydantic
11
11
  import torch
12
12
 
13
- from euroeval.utils import get_package_version
14
-
15
13
  from .enums import Device, InferenceBackend, ModelType, TaskGroup
16
14
  from .types import ScoreDict
15
+ from .utils import get_package_version
17
16
 
18
17
 
19
18
  @dataclass
@@ -1643,7 +1643,7 @@ ORANGE_SUM_CONFIG = DatasetConfig(
1643
1643
 
1644
1644
  ILPOST_SUM_CONFIG = DatasetConfig(
1645
1645
  name="ilpost-sum",
1646
- pretty_name="the truncated version of the Italian summarisation dataset IlPost",
1646
+ pretty_name="the truncated version of the Italian summarisation dataset IlPost-Sum",
1647
1647
  huggingface_id="EuroEval/ilpost-sum",
1648
1648
  task=SUMM,
1649
1649
  languages=[IT],
euroeval/task_utils/sequence_classification.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
10
10
  from evaluate import EvaluationModule
11
11
 
12
12
  from ..data_models import BenchmarkConfig, GenerativeModelOutput
13
+ from ..exceptions import InvalidBenchmark
13
14
  from ..utils import log_once, raise_if_model_output_contains_nan_values
14
15
 
15
16
  if t.TYPE_CHECKING:
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
110
111
  input_batch: dict[str, list],
111
112
  model_output: GenerativeModelOutput,
112
113
  dataset_config: "DatasetConfig",
114
+ first_label_token_mapping: dict[str, str] | bool,
113
115
  ) -> list[str]:
114
116
  """Extract the predicted labels from the generated output.
115
117
 
@@ -121,13 +123,19 @@ def extract_labels_from_generation(
121
123
  The raw generated output of the model.
122
124
  dataset_config:
123
125
  The configuration of the dataset.
126
+ first_label_token_mapping:
127
+ A mapping from labels to the first token in each label, or alternatively a
128
+ Boolean value indicating whether the model should output scores (if the
129
+ mapping is outputted then the model will always output scores).
124
130
 
125
131
  Returns:
126
132
  The predicted labels.
127
133
  """
128
134
  if model_output.scores is not None:
129
135
  return get_closest_logprobs_labels(
130
- generation_logprobs=model_output.scores, dataset_config=dataset_config
136
+ generation_logprobs=model_output.scores,
137
+ dataset_config=dataset_config,
138
+ first_label_token_mapping=first_label_token_mapping,
131
139
  )
132
140
  else:
133
141
  return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
138
146
  def get_closest_logprobs_labels(
139
147
  generation_logprobs: list[list[list[tuple[str, float]]]],
140
148
  dataset_config: "DatasetConfig",
149
+ first_label_token_mapping: dict[str, str] | bool,
141
150
  ) -> list[str]:
142
151
  """Get the labels with the highest predicted logprob value.
143
152
 
@@ -152,6 +161,10 @@ def get_closest_logprobs_labels(
152
161
  (batch_size, num_tokens, num_logprobs).
153
162
  dataset_config:
154
163
  The configuration of the dataset.
164
+ first_label_token_mapping:
165
+ A mapping from labels to the first token in each label, or alternatively a
166
+ Boolean value indicating whether the model should output scores (if the
167
+ mapping is outputted then the model will always output scores).
155
168
 
156
169
  Returns:
157
170
  The predicted labels.
@@ -185,11 +198,29 @@ def get_closest_logprobs_labels(
185
198
  generated_label = "".join(previously_generated_labels) + generated_label
186
199
 
187
200
  # Get the candidate labels that starts with the generated label
188
- candidate_output_labels = {
189
- candidate_label
190
- for candidate_label in candidate_labels
191
- if candidate_label.startswith(generated_label)
192
- }
201
+ if isinstance(first_label_token_mapping, dict):
202
+ if any(
203
+ candidate_label not in first_label_token_mapping
204
+ for candidate_label in candidate_labels
205
+ ):
206
+ raise InvalidBenchmark(
207
+ "There is a label not present in the first label token "
208
+ "mapping - this should never happen! Please report this "
209
+ "issue to the EuroEval team at "
210
+ "github.com/EuroEval/EuroEval/issues."
211
+ )
212
+
213
+ candidate_output_labels = {
214
+ candidate_label
215
+ for candidate_label in candidate_labels
216
+ if generated_label == first_label_token_mapping[candidate_label]
217
+ }
218
+ else:
219
+ candidate_output_labels = {
220
+ candidate_label
221
+ for candidate_label in candidate_labels
222
+ if candidate_label.startswith(generated_label)
223
+ }
193
224
 
194
225
  # If we can uniquely determine the output label, we break the loop. If
195
226
  # there are multiple possible labels then we store the current one, and
@@ -206,7 +237,7 @@ def get_closest_logprobs_labels(
206
237
  else:
207
238
  output_label = candidate_output_labels.pop()
208
239
  candidate_output_labels.add(output_label)
209
- log_once(
240
+ raise InvalidBenchmark(
210
241
  "Multiple candidate labels found for the generated label "
211
242
  f"{generated_label!r}: {candidate_output_labels}. Since "
212
243
  "this is not the first generated label, we cannot "
@@ -214,9 +245,13 @@ def get_closest_logprobs_labels(
214
245
  f"forced to use the arbitrary {output_label!r} as the "
215
246
  "output label, potentially resulting in worse performance. "
216
247
  "Please report this issue to the EuroEval team at "
217
- "github.com/EuroEval/EuroEval/issues.",
218
- level=logging.WARNING,
248
+ "github.com/EuroEval/EuroEval/issues."
219
249
  )
250
+ elif len(candidate_output_labels) == 0:
251
+ logger.debug(
252
+ f"No candidate label found for the generated label "
253
+ f"{generated_label!r}. The generated label is thus ignored."
254
+ )
220
255
 
221
256
  if output_label is not None:
222
257
  output_labels.append(output_label)
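
With first_label_token_mapping available, logprob-based label extraction compares the generated first token against a precomputed token-per-label table instead of a prefix match, and ambiguous matches now raise instead of being silently guessed. A stripped-down sketch of the matching step (the task and mapping are hypothetical; the real code also accumulates multi-token labels):

```python
# Hypothetical mapping for a sentiment task: label -> its (cleaned) first token
first_label_token_mapping = {"positiv": "positiv", "negativ": "negativ", "neutral": "neutral"}

def candidates_for(generated_token: str, mapping: dict[str, str]) -> set[str]:
    """Labels whose first token exactly matches the generated token."""
    generated = generated_token.strip().lower()
    return {label for label, first_token in mapping.items() if generated == first_token}

print(candidates_for(" positiv", first_label_token_mapping))  # {'positiv'}
print(candidates_for("pos", first_label_token_mapping))       # set() -> token is ignored
```
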
euroeval/utils.py CHANGED
@@ -7,12 +7,12 @@ import importlib.util
7
7
  import logging
8
8
  import os
9
9
  import random
10
+ import re
10
11
  import sys
11
12
  import typing as t
12
13
  import warnings
13
14
  from functools import cache
14
15
  from pathlib import Path
15
- from types import TracebackType
16
16
 
17
17
  import litellm
18
18
  import numpy as np
@@ -20,7 +20,6 @@ import requests
20
20
  import torch
21
21
  from datasets.utils import disable_progress_bar
22
22
  from requests.exceptions import RequestException
23
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
24
23
  from transformers import logging as tf_logging
25
24
 
26
25
  from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -29,6 +28,11 @@ if importlib.util.find_spec("ray") is not None:
29
28
  import ray
30
29
 
31
30
  if t.TYPE_CHECKING:
31
+ from types import TracebackType
32
+
33
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
34
+
35
+ from .data_models import DatasetConfig
32
36
  from .types import Predictions
33
37
 
34
38
 
@@ -285,7 +289,7 @@ class HiddenPrints:
285
289
  self,
286
290
  exc_type: t.Type[BaseException],
287
291
  exc_val: BaseException,
288
- exc_tb: TracebackType,
292
+ exc_tb: "TracebackType",
289
293
  ) -> None:
290
294
  """Exit the context manager."""
291
295
  sys.stdout.close()
@@ -355,7 +359,6 @@ def should_prompts_be_stripped(
355
359
  return strip_prompts
356
360
 
357
361
 
358
- # TODO: This is currently not used - maybe remove.
359
362
  def should_prefix_space_be_added_to_labels(
360
363
  labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
361
364
  ) -> bool:
@@ -576,3 +579,96 @@ def get_package_version(package_name: str) -> str | None:
576
579
  return importlib.metadata.version(package_name)
577
580
  except importlib.metadata.PackageNotFoundError:
578
581
  return None
582
+
583
+
584
+ def get_first_label_token_mapping(
585
+ dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
586
+ ) -> dict[str, str] | bool:
587
+ """Check if the model should output scores.
588
+
589
+ Args:
590
+ dataset_config:
591
+ The dataset configuration.
592
+ tokenizer:
593
+ The tokenizer, or None if not available.
594
+
595
+ Returns:
596
+ A mapping from labels to the first token in each label, or alternatively a
597
+ Boolean value indicating whether the model should output scores (if the mapping
598
+ is outputted then the model will always output scores).
599
+ """
600
+ # Importing here to avoid circular imports
601
+ from .constants import TASK_GROUPS_USING_LOGPROBS
602
+
603
+ # If we do not have any tokenizer, then we cannot check if the model should output
604
+ # scores and we just assume it should if the dataset supports it
605
+ output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
606
+ if tokenizer is None:
607
+ if output_scores:
608
+ log_once(
609
+ "The model will output scores, since the dataset supports it and no "
610
+ "tokenizer is available.",
611
+ level=logging.DEBUG,
612
+ )
613
+ else:
614
+ log_once(
615
+ "The model will not output scores, since the dataset does not support "
616
+ "it and no tokenizer is available.",
617
+ level=logging.DEBUG,
618
+ )
619
+ return output_scores
620
+
621
+ # If there are labels associated with the dataset, and that the first token of each
622
+ # label is distinct, then we can safely use the logprobs
623
+ if output_scores and dataset_config.labels:
624
+ local_labels = [
625
+ dataset_config.prompt_label_mapping[label].strip()
626
+ for label in dataset_config.labels
627
+ ]
628
+
629
+ # Get the first token of each label, where we add a prefix space if needed
630
+ add_prefix_space = (
631
+ should_prefix_space_be_added_to_labels(
632
+ labels_to_be_generated=local_labels, tokenizer=tokenizer
633
+ )
634
+ and tokenizer.chat_template is None
635
+ )
636
+ first_tokens = [
637
+ tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
638
+ for label in local_labels
639
+ ]
640
+ first_tokens = [
641
+ re.sub(
642
+ pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
643
+ )
644
+ for token in first_tokens
645
+ ]
646
+
647
+ # Build a mapping from labels to the first token in each label if the first
648
+ # tokens are distinct
649
+ if len(first_tokens) == len(set(first_tokens)):
650
+ log_once(
651
+ "The model will output scores, since the first tokens of the labels "
652
+ "are distinct.",
653
+ level=logging.DEBUG,
654
+ )
655
+ return {
656
+ label: first_token
657
+ for label, first_token in zip(local_labels, first_tokens)
658
+ }
659
+ else:
660
+ log_once(
661
+ "The model will not output scores, since the first tokens of the "
662
+ "labels are not distinct. The first tokens for the labels "
663
+ f"{local_labels} are {first_tokens}"
664
+ )
665
+ return False
666
+
667
+ # Otherwise, we assume that the model should not output scores, to avoid potential
668
+ # evaluation errors. This will force the label extraction to rely on word edit
669
+ # distance instead of logprobs.
670
+ log_once(
671
+ "The model will not output scores, since the dataset does not have labels.",
672
+ level=logging.DEBUG,
673
+ )
674
+ return False
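
get_first_label_token_mapping decides between logprob-based and edit-distance-based label extraction: it returns a label-to-first-token dict only when every label starts with a distinct token under the model's tokenizer, True when no tokenizer is available but the task group supports logprobs, and False otherwise. A rough sketch of the distinctness check, using a whitespace "tokenizer" stand-in instead of a real Hugging Face tokenizer:

```python
import re

def first_token(label: str) -> str:
    # Stand-in for tokenizer.tokenize(...)[0] plus the lower-casing/cleanup regex
    token = label.split()[0] if label.split() else label
    return re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", token.lower())

def sketch_first_label_token_mapping(labels: list[str]) -> dict[str, str] | bool:
    first_tokens = [first_token(label) for label in labels]
    if len(first_tokens) == len(set(first_tokens)):
        return dict(zip(labels, first_tokens))  # distinct -> safe to use logprobs
    return False                                 # clash -> fall back to edit distance

print(sketch_first_label_token_mapping(["positiv", "negativ", "neutral"]))
# {'positiv': 'positiv', 'negativ': 'negativ', 'neutral': 'neutral'}
print(sketch_first_label_token_mapping(["ja", "ja, delvist"]))  # False (both start with 'ja')
```
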
euroeval-15.5.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.4.2
3
+ Version: 15.5.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -37,7 +37,7 @@ Requires-Dist: demjson3>=3.0.6
37
37
  Requires-Dist: evaluate>=0.4.1
38
38
  Requires-Dist: huggingface-hub>=0.24.0
39
39
  Requires-Dist: levenshtein>=0.24.0
40
- Requires-Dist: litellm>=1.61.13
40
+ Requires-Dist: litellm>=1.63.0
41
41
  Requires-Dist: more-itertools>=10.5.0
42
42
  Requires-Dist: numpy<2.0.0,>=1.23.0
43
43
  Requires-Dist: ollama>=0.4.7
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
62
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
64
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
65
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
69
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
70
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
71
71
  Provides-Extra: human-evaluation
72
72
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
73
  Provides-Extra: test
@@ -218,6 +218,7 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
218
218
  $ uv run src/scripts/create_allocine.py
219
219
  ```
220
220
 
221
+
221
222
  ## Special Thanks :pray:
222
223
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
223
224
  models on the leaderboards.
euroeval-15.5.0.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- euroeval/__init__.py,sha256=l3V3ybiCj0I193jvn8wS9VK4UEc9ajiOq4SojChH6Xs,2615
1
+ euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
2
2
  euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
3
- euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
3
+ euroeval/benchmarker.py,sha256=8Qt1NL7k5n-AfFrhR6139wmmsVS7CgRa-QjminH0d_c,47849
4
4
  euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
5
5
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
6
- euroeval/constants.py,sha256=zL8dm7SEFpIgC2vaPhqzdKydVSWW-ZyMHenWPnNxWqQ,1681
6
+ euroeval/constants.py,sha256=CJavEDvKLSKAC4uyz44sFrY1W1AnjUsxkXF63SoMjw4,1985
7
7
  euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
8
- euroeval/data_models.py,sha256=b4rOMdhoxkIPcnTQdwqq5iWaF6uia1OzAgdiOBvoGVM,14779
9
- euroeval/dataset_configs.py,sha256=C5Gnp95cBeCmmuRA8Rznt0c4gMOn8Pilk_kDCleDMjg,90640
8
+ euroeval/data_models.py,sha256=QssdR_msDTmsp9yKe0cVba0iCpgBTFTOaOUn44o1cl8,14770
9
+ euroeval/dataset_configs.py,sha256=6WiRW-VAAMIL6-1J6Nb6pCm6mf4I-oQ087zB0es3HHs,90644
10
10
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
11
11
  euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
12
12
  euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
20
20
  euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
21
21
  euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
22
22
  euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
23
- euroeval/utils.py,sha256=CFjYMoKdcxLUEM-aF3pxf_3TnGWvGasjfb8pDMJVe9U,18772
23
+ euroeval/utils.py,sha256=bbq7WCcIrMKjBRaZ8EcnRpRAvL_F-tCxiL0We_po3QE,22397
24
24
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
25
  euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
26
26
  euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
27
- euroeval/benchmark_modules/hf.py,sha256=Typig7WDqOn_uGE24s_P_9PHvq-V0MrKGD7xbh0aYnk,43244
28
- euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
29
- euroeval/benchmark_modules/vllm.py,sha256=O8-dcVkU2jgZer44EOeTC8E4d-xQjPDOXnoyzXxAToQ,46179
27
+ euroeval/benchmark_modules/hf.py,sha256=VcgWZmSZc4B3FgeUGC0eWQIRv97luU22-KijaBfuqU0,43602
28
+ euroeval/benchmark_modules/litellm.py,sha256=pbTsq6Bb8cnFbdZMUSrUs-XlNAyaCIWNcEKKRIfprx8,45161
29
+ euroeval/benchmark_modules/vllm.py,sha256=7AZrvcwHevrQbXvbjTCp4S6HpM0Obk6CIQLbmUWIn9s,47483
30
30
  euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
31
31
  euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
32
32
  euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
33
- euroeval/task_utils/sequence_classification.py,sha256=832iWpPR3CsnlBIYA976eN21WUFQLUmIlDxFIvOsROk,10266
33
+ euroeval/task_utils/sequence_classification.py,sha256=JDZfiTj5RdwYwlhhTqVBj2mVdwmkoykZ6wJzEbWj0lo,12225
34
34
  euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
35
35
  euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
36
- euroeval-15.4.2.dist-info/METADATA,sha256=cvpyWIKPXNKn1Idv7w3C7z8MBVljmw50jBdskL_32oI,10752
37
- euroeval-15.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
- euroeval-15.4.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
39
- euroeval-15.4.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
40
- euroeval-15.4.2.dist-info/RECORD,,
36
+ euroeval-15.5.0.dist-info/METADATA,sha256=T48YoPuFBEFI5sxgUadzkD3tidIB3TA1mKEKsFuh7fs,10752
37
+ euroeval-15.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
+ euroeval-15.5.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
39
+ euroeval-15.5.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
40
+ euroeval-15.5.0.dist-info/RECORD,,