EuroEval 15.9.1__py3-none-any.whl → 15.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic.

@@ -378,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         tokenizer=self._tokenizer,
     ),
     batched=True,
-    batch_size=1,
+    batch_size=10,
     remove_columns=dataset["train"].column_names,
     load_from_cache_file=False,
     keep_in_memory=True,
@@ -389,7 +389,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         tokenizer=self._tokenizer,
     ),
     batched=True,
-    batch_size=1,
+    batch_size=10,
    remove_columns=dataset["val"].column_names,
     load_from_cache_file=False,
     keep_in_memory=True,
@@ -400,7 +400,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         tokenizer=self._tokenizer,
     ),
     batched=True,
-    batch_size=1,
+    batch_size=10,
     remove_columns=dataset["test"].column_names,
     load_from_cache_file=False,
     keep_in_memory=True,
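The three hunks above raise the `batch_size` passed to `datasets.Dataset.map` from 1 to 10 when tokenizing the train, validation and test splits, so the preprocessing function is called once per batch of ten rows instead of once per row. A minimal, self-contained sketch of the same kind of `map` call, with a toy `tokenize` function standing in for EuroEval's actual preprocessing:

```python
# Sketch of `Dataset.map` batching, assuming the `datasets` package is installed;
# the toy `tokenize` function is a stand-in for EuroEval's real preprocessing.
from datasets import Dataset

dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(100)]})

def tokenize(batch: dict[str, list]) -> dict[str, list]:
    # With batched=True, `batch["text"]` holds up to `batch_size` texts, so this
    # function runs ~10 times instead of 100 times.
    return {"num_chars": [len(text) for text in batch["text"]]}

tokenized = dataset.map(
    tokenize,
    batched=True,
    batch_size=10,  # was 1 in EuroEval 15.9.1, now 10
    remove_columns=dataset.column_names,
    load_from_cache_file=False,
    keep_in_memory=True,
)
print(tokenized[0])
```

With `batched=True`, the `batch_size` only changes how many rows each call receives; the output is identical, but far fewer Python-level calls are made.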
@@ -1,5 +1,6 @@
 """Generative models from an inference API, using the LiteLLM framework."""
 
+import asyncio
 import collections.abc as c
 import logging
 import os
@@ -29,6 +30,7 @@ from litellm.exceptions import (
     Timeout,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
+from litellm.router import Router
 from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
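The new `Router` import is used further down to funnel all completion requests through a single LiteLLM client. A minimal stand-alone sketch of how such a router is constructed and called; the model name is only an example, and a matching API key (e.g. `OPENAI_API_KEY`) is assumed to be set in the environment:

```python
# Sketch of a minimal LiteLLM Router set-up that reuses one client for all requests.
from litellm.router import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o-mini",                 # example alias
            "litellm_params": {"model": "gpt-4o-mini"},  # credentials come from the environment
        }
    ]
)
response = router.completion(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```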
@@ -68,7 +70,7 @@ from ..task_group_utils import (
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
-    catch_coroutine_exception,
+    add_semaphore_and_catch_exception,
     create_model_cache_dir,
     log_once,
     safe_run,
@@ -201,6 +203,11 @@ class LiteLLMModel(BenchmarkModule):
         self.is_ollama = model_config.model_id.startswith(
             "ollama/"
         ) or model_config.model_id.startswith("ollama_chat/")
+        self._ollama_show: ollama.ShowResponse = (
+            ollama.show("/".join(model_config.model_id.split("/")[1:]))
+            if self.is_ollama
+            else ollama.ShowResponse(model_info=None)
+        )
 
         raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
 
@@ -224,7 +231,14 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if self.model_config.revision == "thinking":
+        if self.is_ollama:
+            reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
+            type_ = (
+                GenerativeType.REASONING
+                if reasoning_model
+                else GenerativeType.INSTRUCTION_TUNED
+            )
+        elif self.model_config.revision in {"thinking"}:
             type_ = GenerativeType.REASONING
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
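The generative-type logic now asks the local Ollama server which capabilities the model advertises (via the `ollama.show()` response cached in `__init__` above), treating models that report a `thinking` capability as reasoning models; the `revision == "thinking"` convention is only used for non-Ollama models. A sketch of the capability probe, assuming a running Ollama server with the model pulled; the model name is only an example:

```python
# Sketch of the Ollama capability probe; "qwen3" is an example model name.
import ollama

model_id = "ollama_chat/qwen3"                       # EuroEval-style model ID
ollama_model_id = "/".join(model_id.split("/")[1:])  # strip the "ollama_chat/" prefix

show_response = ollama.show(ollama_model_id)
is_reasoning = "thinking" in (show_response.capabilities or [])
print("reasoning model" if is_reasoning else "instruction-tuned model")
```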
@@ -251,8 +265,18 @@ class LiteLLMModel(BenchmarkModule):
             The generated model outputs.
         """
         assert "messages" in inputs, "The input must contain a 'messages' key."
-        messages = inputs["messages"]
+        conversations: list[list[litellm.AllMessageValues]] = inputs["messages"]
 
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config,
+            model_config=self.model_config,
+            tokenizer=None,
+            generative_type=self.generative_type,
+        )
+
+        # Set the core generation arguments
         generation_kwargs: dict[str, t.Any] = dict(
             model=self.model_config.model_id,
             max_completion_tokens=(
@@ -266,33 +290,30 @@ class LiteLLMModel(BenchmarkModule):
             api_key=self.benchmark_config.api_key,
             api_base=self.benchmark_config.api_base,
             api_version=self.benchmark_config.api_version,
+            max_retries=3,
         )
 
-        # Get the mapping from labels to the first token in the label. We call this each
-        # time we generate a new dataset since the dataset config can change
-        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-            dataset_config=self.dataset_config,
-            model_config=self.model_config,
-            tokenizer=None,
-            generative_type=self.generative_type,
-        )
-
-        if self.buffer["first_label_token_mapping"]:
-            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-
+        # Set up the `response_format` generation argument if we are dealing with a task
+        # using structured generation
         if self.dataset_config.task in TASKS_USING_JSON:
-            for msg_list in messages:
-                # msg_list is a list of {'role':…, 'content':…} dicts
-                if not msg_list:
+            # Sanity check that "JSON" is included in the prompt, as some models require
+            # this
+            for conversation in conversations:
+                if not conversation:
                     raise InvalidBenchmark(
-                        "Encountered an empty message list in 'messages'."
+                        "Encountered an empty conversation in 'messages'."
                     )
-                last = msg_list[-1]
-                assert isinstance(last, dict), (
-                    f"Expected dict message, got {type(last)}"
+                last_message = conversation[-1]
+                assert isinstance(last_message, dict), (
+                    f"Expected dict message, got {type(last_message)}"
                 )
-                assert "json" in last["content"].lower(), (
+                assert "content" in last_message, (
+                    "Expected 'content' key in the last message of the conversation."
+                )
+                assert isinstance(last_message["content"], str), (
+                    "Expected 'content' to be a string."
+                )
+                assert "json" in last_message["content"].lower(), (
                     "Prompt must contain 'json' for JSON tasks."
                 )
 
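The reworked sanity check walks every conversation and asserts that the last message is a dict whose string `content` mentions "json", since some providers refuse `response_format=json_object` requests whose prompt never mentions JSON. A stripped-down version of the same validation, with a made-up sample conversation and a plain `ValueError` standing in for EuroEval's `InvalidBenchmark`:

```python
# Stand-alone sketch of the JSON-prompt sanity check; the conversation is made up.
conversations = [
    [
        {"role": "system", "content": "You are a named entity tagger."},
        {"role": "user", "content": "Return the entities as a JSON dictionary."},
    ]
]

for conversation in conversations:
    if not conversation:
        raise ValueError("Encountered an empty conversation in 'messages'.")
    last_message = conversation[-1]
    assert isinstance(last_message, dict), (
        f"Expected dict message, got {type(last_message)}"
    )
    assert "content" in last_message, "Expected 'content' key in the last message."
    assert isinstance(last_message["content"], str), "Expected 'content' to be a string."
    assert "json" in last_message["content"].lower(), (
        "Prompt must contain 'json' for JSON tasks."
    )
```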
@@ -328,6 +349,19 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
 
+        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+            generation_kwargs["think"] = True
+            log_once(
+                "Enabling thinking mode for Ollama model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # Handle manually set parameters
+        if self.buffer["first_label_token_mapping"]:
+            generation_kwargs["logprobs"] = True
+            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
         if self.model_config.revision == "thinking":
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
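The logprobs flags are now set alongside the other manually handled parameters, and Ollama reasoning models additionally get `think=True`. For the logprobs case, a sketch of what the resulting LiteLLM call looks like; the model name is an example, and a matching API key must be available in the environment:

```python
# Sketch of requesting token log-probabilities through LiteLLM (EuroEval sends the
# same kwargs through a Router rather than calling `litellm.completion` directly).
import litellm

litellm.drop_params = True  # silently drop kwargs the chosen provider does not support

response = litellm.completion(
    model="gpt-4o-mini",  # example model ID
    messages=[{"role": "user", "content": "Answer with a single word: yes or no?"}],
    max_completion_tokens=1,
    temperature=0.0,
    logprobs=True,
    top_logprobs=10,
)
print(response.choices[0].logprobs)
```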
@@ -344,66 +378,67 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
 
-        # This drops generation kwargs that are not supported by the model
+        # Drop generation kwargs that are not supported by the model
         litellm.drop_params = True
 
-        # Extract the generated sequences from the model response. Some APIs cannot
-        # handle using newlines as stop sequences, so we try both.
-        num_attempts = 10
-
-        all_responses = {}
-        all_failures = []
-        to_run = list(enumerate(messages))
-
-        for attempt in range(num_attempts):
-            if not to_run:
+        all_responses: dict[int, ModelResponse] = {}
+        conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
+            enumerate(conversations)
+        )
+        for attempt in range(num_attempts := 10):
+            if not conversations_to_run:
                 break
 
-            batch_indices, batch_msgs = zip(*to_run)
-            model_response, failures = safe_run(
+            batch_indices, batch_conversations = zip(*conversations_to_run)
+            successes, failures = safe_run(
                 self._generate_async(
-                    messages=list(batch_msgs),
-                    generation_kwargs=generation_kwargs,
-                    max_retries=3,
-                    max_reruns=15,
+                    model_id=self.model_config.model_id,
+                    conversations=list(batch_conversations),
+                    **generation_kwargs,
                 )
             )
 
-            for orig_idx, response in zip(batch_indices, model_response):
+            # Store the successful model outputs
+            for idx, response in successes:
+                orig_idx = batch_indices[idx]
                 all_responses[orig_idx] = response
 
+            # If all requests were successful, break
             if not failures:
-                to_run = []
+                conversations_to_run = []
                 break
 
-            all_failures.extend(failures)
-            to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
+            # Put the failed requests back in the queue to try again
+            conversations_to_run = [
+                (batch_indices[idx], conversations[batch_indices[idx]])
+                for idx, _ in failures
+            ]
             logger.debug(
-                f"Attempt {attempt + 1}/{num_attempts}: "
-                f"retrying {len(to_run)} failed message(s)"
+                f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
+                f"{len(conversations_to_run):,} failed message(s)"
             )
 
+            # Attempt to handle the exceptions, to improve the chance of getting
+            # successful generations next time around
             for _, error in failures:
                 self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-        else:
-            raise InvalidBenchmark(
-                message=f"Failed to generate text, after {num_attempts} attempts."
-            )
 
-        if to_run:
+            # Sleep for a second to avoid pinging the API server too quickly
+            sleep(1)
+        else:
             raise InvalidBenchmark(
-                f"Failed to generate text after {num_attempts} attempts. "
-                f"Errors: {all_failures}"
+                message=f"Failed to generate text, after {num_attempts:,} attempts."
             )
 
-        ordered_responses = [all_responses[i] for i in range(len(messages))]
+        # Extract the generations from the model output
+        ordered_responses = [all_responses[i] for i in range(len(conversations))]
         model_output = self._create_model_output(
             model_responses=ordered_responses, model_id=self.model_config.model_id
         )
 
-        if len(messages) != len(model_output.sequences):
+        if len(conversations) != len(model_output.sequences):
            raise InvalidBenchmark(
-                f"Number of model inputs ({len(messages):,}) does not match the "
+                f"Number of model inputs ({len(conversations):,}) does not match the "
                 f"number of model outputs ({len(model_output.sequences):,})."
             )
 
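The rewritten retry loop keeps a queue of still-failing conversations, retries only those, and relies on Python's `for ... else` clause: the `else` branch (raising `InvalidBenchmark`) only runs when all `num_attempts` passes finish without a `break`. A minimal sketch of the same control flow, with a fake flaky call in place of `safe_run(self._generate_async(...))`:

```python
# Sketch of the for/else retry pattern; `flaky_call` stands in for an LLM request.
import random

def flaky_call(item: int) -> str:
    """Stand-in for a single request that sometimes fails transiently."""
    if random.random() < 0.3:
        raise RuntimeError(f"transient failure for item {item}")
    return f"response for item {item}"

items = list(range(5))
all_responses: dict[int, str] = {}
to_run = list(enumerate(items))

for attempt in range(num_attempts := 10):
    if not to_run:
        break
    still_failing = []
    for orig_idx, item in to_run:
        try:
            all_responses[orig_idx] = flaky_call(item)
        except RuntimeError:
            still_failing.append((orig_idx, item))
    if not still_failing:
        break
    to_run = still_failing
    print(f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying {len(to_run):,} item(s)")
else:
    # Only reached when the loop exhausts all attempts without hitting `break`
    raise RuntimeError(f"Failed after {num_attempts:,} attempts.")

print([all_responses[i] for i in range(len(items))])
```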
@@ -462,8 +497,8 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
-            generation_kwargs.pop("logprobs")
-            generation_kwargs.pop("top_logprobs")
+            generation_kwargs.pop("logprobs", None)
+            generation_kwargs.pop("top_logprobs", None)
             return
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
@@ -471,7 +506,7 @@ class LiteLLMModel(BenchmarkModule):
                 "temperature, so disabling it.",
                 level=logging.DEBUG,
             )
-            generation_kwargs.pop("temperature")
+            generation_kwargs.pop("temperature", None)
             return
         elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
             log_once(
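Both fixes above switch from `dict.pop(key)` to `dict.pop(key, None)`, so removing an already-removed key (for example when the same error is handled twice) becomes a no-op instead of a `KeyError`:

```python
generation_kwargs = {"temperature": 0.0, "logprobs": True, "top_logprobs": 10}

generation_kwargs.pop("logprobs", None)  # removes the key
generation_kwargs.pop("logprobs", None)  # already gone: no-op instead of KeyError
# generation_kwargs.pop("logprobs")      # without the default this would raise KeyError
print(generation_kwargs)                 # {'temperature': 0.0, 'top_logprobs': 10}
```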
@@ -503,14 +538,7 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs["response_format"] = dict(type="json_object")
             return
         elif isinstance(
-            error,
-            (
-                APIConnectionError,
-                Timeout,
-                ServiceUnavailableError,
-                InternalServerError,
-                SystemError,
-            ),
+            error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
             logger.debug(
                 f"Service temporarily unavailable. The error message was: {error}. "
@@ -518,6 +546,18 @@ class LiteLLMModel(BenchmarkModule):
             )
             sleep(5)
             return
+        elif isinstance(error, (APIConnectionError, OSError)):
+            # If there are too many I/O connections, we increase the number of allowed
+            # file descriptors
+            if "too many open files" in error_msg:
+                raise InvalidBenchmark(
+                    "There are too many file descriptors running. See the current "
+                    "value by running `ulimit -n`. Try increasing it by running "
+                    "`ulimit -n <new-value>` and try again."
+                )
+            raise InvalidBenchmark(
+                f"Encountered {type(error)} during generation: {error}."
+            )
 
         if isinstance(error, RateLimitError):
             raise InvalidModel(
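The new branch turns `APIConnectionError`/`OSError` failures caused by hitting the process's file-descriptor limit into an actionable `InvalidBenchmark` error pointing at `ulimit -n`. On POSIX systems the same limit can be inspected, and raised up to the hard limit, from Python via the `resource` module; a small sketch that is not part of EuroEval itself:

```python
# POSIX-only sketch: inspect the file-descriptor limits the "too many open files"
# check refers to, and raise the soft limit if it is low.
import resource

soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"soft limit: {soft_limit}, hard limit: {hard_limit}")

# Raising the soft limit up to the hard limit needs no root privileges and is the
# Python equivalent of running `ulimit -n <new-value>` in the shell.
if soft_limit != resource.RLIM_INFINITY and soft_limit < 4096 <= hard_limit:
    resource.setrlimit(resource.RLIMIT_NOFILE, (4096, hard_limit))
```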
@@ -538,69 +578,66 @@ class LiteLLMModel(BenchmarkModule):
 
     async def _generate_async(
         self,
-        messages: list[dict[str, t.Any]],
-        generation_kwargs: dict[str, t.Any],
-        max_retries: int,
-        max_reruns: int,
-    ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
+        model_id: str,
+        conversations: list[list[litellm.AllMessageValues]],
+        **generation_kwargs,
+    ) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.
 
         Args:
-            messages:
-                The messages to pass to the model.
-            generation_kwargs:
-                The generation kwargs to pass to the model.
-            max_retries:
-                The maximum number of retries to make.
-            max_reruns:
-                The maximum number of reruns to make.
+            model_id:
+                The ID of the model to use for generation.
+            conversations:
+                The conversations to pass to the model.
+            **generation_kwargs:
+                Additional generation arguments to pass to the model.
 
         Returns:
-            A tuple containing the successful responses and the failed responses.
+            A tuple (successes, failures), each being a list of tuples (idx, content),
+            where the `idx` corresponds to the index of `conversations`, and `content`
+            is either the model response or an Exception.
         """
-        success = []
-        all_failures = {}
-        to_run = list(enumerate(messages))
-        prev_fail_count = len(to_run)
-        rerun_count = 0
-
-        while to_run and rerun_count < max_reruns and prev_fail_count > 0:
-            requests = [
-                litellm.acompletion(
-                    messages=msg, max_retries=max_retries, **generation_kwargs
+        # Create a LiteLLM router, which will ensure that we only use a single client
+        # for all the requests, preventing "too many open files" errors
+        router = Router(
+            model_list=[
+                dict(
+                    model_name=self.model_config.model_id,
+                    litellm_params=generation_kwargs,
                 )
-                for _, msg in to_run
             ]
-            wrapped_requests = [
-                catch_coroutine_exception(request) for request in requests
-            ]
-            responses = await tqdm_async.gather(*wrapped_requests, leave=False)
-
-            next_to_run = []
-            current_fail_count = 0
+        )
 
-            for (orig_idx, _), response in zip(to_run, responses):
-                if isinstance(response, Exception):
-                    current_fail_count += 1
-                    all_failures[orig_idx] = response
-                    next_to_run.append((orig_idx, messages[orig_idx]))
-                else:
-                    success.append(response)
+        # Get the LLM generations asynchronously
+        max_concurrent_calls = 20
+        semaphore = asyncio.Semaphore(max_concurrent_calls)
+        requests = [
+            add_semaphore_and_catch_exception(
+                router.acompletion(model=model_id, messages=conversation),
+                semaphore=semaphore,
+            )
+            for conversation in conversations
+        ]
+        responses = await tqdm_async.gather(*requests, leave=False)
 
-            if current_fail_count >= prev_fail_count:
-                logger.warning(
-                    "Retry loop aborting due to no progress: "
-                    f"current_fail_count={current_fail_count}, "
-                    f"prev_fail_count={prev_fail_count}"
-                )
-                break
+        # Separate the successful responses from the failed ones
+        successes = [
+            (idx, response)
+            for idx, response in enumerate(responses)
+            if not isinstance(response, Exception)
+        ]
+        failures = [
+            (idx, response)
+            for idx, response in enumerate(responses)
+            if isinstance(response, Exception)
+        ]
 
-            prev_fail_count = current_fail_count
-            to_run = next_to_run
-            rerun_count += 1
+        # Close connections
+        for request in requests:
+            if hasattr(request, "close"):
+                request.close()
 
-        failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
-        return success, failures
+        return successes, failures
 
     @staticmethod
     def _create_model_output(
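The `_generate_async` rewrite replaces the inner rerun loop with a single pass: every conversation is sent through a shared LiteLLM `Router`, wrapped by the new `add_semaphore_and_catch_exception` helper so that at most `max_concurrent_calls` requests are in flight at once and exceptions are returned rather than raised. The helper lives in `euroeval.utils` and is not shown in this diff, so the wrapper below is only an illustrative stand-in for the pattern, using plain `asyncio.gather` and a fake API call:

```python
# Sketch of the semaphore-plus-gather pattern; the wrapper is a guess at what the
# real `add_semaphore_and_catch_exception` helper does, not EuroEval's implementation.
import asyncio
import typing as t

T = t.TypeVar("T")

async def add_semaphore_and_catch_exception(
    coroutine: t.Coroutine[t.Any, t.Any, T], semaphore: asyncio.Semaphore
) -> t.Union[T, Exception]:
    """Run the coroutine under the semaphore, returning exceptions instead of raising."""
    async with semaphore:
        try:
            return await coroutine
        except Exception as exception:
            return exception

async def fake_api_call(idx: int) -> str:
    await asyncio.sleep(0.01)
    if idx % 7 == 0:
        raise RuntimeError(f"request {idx} failed")
    return f"response {idx}"

async def main() -> None:
    semaphore = asyncio.Semaphore(20)  # at most 20 requests in flight at once
    requests = [
        add_semaphore_and_catch_exception(fake_api_call(idx), semaphore=semaphore)
        for idx in range(50)
    ]
    responses = await asyncio.gather(*requests)
    successes = [(i, r) for i, r in enumerate(responses) if not isinstance(r, Exception)]
    failures = [(i, r) for i, r in enumerate(responses) if isinstance(r, Exception)]
    print(f"{len(successes)} succeeded, {len(failures)} failed")

asyncio.run(main())
```

Sharing one `Router` (and hence one underlying HTTP client) across all requests, instead of letting each `litellm.acompletion` call open its own connections, is what addresses the "too many open files" failures handled earlier in this diff.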
@@ -690,8 +727,7 @@ class LiteLLMModel(BenchmarkModule):
         # If it is an Ollama model then we can get the number of parameters from the
         # Ollama Python SDK
         if self.is_ollama:
-            ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
-            model_info = ollama.show(ollama_model_id).modelinfo
+            model_info = self._ollama_show.modelinfo
             if model_info is not None:
                 num_params = model_info.get("general.parameter_count")
                 if num_params is not None:
@@ -819,7 +855,7 @@ class LiteLLMModel(BenchmarkModule):
         # Python SDK
         if self.is_ollama:
             ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
-            model_info = ollama.show(ollama_model_id).modelinfo
+            model_info = self._ollama_show.modelinfo
             if model_info is not None:
                 context_length_keys = [
                     key for key in model_info.keys() if "context_length" in key.lower()
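Both of the last two hunks reuse the `ollama.show()` response cached in `self._ollama_show` instead of calling the Ollama API again for each metadata lookup. A short sketch of reading the parameter count and context-length keys from one cached response, assuming a local Ollama server; the model name is only an example:

```python
# Sketch of reusing a single cached `ollama.show()` response; "llama3" is an example.
import ollama

show_response = ollama.show("llama3")  # call once and reuse the result

model_info = show_response.modelinfo
if model_info is not None:
    num_params = model_info.get("general.parameter_count")
    context_length_keys = [
        key for key in model_info.keys() if "context_length" in key.lower()
    ]
    print(f"parameters: {num_params}, context-length keys: {context_length_keys}")
```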