EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (65)
  1. euroeval/__init__.py +7 -4
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +5 -2
  5. euroeval/benchmark_modules/hf.py +107 -66
  6. euroeval/benchmark_modules/litellm.py +103 -55
  7. euroeval/benchmark_modules/vllm.py +155 -82
  8. euroeval/benchmarker.py +184 -129
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +1 -1
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +14 -11
  14. euroeval/data_models.py +12 -4
  15. euroeval/dataset_configs/__init__.py +3 -0
  16. euroeval/dataset_configs/czech.py +79 -0
  17. euroeval/dataset_configs/danish.py +10 -13
  18. euroeval/dataset_configs/dutch.py +0 -3
  19. euroeval/dataset_configs/english.py +0 -3
  20. euroeval/dataset_configs/estonian.py +11 -1
  21. euroeval/dataset_configs/finnish.py +0 -3
  22. euroeval/dataset_configs/french.py +0 -3
  23. euroeval/dataset_configs/german.py +0 -3
  24. euroeval/dataset_configs/italian.py +0 -3
  25. euroeval/dataset_configs/latvian.py +2 -4
  26. euroeval/dataset_configs/lithuanian.py +68 -0
  27. euroeval/dataset_configs/norwegian.py +0 -3
  28. euroeval/dataset_configs/polish.py +0 -3
  29. euroeval/dataset_configs/portuguese.py +0 -3
  30. euroeval/dataset_configs/slovak.py +60 -0
  31. euroeval/dataset_configs/spanish.py +0 -3
  32. euroeval/dataset_configs/swedish.py +10 -15
  33. euroeval/finetuning.py +21 -15
  34. euroeval/generation.py +10 -10
  35. euroeval/generation_utils.py +2 -3
  36. euroeval/logging_utils.py +250 -0
  37. euroeval/metrics/base.py +0 -3
  38. euroeval/metrics/huggingface.py +10 -6
  39. euroeval/metrics/llm_as_a_judge.py +5 -3
  40. euroeval/metrics/pipeline.py +22 -9
  41. euroeval/metrics/speed.py +0 -3
  42. euroeval/model_cache.py +11 -14
  43. euroeval/model_config.py +4 -5
  44. euroeval/model_loading.py +3 -0
  45. euroeval/prompt_templates/linguistic_acceptability.py +30 -3
  46. euroeval/prompt_templates/multiple_choice.py +34 -1
  47. euroeval/prompt_templates/named_entity_recognition.py +71 -11
  48. euroeval/prompt_templates/reading_comprehension.py +41 -3
  49. euroeval/prompt_templates/sentiment_classification.py +34 -1
  50. euroeval/prompt_templates/summarization.py +26 -6
  51. euroeval/scores.py +7 -7
  52. euroeval/speed_benchmark.py +3 -5
  53. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  54. euroeval/task_group_utils/question_answering.py +0 -3
  55. euroeval/task_group_utils/sequence_classification.py +43 -31
  56. euroeval/task_group_utils/text_to_text.py +17 -8
  57. euroeval/task_group_utils/token_classification.py +10 -9
  58. euroeval/tokenisation_utils.py +22 -20
  59. euroeval/utils.py +30 -147
  60. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
  61. euroeval-16.4.0.dist-info/RECORD +75 -0
  62. euroeval-16.2.2.dist-info/RECORD +0 -70
  63. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  64. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  65. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import cache, cached_property, partial
+from functools import cached_property, partial
 from time import sleep
 
 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm
 
+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
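
Note: the new euroeval/logging_utils.py module itself (+250 lines in the file list above) is not included in this section, so the following is only a minimal sketch of what wrappers with these call signatures could look like, inferred purely from the call sites in this diff (log(msg, level=...), log_once(msg, level=...), get_pbar(desc=..., ...)); the real implementation will differ.

# Hypothetical sketch only -- the real euroeval/logging_utils.py is not shown in this diff.
import logging

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log(message: str, level: int = logging.INFO) -> None:
    """Log a message at the given level on the shared 'euroeval' logger."""
    logger.log(level, message)


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log a message only the first time it is seen, to avoid repeated warnings."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logger.log(level, message)


def get_pbar(**tqdm_kwargs) -> tqdm:
    """Return a tqdm progress bar with shared default styling (assumed defaults)."""
    tqdm_kwargs.setdefault("colour", "yellow")
    tqdm_kwargs.setdefault("leave", False)
    return tqdm(**tqdm_kwargs)
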
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer
 
-logger = logging.getLogger("euroeval")
-
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
@@ -367,9 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-            logger.debug(
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(inputs_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
@@ -421,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -437,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +461,13 @@ class LiteLLMModel(BenchmarkModule):
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
             "got an unexpected keyword argument 'response_format'",
-            "The model outputs empty dictionaries.",
+            "the model returned empty outputs",
         ]
 
-        if any(msg.lower() in error_msg for msg in stop_messages):
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-            logger.debug(
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if isinstance(error, RateLimitError):
-            raise InvalidModel(
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "Skipping."
-            ) from error
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
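
The new BadRequestError branch parses a suggested retry delay out of the provider's error text. A standalone demonstration of that regex on a made-up error message (the message is a sample, not an actual provider response):

# Demonstration of the retry-delay parsing added above; the error message is invented.
import re

error_msg = "rate limit exceeded for this request, please retry in 7.5 seconds"
retry_match = re.search(
    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
    string=error_msg,
)
if retry_match is not None:
    retry_seconds = float(retry_match.group(1))
    print(f"Sleeping for {retry_seconds:.1f} seconds before retrying")  # 7.5 seconds
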
@@ -711,19 +749,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(*requests, leave=False)
-
-        # If we are performing structured generation and the model just outputs an empty
-        # dictionary, then we convert those to exceptions, to disable structured
-        # generation
-        if "response_format" in generation_kwargs:
-            responses = [
-                RuntimeError("The model outputs empty dictionaries.")
-                if not isinstance(response, Exception)
-                and any(choice.message.content == "{}" for choice in response.choices)
-                else response
-                for response in responses
-            ]
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )
 
         # Separate the successful responses from the failed ones
         successes = [
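
The ValueError message chosen here matches the new "the model returned empty outputs" entry in response_format_messages earlier in the file, so the exception handler can recognise the failure on the retry pass; presumably that branch then drops response_format, although the branch itself falls outside this section. A simplified illustration of the string matching only (EuroEval's actual handler method and signature are not shown here):

# Simplified illustration of how the new error message matches the updated list.
error = ValueError("The model returned empty outputs.")
error_msg = str(error).lower()
response_format_messages = [
    "got an unexpected keyword argument 'response_format'",
    "the model returned empty outputs",
]
if any(msg.lower() in error_msg for msg in response_format_messages):
    print("Recognised as a structured-generation failure; retry without response_format")
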
@@ -743,7 +781,9 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-                logger.debug(f"RuntimeError during request.close(): {e}")
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )
 
         return successes, failures
 
@@ -768,10 +808,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-                logger.warning(
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue
 
@@ -859,11 +900,12 @@ class LiteLLMModel(BenchmarkModule):
             scores.append(logprobs_list)
 
         if not sequences:
-            logger.warning(
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
 
@@ -984,7 +1026,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
-            model_cache_dir=self.model_config.model_cache_dir,
+            model_config=self.model_config,
         )
 
         if (
@@ -1067,7 +1109,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
-            model_cache_dir=self.model_config.model_cache_dir,
+            model_config=self.model_config,
         )
 
         all_max_lengths: list[int] = list()
@@ -1142,6 +1184,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1217,17 +1260,19 @@ class LiteLLMModel(BenchmarkModule):
                 ServiceUnavailableError,
                 InternalServerError,
             ) as e:
-                logger.debug(
+                log(
                     f"Service temporarily unavailable. The error message was: {e}. "
-                    "Retrying in 10 seconds..."
+                    "Retrying in 10 seconds...",
+                    level=logging.DEBUG,
                 )
                 sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
-                logger.warning(
+                log(
                     f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                    "seconds..."
+                    "seconds...",
+                    level=logging.WARNING,
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -1240,21 +1285,25 @@ class LiteLLMModel(BenchmarkModule):
                     case 0:
                         pass
                     case 1:
-                        logger.warning(
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-                            f"{candidate_models[0]!r}?"
+                            f"{candidate_models[0]!r}?",
+                            level=logging.WARNING,
                         )
                     case _:
                         candidate_models_str = "', '".join(candidate_models)
-                        logger.warning(
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-                            f"any of the following model IDs: '{candidate_models_str}'?"
+                            "any of the following model IDs: "
+                            f"'{candidate_models_str}'?",
+                            level=logging.WARNING,
                         )
                 return False
         else:
-            logger.error(
+            log(
                 f"Failed to check if model {model_id!r} exists after {num_attempts} "
-                "attempts. Assuming it does not exist."
+                "attempts. Assuming it does not exist.",
+                level=logging.ERROR,
             )
             return False
 
@@ -1287,7 +1336,8 @@ class LiteLLMModel(BenchmarkModule):
             "that the revision is actually the parameter and set the revision "
             "to 'main'. In the future, use the new '#' syntax to specify the "
             f"parameter (in this case, this would be {proper_model_id!r}), as this "
-            "will be an error in future versions of EuroEval."
+            "will be an error in future versions of EuroEval.",
+            level=logging.WARNING,
         )
         model_id_components.param = model_id_components.revision
         model_id_components.revision = "main"
@@ -1375,7 +1425,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    @cache
+    @cache_arguments()
    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.
 
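
euroeval/caching_utils.py (+79 lines in the file list) is not shown in this section, so the decorator below is only a hypothetical sketch of how an argument-caching decorator of this shape could work, assuming it keys on the repr of the arguments instead of requiring them to be hashable the way functools.cache does; the real implementation will differ.

# Hypothetical sketch only -- euroeval/caching_utils.py (new in this release) is not
# shown in this section. Assumes the decorator keys on the repr of the arguments, so
# that unhashable arguments (e.g. config objects) can be cached, unlike functools.cache.
import functools
import typing as t


def cache_arguments() -> t.Callable:
    """Decorator factory caching a function's result per distinct argument repr."""

    def decorator(func: t.Callable) -> t.Callable:
        results: dict[str, t.Any] = {}

        @functools.wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> t.Any:
            key = repr((args, kwargs))
            if key not in results:
                results[key] = func(*args, **kwargs)
            return results[key]

        return wrapper

    return decorator
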
@@ -1583,7 +1633,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1601,11 +1652,8 @@ def try_download_ollama_model(model_id: str) -> bool:
         ) from e
 
     # Download the model
-    with tqdm(
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
     ) as pbar:
         for status in response:
             if status.total is not None: