EuroEval 16.3.0-py3-none-any.whl → 16.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (64)
  1. euroeval/__init__.py +3 -2
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +99 -62
  6. euroeval/benchmark_modules/litellm.py +101 -41
  7. euroeval/benchmark_modules/vllm.py +91 -83
  8. euroeval/benchmarker.py +84 -78
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/constants.py +6 -0
  12. euroeval/data_loading.py +14 -11
  13. euroeval/data_models.py +12 -4
  14. euroeval/dataset_configs/__init__.py +2 -0
  15. euroeval/dataset_configs/czech.py +79 -0
  16. euroeval/dataset_configs/danish.py +10 -11
  17. euroeval/dataset_configs/dutch.py +0 -1
  18. euroeval/dataset_configs/english.py +0 -1
  19. euroeval/dataset_configs/estonian.py +11 -1
  20. euroeval/dataset_configs/finnish.py +0 -1
  21. euroeval/dataset_configs/french.py +0 -1
  22. euroeval/dataset_configs/german.py +0 -1
  23. euroeval/dataset_configs/italian.py +0 -1
  24. euroeval/dataset_configs/latvian.py +0 -1
  25. euroeval/dataset_configs/lithuanian.py +9 -3
  26. euroeval/dataset_configs/norwegian.py +0 -1
  27. euroeval/dataset_configs/polish.py +0 -1
  28. euroeval/dataset_configs/portuguese.py +0 -1
  29. euroeval/dataset_configs/slovak.py +60 -0
  30. euroeval/dataset_configs/spanish.py +0 -1
  31. euroeval/dataset_configs/swedish.py +10 -12
  32. euroeval/finetuning.py +21 -15
  33. euroeval/generation.py +10 -10
  34. euroeval/generation_utils.py +2 -3
  35. euroeval/logging_utils.py +250 -0
  36. euroeval/metrics/base.py +0 -3
  37. euroeval/metrics/huggingface.py +9 -5
  38. euroeval/metrics/llm_as_a_judge.py +5 -3
  39. euroeval/metrics/pipeline.py +17 -9
  40. euroeval/metrics/speed.py +0 -3
  41. euroeval/model_cache.py +11 -14
  42. euroeval/model_config.py +4 -5
  43. euroeval/model_loading.py +3 -0
  44. euroeval/prompt_templates/linguistic_acceptability.py +21 -3
  45. euroeval/prompt_templates/multiple_choice.py +25 -1
  46. euroeval/prompt_templates/named_entity_recognition.py +51 -11
  47. euroeval/prompt_templates/reading_comprehension.py +31 -3
  48. euroeval/prompt_templates/sentiment_classification.py +23 -1
  49. euroeval/prompt_templates/summarization.py +26 -6
  50. euroeval/scores.py +7 -7
  51. euroeval/speed_benchmark.py +3 -5
  52. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  53. euroeval/task_group_utils/question_answering.py +0 -3
  54. euroeval/task_group_utils/sequence_classification.py +43 -31
  55. euroeval/task_group_utils/text_to_text.py +17 -8
  56. euroeval/task_group_utils/token_classification.py +10 -9
  57. euroeval/tokenisation_utils.py +14 -12
  58. euroeval/utils.py +29 -146
  59. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
  60. euroeval-16.4.0.dist-info/RECORD +75 -0
  61. euroeval-16.3.0.dist-info/RECORD +0 -71
  62. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  63. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  64. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import cache, cached_property, partial
+from functools import cached_property, partial
 from time import sleep
 
 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm
 
+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer
 
-logger = logging.getLogger("euroeval")
-
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
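
The newly added Anthropic entry covers family-first model IDs (e.g. claude-opus-4-1-YYYYMMDD) that the existing version-first pattern does not match. A standalone sketch of the difference; the example model IDs are illustrative, and the lookup code that applies this mapping is not part of this excerpt:

import re

OLD_PATTERN = r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}"
NEW_PATTERN = r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}"

# Version-first naming, already matched by the existing pattern
assert re.fullmatch(OLD_PATTERN, "claude-3-5-sonnet-20241022")

# Family-first naming, only matched by the newly added pattern
assert re.fullmatch(NEW_PATTERN, "anthropic/claude-opus-4-1-20250805")
assert re.fullmatch(OLD_PATTERN, "anthropic/claude-opus-4-1-20250805") is None
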
@@ -367,10 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-            logger.debug(
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
                 f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
-                f"{failures[0][1]}."
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
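
From this hunk onwards, direct logger.debug/warning/error calls are swapped for a log(message, level=...) helper imported from the new euroeval/logging_utils.py module, whose implementation is not included in this excerpt. A minimal sketch of what such a helper could look like, stated as an assumption rather than the actual implementation:

# Hypothetical sketch only; the real euroeval.logging_utils.log is not shown in this diff.
import logging

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Log `message` on the shared EuroEval logger at the given `level`."""
    logger.log(level, message)


log("Attempt 1/3: retrying 2 failed message(s).", level=logging.DEBUG)
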
@@ -422,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -438,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +460,14 @@ class LiteLLMModel(BenchmarkModule):
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
+            "got an unexpected keyword argument 'response_format'",
+            "the model returned empty outputs",
         ]
 
-        if any(msg.lower() in error_msg for msg in stop_messages):
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-            logger.debug(
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if isinstance(error, RateLimitError):
-            raise InvalidModel(
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "Skipping."
-            ) from error
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
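
The new BadRequestError branch above extracts the suggested wait time from the provider's error message and sleeps for exactly that long before retrying. A standalone illustration of the parsing; the error message is made up, only the regular expression comes from the diff:

import re

error_msg = "rate limited, please retry in 7.5 seconds"  # made-up example message

retry_match = re.search(
    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
    string=error_msg,
)
if retry_match is not None:
    retry_seconds = float(retry_match.group(1))  # 7.5
    print(f"Would wait {retry_seconds:.1f} seconds before retrying")
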
@@ -711,7 +749,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(*requests, leave=False)
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )
 
         # Separate the successful responses from the failed ones
         successes = [
@@ -731,7 +781,9 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-                logger.debug(f"RuntimeError during request.close(): {e}")
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )
 
         return successes, failures
 
@@ -756,10 +808,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-                logger.warning(
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue
 
@@ -847,11 +900,12 @@ class LiteLLMModel(BenchmarkModule):
             scores.append(logprobs_list)
 
         if not sequences:
-            logger.warning(
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
 
@@ -1130,6 +1184,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1205,17 +1260,19 @@ class LiteLLMModel(BenchmarkModule):
                 ServiceUnavailableError,
                 InternalServerError,
             ) as e:
-                logger.debug(
+                log(
                     f"Service temporarily unavailable. The error message was: {e}. "
-                    "Retrying in 10 seconds..."
+                    "Retrying in 10 seconds...",
+                    level=logging.DEBUG,
                 )
                 sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
-                logger.warning(
+                log(
                     f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                    "seconds..."
+                    "seconds...",
+                    level=logging.WARNING,
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -1228,21 +1285,25 @@ class LiteLLMModel(BenchmarkModule):
                 case 0:
                     pass
                 case 1:
-                    logger.warning(
+                    log(
                         f"Could not find the model ID {model_id!r}. Did you mean "
-                        f"{candidate_models[0]!r}?"
+                        f"{candidate_models[0]!r}?",
+                        level=logging.WARNING,
                     )
                 case _:
                     candidate_models_str = "', '".join(candidate_models)
-                    logger.warning(
+                    log(
                         f"Could not find the model ID {model_id!r}. Did you mean "
-                        f"any of the following model IDs: '{candidate_models_str}'?"
+                        "any of the following model IDs: "
+                        f"'{candidate_models_str}'?",
+                        level=logging.WARNING,
                     )
             return False
         else:
-            logger.error(
+            log(
                 f"Failed to check if model {model_id!r} exists after {num_attempts} "
-                "attempts. Assuming it does not exist."
+                "attempts. Assuming it does not exist.",
+                level=logging.ERROR,
             )
             return False
 
@@ -1275,7 +1336,8 @@ class LiteLLMModel(BenchmarkModule):
                 "that the revision is actually the parameter and set the revision "
                 "to 'main'. In the future, use the new '#' syntax to specify the "
                 f"parameter (in this case, this would be {proper_model_id!r}), as this "
-                "will be an error in future versions of EuroEval."
+                "will be an error in future versions of EuroEval.",
+                level=logging.WARNING,
             )
             model_id_components.param = model_id_components.revision
             model_id_components.revision = "main"
@@ -1363,7 +1425,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    @cache
+    @cache_arguments()
    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.
 
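
functools.cache requires every argument (including self and the DatasetConfig) to be hashable, which is presumably why get_generation_kwargs now uses the cache_arguments decorator from the new euroeval/caching_utils.py module instead. That module is not part of this excerpt; the following is only a rough sketch of how such a decorator might be written:

# Hypothetical sketch; the real euroeval.caching_utils.cache_arguments is not shown here.
import functools
import typing as t


def cache_arguments() -> t.Callable[[t.Callable], t.Callable]:
    """Cache return values keyed on the repr of the call arguments."""

    def decorator(func: t.Callable) -> t.Callable:
        results: dict[str, t.Any] = {}

        @functools.wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> t.Any:
            key = repr((args, sorted(kwargs.items())))
            if key not in results:
                results[key] = func(*args, **kwargs)
            return results[key]

        return wrapper

    return decorator
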
@@ -1571,7 +1633,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1589,11 +1652,8 @@ def try_download_ollama_model(model_id: str) -> bool:
         ) from e
 
     # Download the model
-    with tqdm(
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
     ) as pbar:
         for status in response:
             if status.total is not None: