EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (48)
  1. euroeval/benchmark_config_factory.py +6 -1
  2. euroeval/benchmark_modules/base.py +2 -0
  3. euroeval/benchmark_modules/fresh.py +7 -1
  4. euroeval/benchmark_modules/hf.py +26 -21
  5. euroeval/benchmark_modules/litellm.py +258 -131
  6. euroeval/benchmark_modules/vllm.py +79 -40
  7. euroeval/benchmarker.py +11 -2
  8. euroeval/cli.py +14 -1
  9. euroeval/constants.py +1 -1
  10. euroeval/data_models.py +77 -6
  11. euroeval/dataset_configs/__init__.py +1 -0
  12. euroeval/dataset_configs/danish.py +14 -0
  13. euroeval/dataset_configs/dutch.py +14 -0
  14. euroeval/dataset_configs/english.py +22 -0
  15. euroeval/dataset_configs/estonian.py +15 -7
  16. euroeval/dataset_configs/finnish.py +14 -0
  17. euroeval/dataset_configs/french.py +14 -0
  18. euroeval/dataset_configs/german.py +23 -0
  19. euroeval/dataset_configs/italian.py +14 -0
  20. euroeval/dataset_configs/latvian.py +14 -0
  21. euroeval/dataset_configs/norwegian.py +14 -0
  22. euroeval/dataset_configs/polish.py +126 -0
  23. euroeval/dataset_configs/portuguese.py +14 -0
  24. euroeval/dataset_configs/spanish.py +14 -0
  25. euroeval/dataset_configs/swedish.py +25 -0
  26. euroeval/enums.py +12 -0
  27. euroeval/generation.py +17 -8
  28. euroeval/generation_utils.py +65 -11
  29. euroeval/metrics/pipeline.py +1 -1
  30. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  31. euroeval/prompt_templates/multiple_choice.py +27 -1
  32. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  33. euroeval/prompt_templates/reading_comprehension.py +11 -0
  34. euroeval/prompt_templates/sentiment_classification.py +15 -0
  35. euroeval/prompt_templates/summarization.py +27 -1
  36. euroeval/scores.py +5 -0
  37. euroeval/task_group_utils/question_answering.py +29 -29
  38. euroeval/task_group_utils/sequence_classification.py +11 -34
  39. euroeval/task_group_utils/token_classification.py +3 -3
  40. euroeval/tasks.py +4 -4
  41. euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
  42. euroeval/utils.py +36 -3
  43. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
  44. euroeval-16.1.1.dist-info/RECORD +70 -0
  45. euroeval-16.0.1.dist-info/RECORD +0 -69
  46. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
  47. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
  48. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -8,7 +8,7 @@ import torch
8
8
 
9
9
  from .data_models import BenchmarkConfig
10
10
  from .dataset_configs import get_all_dataset_configs
11
- from .enums import Device
11
+ from .enums import Device, GenerativeType
12
12
  from .exceptions import InvalidBenchmark
13
13
  from .languages import get_all_languages
14
14
  from .tasks import SPEED, get_all_tasks
@@ -43,6 +43,7 @@ def build_benchmark_config(
43
43
  api_base: str | None,
44
44
  api_version: str | None,
45
45
  gpu_memory_utilization: float,
46
+ generative_type: GenerativeType | None,
46
47
  debug: bool,
47
48
  run_with_cli: bool,
48
49
  requires_safetensors: bool,
@@ -107,6 +108,9 @@ def build_benchmark_config(
107
108
  faster evaluation, but at the risk of running out of GPU memory. Only reduce
108
109
  this if you are running out of GPU memory. Only relevant if the model is
109
110
  generative.
111
+ generative_type:
112
+ The type of generative model. Only relevant if the model is generative. If
113
+ not specified, the type will be inferred automatically.
110
114
  debug:
111
115
  Whether to run the benchmark in debug mode.
112
116
  run_with_cli:
@@ -157,6 +161,7 @@ def build_benchmark_config(
157
161
  api_base=api_base,
158
162
  api_version=api_version,
159
163
  gpu_memory_utilization=gpu_memory_utilization,
164
+ generative_type=generative_type,
160
165
  debug=debug,
161
166
  run_with_cli=run_with_cli,
162
167
  requires_safetensors=requires_safetensors,
@@ -2,6 +2,7 @@
2
2
 
3
3
  import collections.abc as c
4
4
  import logging
5
+ import re
5
6
  import sys
6
7
  import typing as t
7
8
  from abc import ABC, abstractmethod
@@ -55,6 +56,7 @@ class BenchmarkModule(ABC):
55
56
  fresh_model: bool
56
57
  batching_preference: "BatchingPreference"
57
58
  high_priority: bool
59
+ allowed_params: dict[re.Pattern, list[str]] = {re.compile(r".*"): []}
58
60
 
59
61
  def __init__(
60
62
  self,
@@ -25,6 +25,7 @@ from ..exceptions import (
25
25
  NeedsEnvironmentVariable,
26
26
  NeedsExtraInstalled,
27
27
  )
28
+ from ..generation_utils import raise_if_wrong_params
28
29
  from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
29
30
  from .hf import (
30
31
  HuggingFaceEncoderModel,
@@ -64,6 +65,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
64
65
  log_metadata:
65
66
  Whether to log metadata about the model and the benchmark.
66
67
  """
68
+ raise_if_wrong_params(
69
+ model_config=model_config, allowed_params=self.allowed_params
70
+ )
71
+
67
72
  # This is already set when calling `super.__init__`, but we need it to get a
68
73
  # value from `self.model_max_length`, so we set it here as well.
69
74
  self.model_config = model_config
@@ -183,9 +188,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
183
188
  """
184
189
  return ModelConfig(
185
190
  model_id=model_id,
191
+ revision="main",
192
+ param=None,
186
193
  task="fill-mask",
187
194
  languages=list(),
188
- revision="main",
189
195
  merge=False,
190
196
  inference_backend=InferenceBackend.TRANSFORMERS,
191
197
  model_type=ModelType.ENCODER,
@@ -14,6 +14,7 @@ from huggingface_hub import HfApi
14
14
  from huggingface_hub import whoami as hf_whoami
15
15
  from huggingface_hub.errors import (
16
16
  GatedRepoError,
17
+ HfHubHTTPError,
17
18
  HFValidationError,
18
19
  LocalTokenNotFoundError,
19
20
  RepositoryNotFoundError,
@@ -56,13 +57,14 @@ from ..exceptions import (
56
57
  NeedsEnvironmentVariable,
57
58
  NeedsExtraInstalled,
58
59
  )
60
+ from ..generation_utils import raise_if_wrong_params
59
61
  from ..languages import get_all_languages
60
62
  from ..task_group_utils import (
61
63
  multiple_choice_classification,
62
64
  question_answering,
63
65
  token_classification,
64
66
  )
65
- from ..tokenization_utils import get_bos_token, get_eos_token
67
+ from ..tokenisation_utils import get_bos_token, get_eos_token
66
68
  from ..utils import (
67
69
  block_terminal_output,
68
70
  create_model_cache_dir,
@@ -70,6 +72,7 @@ from ..utils import (
70
72
  get_hf_token,
71
73
  internet_connection_available,
72
74
  log_once,
75
+ split_model_id,
73
76
  )
74
77
  from .base import BenchmarkModule
75
78
 
@@ -110,6 +113,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
110
113
  log_metadata:
111
114
  Whether to log the model metadata.
112
115
  """
116
+ raise_if_wrong_params(
117
+ model_config=model_config, allowed_params=self.allowed_params
118
+ )
119
+
113
120
  model, tokeniser = load_model_and_tokeniser(
114
121
  model_config=model_config,
115
122
  dataset_config=dataset_config,
@@ -247,15 +254,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
247
254
  max_length for max_length in all_max_lengths if max_length >= 128
248
255
  ]
249
256
 
250
- # We remove the upper cap of maximum context length for the model, as it is
251
- # highly unlikely that this is the model's actual maximum context length - we
252
- # would rather not report a value than report an incorrect one.
253
- all_max_lengths = [
254
- max_length
255
- for max_length in all_max_lengths
256
- if max_length != MAX_CONTEXT_LENGTH
257
- ]
258
-
259
257
  if len(list(all_max_lengths)) > 0:
260
258
  model_max_length = min(list(all_max_lengths))
261
259
  else:
@@ -483,11 +481,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
483
481
  Whether the model exists, or an error describing why we cannot check
484
482
  whether the model exists.
485
483
  """
486
- model_id, revision = (
487
- model_id.split("@") if "@" in model_id else (model_id, "main")
488
- )
484
+ model_id_components = split_model_id(model_id=model_id)
489
485
  model_info = get_model_repo_info(
490
- model_id=model_id, revision=revision, benchmark_config=benchmark_config
486
+ model_id=model_id_components.model_id,
487
+ revision=model_id_components.revision,
488
+ benchmark_config=benchmark_config,
491
489
  )
492
490
  return (
493
491
  model_info is not None
@@ -509,11 +507,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
509
507
  Returns:
510
508
  The model configuration.
511
509
  """
512
- model_id, revision = (
513
- model_id.split("@") if "@" in model_id else (model_id, "main")
514
- )
510
+ model_id_components = split_model_id(model_id=model_id)
515
511
  model_info = get_model_repo_info(
516
- model_id=model_id, revision=revision, benchmark_config=benchmark_config
512
+ model_id=model_id_components.model_id,
513
+ revision=model_id_components.revision,
514
+ benchmark_config=benchmark_config,
517
515
  )
518
516
  if model_info is None:
519
517
  raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -522,8 +520,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):
522
520
  language_codes = list(language_mapping.keys())
523
521
 
524
522
  model_config = ModelConfig(
525
- model_id=model_id,
526
- revision=revision,
523
+ model_id=model_id_components.model_id,
524
+ revision=model_id_components.revision,
525
+ param=model_id_components.param,
527
526
  task=model_info.pipeline_tag,
528
527
  languages=[
529
528
  language_mapping[tag]
@@ -710,7 +709,6 @@ def get_model_repo_info(
710
709
  """
711
710
  token = get_hf_token(api_key=benchmark_config.api_key)
712
711
  hf_api = HfApi(token=token)
713
- model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "main")
714
712
 
715
713
  # Get information on the model.
716
714
  # The first case is when the model is a local model, in which case we create a dummy
@@ -753,6 +751,13 @@ def get_model_repo_info(
753
751
  return None
754
752
  except (RepositoryNotFoundError, HFValidationError):
755
753
  return None
754
+ except HfHubHTTPError as e:
755
+ if "unauthorized" in str(e).lower():
756
+ raise InvalidModel(
757
+ "It seems like your specified Hugging Face API key is invalid. "
758
+ "Please double-check your API key."
759
+ ) from e
760
+ raise InvalidModel(str(e)) from e
756
761
  except (OSError, RequestException) as e:
757
762
  if internet_connection_available():
758
763
  errors.append(e)