EuroEval 15.13.0-py3-none-any.whl → 15.15.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

euroeval/__init__.py CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
  os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


+ # Allow long max model length in vLLM. This happens when vLLM registers that the model
+ # has a shorter context length than the value we are inserting. But since we do a
+ # thorough check of the model's config before setting the context length, we trust our
+ # own checks and ignore the internal vLLM check.
+ os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
  # Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
  # The error comes from the `aiohttp` package, and this environment variable forces the
  # use of `httpx` instead.
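As background, VLLM_ALLOW_LONG_MAX_MODEL_LEN makes vLLM accept a max_model_len larger than the one it derives from the model's own config instead of raising an error, so EuroEval's own context-length checks take precedence. A minimal sketch of the effect, with a placeholder model ID and length (not values used by EuroEval):

    import os

    # Must be set before the vLLM engine is created; EuroEval sets it at import time.
    os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

    from vllm import LLM  # noqa: E402

    # Without the flag above, vLLM rejects a max_model_len larger than the value it
    # derives from the model config; with it, vLLM only logs a warning.
    llm = LLM(model="some-org/some-model", max_model_len=32_768)  # placeholder values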
euroeval/benchmark_modules/litellm.py CHANGED
@@ -31,6 +31,7 @@ from litellm.exceptions import (
  from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.router import Router
  from litellm.types.utils import ChoiceLogprobs
+ from litellm.utils import supports_reasoning, supports_response_schema
  from pydantic import conlist, create_model
  from requests.exceptions import RequestException
  from tqdm.asyncio import tqdm as tqdm_async
@@ -234,6 +235,8 @@ class LiteLLMModel(BenchmarkModule):
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
  ):
  type_ = GenerativeType.REASONING
+ elif supports_reasoning(model=self.model_config.model_id):
+ type_ = GenerativeType.REASONING
  else:
  type_ = GenerativeType.INSTRUCTION_TUNED

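For reference, supports_reasoning and supports_response_schema are LiteLLM capability lookups keyed on the model ID. A simplified sketch of the fallback order introduced above, with a stand-in model list and plain strings in place of EuroEval's own constants and enums:

    import re

    from litellm.utils import supports_reasoning

    REASONING_MODELS = ["o1", "o3", "deepseek-r1"]  # stand-in for EuroEval's own list


    def infer_generative_type(model_id: str) -> str:
        """Classify a model as reasoning or instruction-tuned."""
        # Models matching the explicit list take precedence.
        if re.search(pattern="|".join(REASONING_MODELS), string=model_id):
            return "reasoning"
        # Otherwise fall back to LiteLLM's capability metadata for the model.
        if supports_reasoning(model=model_id):
            return "reasoning"
        return "instruction-tuned"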
@@ -314,9 +317,7 @@ class LiteLLMModel(BenchmarkModule):
  "enable it.",
  level=logging.DEBUG,
  )
- elif litellm.utils.supports_response_schema(
- model=self.model_config.model_id
- ):
+ elif supports_response_schema(model=self.model_config.model_id):
  ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
  keys_and_their_types: dict[str, t.Any] = {
  tag_name: (conlist(str, max_length=5), ...)
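As an illustration of the response schema that is sent when supports_response_schema reports support: the NER answer format is a pydantic model built dynamically from the dataset's tag names (the tag names below are example values):

    import typing as t

    from pydantic import conlist, create_model

    ner_tag_names = ["person", "location", "organisation", "miscellaneous"]  # examples
    keys_and_their_types: dict[str, t.Any] = {
        tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

    # The JSON schema of this model is what gets passed to the provider as the
    # required response format.
    print(AnswerFormat.model_json_schema())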
@@ -361,7 +362,7 @@ class LiteLLMModel(BenchmarkModule):
  level=logging.DEBUG,
  )
  elif self.model_config.revision == "no-thinking":
- generation_kwargs["thinking"] = dict(type="disabled", budget_tokens=0)
+ generation_kwargs["thinking"] = dict(budget_tokens=0)
  log_once(
  f"Disabling thinking mode for model {self.model_config.model_id!r}",
  level=logging.DEBUG,
@@ -377,6 +378,19 @@ class LiteLLMModel(BenchmarkModule):
  # Drop generation kwargs that are not supported by the model
  litellm.drop_params = True

+ # First attempt is a test run with a single conversation to handle errors
+ # quickly
+ test_conversation = conversations[0]
+ _, failures = safe_run(
+ self._generate_async(
+ model_id=self.model_config.model_id,
+ conversations=[test_conversation],
+ **generation_kwargs,
+ )
+ )
+ for _, error in failures:
+ self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
  all_responses: dict[int, "ModelResponse"] = {}
  conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
  enumerate(conversations)
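The new block is a fail-fast probe: one conversation is generated first so that configuration problems (an invalid thinking budget, an unsupported parameter, and so on) surface and can be handled before the full batch is launched. A simplified sketch of the pattern, with hypothetical helper names standing in for EuroEval's internals:

    import asyncio
    from typing import Any


    async def generate_one(conversation: list[dict[str, Any]], **kwargs: Any) -> str:
        """Hypothetical stand-in for a single LiteLLM completion call."""
        ...


    async def generate_all(
        conversations: list[list[dict[str, Any]]], **kwargs: Any
    ) -> list[str]:
        # Probe with the first conversation; an error here can be handled (for
        # example by adjusting kwargs) before time is spent on the whole batch.
        await generate_one(conversations[0], **kwargs)
        # Only then fan out over the full batch.
        return await asyncio.gather(*(generate_one(c, **kwargs) for c in conversations))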
@@ -477,6 +491,7 @@ class LiteLLMModel(BenchmarkModule):
  r"the thinking budget [0-9]+ is invalid. please choose a value between "
  r"[0-9]+ and ([0-9]+)\."
  )
+ requires_thinking_disabled_messages = ["thinking.type: Field required"]

  if any(msg.lower() in error_msg for msg in stop_messages):
  log_once(
@@ -557,6 +572,18 @@ class LiteLLMModel(BenchmarkModule):
  type="enabled", budget_tokens=thinking_budget - 1
  )
  return
+ elif (
+ any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
+ and self.generative_type != GenerativeType.REASONING
+ ):
+ log_once(
+ f"The model {model_id!r} requires the `thinking.type` field to be "
+ f"set to `disabled` rather than just setting `budget_tokens` to 0. "
+ "Setting `thinking.type` to `disabled`.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["thinking"] = dict(type="disabled")
+ return
  elif isinstance(
  error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
  ):
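For reference, these are the two shapes of the Anthropic-style thinking parameter involved in this fallback, as passed through LiteLLM's generation kwargs (whether a given provider accepts each shape is provider-specific):

    # Initial attempt for the "no-thinking" revision: only a zero budget.
    generation_kwargs = {"thinking": {"budget_tokens": 0}}

    # Fallback when the provider responds with "thinking.type: Field required".
    generation_kwargs = {"thinking": {"type": "disabled"}}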
euroeval/benchmark_modules/vllm.py CHANGED
@@ -77,10 +77,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
  destroy_model_parallel,
  )
  from vllm.lora.request import LoRARequest
-
- if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
- from outlines.models.vllm import adapt_tokenizer
- from outlines.processors.structured import JSONLogitsProcessor
+ from vllm.sampling_params import GuidedDecodingParams

  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
  import ray
@@ -327,7 +324,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

- logits_processor = None
+ structured_generation_schema = None
  if self.dataset_config.task in TASKS_USING_JSON:
  if self.generative_type == GenerativeType.REASONING:
  log_once(
@@ -342,15 +339,13 @@ class VLLMModel(HuggingFaceEncoderModel):
  tag_name: (conlist(str, max_length=5), ...)
  for tag_name in ner_tag_names
  }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
+ answer_format_class = create_model(
+ "AnswerFormat", **keys_and_their_types
  )
+ structured_generation_schema = answer_format_class.model_json_schema()
  log_once(
  "Using structured generation with the JSON schema "
- f"{pydantic_class.model_json_schema()}",
+ f"{structured_generation_schema}",
  level=logging.DEBUG,
  )

@@ -374,7 +369,11 @@ class VLLMModel(HuggingFaceEncoderModel):
  logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
  temperature=0.0,
  stop=[stop_token for stop_token in stop_tokens if stop_token],
- logits_processors=[logits_processor] if logits_processor else None,
+ guided_decoding=(
+ GuidedDecodingParams(json=structured_generation_schema)
+ if structured_generation_schema
+ else None
+ ),
  )

  # If any of the prompts are empty then we need to replace them with a BOS token
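The structured-generation path thus moves from outlines' JSONLogitsProcessor to vLLM's built-in guided decoding, which in recent vLLM releases is typically backed by xgrammar. A minimal sketch of the new API with a toy schema and a placeholder model ID (EuroEval passes the AnswerFormat schema built above):

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    # Toy JSON schema; EuroEval passes the pydantic AnswerFormat schema here.
    schema = {
        "type": "object",
        "properties": {"person": {"type": "array", "items": {"type": "string"}}},
        "required": ["person"],
    }

    sampling_params = SamplingParams(
        temperature=0.0,
        guided_decoding=GuidedDecodingParams(json=schema),
    )

    llm = LLM(model="some-org/some-model")  # placeholder model ID
    outputs = llm.generate(["List the people mentioned: ..."], sampling_params)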
euroeval/data_models.py CHANGED
@@ -259,7 +259,7 @@ class BenchmarkResult(pydantic.BaseModel):
  transformers_version: str | None = get_package_version("transformers")
  torch_version: str | None = get_package_version("torch")
  vllm_version: str | None = get_package_version("vllm")
- outlines_version: str | None = get_package_version("outlines")
+ xgrammar_version: str | None = get_package_version("xgrammar")

  @classmethod
  def from_dict(cls, config: dict) -> "BenchmarkResult":
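The recorded field switches from outlines_version to xgrammar_version, matching the structured-generation backend change above. A hedged sketch of what such a version lookup typically looks like (the real get_package_version helper may differ):

    import importlib.metadata


    def get_package_version(package_name: str) -> str | None:
        """Return the installed version of a package, or None if not installed."""
        try:
            return importlib.metadata.version(package_name)
        except importlib.metadata.PackageNotFoundError:
            return None


    xgrammar_version = get_package_version("xgrammar")  # None if xgrammar is absent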
euroeval/dataset_configs/danish.py CHANGED
@@ -128,3 +128,13 @@ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
  languages=[DA],
  unofficial=True,
  )
+
+ GOLDENSWAG_DA_CONFIG = DatasetConfig(
+ name="goldenswag-da",
+ pretty_name="the truncated version of the Danish common-sense reasoning "
+ "dataset GoldenSwag-da, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-da-mini",
+ task=COMMON_SENSE,
+ languages=[DA],
+ unofficial=True,
+ )
euroeval/dataset_configs/dutch.py CHANGED
@@ -120,3 +120,13 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
  languages=[NL],
  unofficial=True,
  )
+
+ GOLDENSWAG_NL_CONFIG = DatasetConfig(
+ name="goldenswag-nl",
+ pretty_name="the truncated version of the Dutch common-sense reasoning "
+ "dataset GoldenSwag-nl, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-nl-mini",
+ task=COMMON_SENSE,
+ languages=[NL],
+ unofficial=True,
+ )
euroeval/dataset_configs/finnish.py CHANGED
@@ -78,3 +78,13 @@ MULTI_WIKI_QA_FI_CONFIG = DatasetConfig(
  languages=[FI],
  unofficial=True,
  )
+
+ GOLDENSWAG_FI_CONFIG = DatasetConfig(
+ name="goldenswag-fi",
+ pretty_name="the truncated version of the Finnish common-sense reasoning "
+ "dataset GoldenSwag-fi, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-fi-mini",
+ task=COMMON_SENSE,
+ languages=[FI],
+ unofficial=True,
+ )
euroeval/dataset_configs/french.py CHANGED
@@ -91,3 +91,13 @@ MULTI_WIKI_QA_FR_CONFIG = DatasetConfig(
  languages=[FR],
  unofficial=True,
  )
+
+ GOLDENSWAG_FR_CONFIG = DatasetConfig(
+ name="goldenswag-fr",
+ pretty_name="the truncated version of the French common-sense reasoning "
+ "dataset GoldenSwag-fr, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-fr-mini",
+ task=COMMON_SENSE,
+ languages=[FR],
+ unofficial=True,
+ )
euroeval/dataset_configs/german.py CHANGED
@@ -99,3 +99,13 @@ MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
  languages=[DE],
  unofficial=True,
  )
+
+ GOLDENSWAG_DE_CONFIG = DatasetConfig(
+ name="goldenswag-de",
+ pretty_name="the truncated version of the German common-sense reasoning "
+ "dataset GoldenSwag-de, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-de-mini",
+ task=COMMON_SENSE,
+ languages=[DE],
+ unofficial=True,
+ )
euroeval/dataset_configs/italian.py CHANGED
@@ -99,3 +99,13 @@ MULTI_WIKI_QA_IT_CONFIG = DatasetConfig(
  languages=[IT],
  unofficial=True,
  )
+
+ GOLDENSWAG_IT_CONFIG = DatasetConfig(
+ name="goldenswag-it",
+ pretty_name="the truncated version of the Italian common-sense reasoning "
+ "dataset GoldenSwag-it, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-it-mini",
+ task=COMMON_SENSE,
+ languages=[IT],
+ unofficial=True,
+ )
euroeval/dataset_configs/spanish.py CHANGED
@@ -97,3 +97,13 @@ MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
  languages=[ES],
  unofficial=True,
  )
+
+ GOLDENSWAG_ES_CONFIG = DatasetConfig(
+ name="goldenswag-es",
+ pretty_name="the truncated version of the Spanish common-sense reasoning "
+ "dataset GoldenSwag-es, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-es-mini",
+ task=COMMON_SENSE,
+ languages=[ES],
+ unofficial=True,
+ )
euroeval/dataset_configs/swedish.py CHANGED
@@ -108,3 +108,13 @@ MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
  languages=[SV],
  unofficial=True,
  )
+
+ GOLDENSWAG_SV_CONFIG = DatasetConfig(
+ name="goldenswag-sv",
+ pretty_name="the truncated version of the Swedish common-sense reasoning "
+ "dataset GoldenSwag-sv, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-sv-mini",
+ task=COMMON_SENSE,
+ languages=[SV],
+ unofficial=True,
+ )
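Each of the eight new GoldenSwag configs is marked unofficial=True, so it is presumably only evaluated when requested explicitly. A hedged sketch of running one of them through the Python API (argument names follow the EuroEval documentation; the model ID is a placeholder):

    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    benchmarker.benchmark(model="some-org/some-model", dataset="goldenswag-da")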
euroeval/generation.py CHANGED
@@ -235,7 +235,7 @@ def generate_single_iteration(
  )

  itr_scores: dict[str, float] = model.compute_metrics(
- model_outputs_and_labels=(all_preds, ground_truth)
+ model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
  )

  return itr_scores
euroeval/human_evaluation.py CHANGED
@@ -620,7 +620,8 @@ class HumanEvaluator:
  )
  ground_truth = self.active_dataset["label"]
  itr_scores: dict[str, float] = self.compute_metrics(
- model_outputs_and_labels=(all_preds, ground_truth)
+ model_outputs_and_labels=(all_preds, ground_truth),
+ dataset=self.active_dataset,
  )

  # We reverse the order, as the Info messages are printed in reverse order
euroeval/metrics.py CHANGED
@@ -14,6 +14,7 @@ from .exceptions import InvalidBenchmark
  from .utils import HiddenPrints

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from evaluate import EvaluationModule

  logger = logging.getLogger(__name__)
@@ -49,7 +50,9 @@ class Metric(abc.ABC):
  )

  @abc.abstractmethod
- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+ ) -> float | None:
  """Calculate the metric score.

  Args:
@@ -57,6 +60,9 @@ class Metric(abc.ABC):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -125,7 +131,9 @@ class HuggingFaceMetric(Metric):
  )
  self.metric: "EvaluationModule | None" = None

- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+ ) -> float | None:
  """Calculate the metric score.

  Args:
@@ -133,6 +141,9 @@ class HuggingFaceMetric(Metric):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -213,7 +224,9 @@ class LLMAsAJudgeMetric(Metric):
  self.condition_formatting_fn = condition_formatting_fn
  self.system_prompt = system_prompt

- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+ ) -> float | None:
  """Calculate the metric score using the judge model.

  Args:
@@ -221,6 +234,9 @@ class LLMAsAJudgeMetric(Metric):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -343,7 +359,7 @@ class SpeedMetric(Metric):
  postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
  )

- def __call__(self, _: t.Sequence, __: t.Sequence) -> float | None:
+ def __call__(self, _: t.Sequence, __: t.Sequence, ___: "Dataset") -> float | None:
  """Not used with the speed metric, but required for consistency."""
  raise NotImplementedError

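Every metric's __call__ now receives the evaluation dataset as a third argument, so metrics that need per-example metadata can reach it while existing metrics simply ignore it. A hedged sketch of a callable following the new signature (not EuroEval's actual base class, which adds naming and postprocessing machinery):

    import typing as t

    from datasets.arrow_dataset import Dataset


    class ExactMatch:
        """Toy metric illustrating the updated call signature."""

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
        ) -> float | None:
            # The dataset is available for metrics that need extra metadata; a plain
            # exact-match score does not use it.
            if len(predictions) == 0:
                return None
            correct = sum(p == r for p, r in zip(predictions, references))
            return correct / len(predictions)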
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -69,7 +69,7 @@ MULTIPLE_CHOICE_TEMPLATES = {
  IT: PromptConfig(
  default_prompt_prefix="Le seguenti sono domande a scelta multipla "
  "(con relative risposte).",
- default_prompt_template="Domanda: {text}\nRéponse: {label}",
+ default_prompt_template="Domanda: {text}\nRisposta: {label}",
  default_instruction_prompt="Domanda: {text}\n\nRispondete alla domanda "
  "precedente con {labels_str}, e nient'altro.",
  default_prompt_label_mapping="auto",
euroeval/task_group_utils/question_answering.py CHANGED
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -158,6 +159,9 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +185,9 @@

  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
- score: float | None = metric(predictions=predictions, references=labels)
+ score: float | None = metric(
+ predictions=predictions, references=labels, dataset=dataset
+ )

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -11,6 +11,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import log_once, raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction

  from ..data_models import DatasetConfig, GenerativeModelOutput
@@ -23,6 +24,7 @@ logger = logging.getLogger("euroeval")
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -32,6 +34,9 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -73,7 +78,9 @@

  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
- score: float | None = metric(predictions=predictions, references=label_ids)
+ score: float | None = metric(
+ predictions=predictions, references=label_ids, dataset=dataset
+ )

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -11,6 +11,7 @@ from ..metrics import HuggingFaceMetric
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction

  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -24,6 +25,7 @@ def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -35,6 +37,9 @@ def compute_metrics(
  The configuration of the dataset.
  benchmark_config:
  The configuration of the benchmark.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -69,7 +74,9 @@

  while True:
  try:
- score: float | None = metric(predictions=predictions, references=labels)
+ score: float | None = metric(
+ predictions=predictions, references=labels, dataset=dataset
+ )
  break
  except Exception as e:
  oom_error = [
euroeval/task_group_utils/token_classification.py CHANGED
@@ -12,6 +12,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.tokenization_utils_base import BatchEncoding
  from transformers.trainer_utils import EvalPrediction
@@ -27,6 +28,7 @@ def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  has_misc_tags: bool,
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -38,6 +40,9 @@ def compute_metrics(
  Whether the dataset has MISC tags.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -136,7 +141,9 @@
  for metric in dataset_config.task.metrics
  if metric.name == "micro_f1"
  )
- micro_f1_score = metric(predictions=predictions, references=list(labels))
+ micro_f1_score = metric(
+ predictions=predictions, references=list(labels), dataset=dataset
+ )

  # Compute the metrics without MISC tags
  # We manually set the F1 metric to be 100% if both the labels and the models
@@ -158,7 +165,7 @@
  if metric.name == "micro_f1_no_misc"
  )
  micro_f1_no_misc_score = metric(
- predictions=predictions_no_misc, references=labels_no_misc
+ predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
  )

  # Raise error if the metrics are invalid
euroeval/types.py CHANGED
@@ -5,6 +5,7 @@ import typing as t
  from transformers.trainer_utils import EvalPrediction

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from numpy.typing import NDArray

  from .data_models import GenerativeModelOutput
@@ -25,12 +26,16 @@ class ComputeMetricsFunction(t.Protocol):
  "NDArray | list[str] | list[list[str]]",
  "NDArray | list[str] | list[list[str]]",
  ],
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics.

  Args:
  model_outputs_and_labels:
  The model outputs and labels.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The computed metrics.
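Correspondingly, any callable bound as a ComputeMetricsFunction now has to accept the dataset argument as well. A hedged sketch of a conforming function, with types simplified relative to the real protocol:

    import typing as t


    def compute_accuracy(
        model_outputs_and_labels: tuple[list[str], list[str]],
        dataset: t.Any,  # datasets.arrow_dataset.Dataset in the real protocol
    ) -> dict[str, float]:
        predictions, labels = model_outputs_and_labels
        if not labels:
            return {"accuracy": 0.0}
        correct = sum(pred == label for pred, label in zip(predictions, labels))
        return {"accuracy": correct / len(labels)}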
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.13.0
+ Version: 15.15.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,11 @@ Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
- Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -1,19 +1,19 @@
- euroeval/__init__.py,sha256=fZyR9R3C3vwGJS3CrCJ6ySr_FDnMu_Aqnz0FdadWEEs,3399
+ euroeval/__init__.py,sha256=ZZoVc6tKWz_h8Pw2n26PV-q_Gd4TM_02O235ZBRUNJw,3756
  euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
  euroeval/benchmarker.py,sha256=SDBzdCa4I8u1XDeN_1mKTFzfaaQbbY_oWcHt3niADxk,48497
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
  euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
  euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
  euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
- euroeval/data_models.py,sha256=gPHyIoN2A5_O-cJgyb6jhn6enH8zsiIBI09W_wdHMQs,22031
+ euroeval/data_models.py,sha256=qSCNq3PV7qo--gibqEvvu4cXkEkhGGAb6UiZW8U_KiU,22031
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
  euroeval/finetuning.py,sha256=BrPZ-6qFY8K-dwfaRwNetVYfYburoQwLQty6pn6iP_s,11340
- euroeval/generation.py,sha256=1fqFEWwM2RzI3uPZem95VFWbN8EfrKZQTrHEP34ihHs,11622
+ euroeval/generation.py,sha256=lmvu__6w3cLxi0zBtXSlyZvV8CJpV3BdajUoIEA9ElA,11639
  euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
- euroeval/human_evaluation.py,sha256=Jtz3K5Lqne48wPZWf4EAd3d-n_wX27nGJHigjhV1D7s,27537
+ euroeval/human_evaluation.py,sha256=FLuTl1DHxCiWB_laVVQHIH86yXvA_ZeNNSrUmyExZXI,27579
  euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
- euroeval/metrics.py,sha256=nxosyoRjlk7TcoAOkjU7zx2TB43b9tA8M1m4V1s5eKU,15516
+ euroeval/metrics.py,sha256=d59VRsjGFA2h2s4J8zRgdGxCu_pA3YhfvKxkK6pN6GI,16185
  euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
  euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
@@ -21,43 +21,43 @@ euroeval/scores.py,sha256=TatSbjia7Zwj71gQFyV_gCHyppMbOgeaZgNCib8G86k,2849
  euroeval/speed_benchmark.py,sha256=6bFGeMmtdl_6owkxNQ3ZKiyQQS58k0NApzlsbDgBW5s,4037
  euroeval/tasks.py,sha256=btxf29M5rUP7JjBl6u9aQlHQAxrJNP4bRbdEQtDnmDA,3376
  euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
- euroeval/types.py,sha256=EIYMNOqqHqibnbNw-fvdst6HwTvq32gtxhr7jL7i-xM,2511
+ euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
  euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
  euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
  euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
- euroeval/benchmark_modules/litellm.py,sha256=_gKBbJsXzo_cHJVaeuQpHRBENEZUGS_vcC-uGIhhmHA,52111
- euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
+ euroeval/benchmark_modules/litellm.py,sha256=qv-k2ntk48OF4ikevQ95k4zLbBkZYOZ2z-GAisA-tFY,53374
+ euroeval/benchmark_modules/vllm.py,sha256=Uq81tgNSkajuawdJ1lH1s9Te9wubYd-CyBbM-B5YZcA,38693
  euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
- euroeval/dataset_configs/danish.py,sha256=-y-n08hTApwTdSVdjRlZYa3gOX92cTGhg8xsuG-Lhww,3691
- euroeval/dataset_configs/dutch.py,sha256=siyFeEKYx2gBpyqQPtOZ0cD8FTsIMUqzRX5xrQfrNXI,3480
+ euroeval/dataset_configs/danish.py,sha256=0lDtvpgszXY1XaPjTU8yA3oNCU8W2OllvrBWgn6pkhk,4027
+ euroeval/dataset_configs/dutch.py,sha256=ekZxLL9d09BUMijCxy9EFa2heNQVvySPySOjhWdtJc8,3815
  euroeval/dataset_configs/english.py,sha256=uQAaGWpHk8xqFCeIhmmPXYTb1cZomeEdRaRe9qIZQrg,2858
  euroeval/dataset_configs/faroese.py,sha256=gkgxQTWGFbfg9Eo1z-NSLROgKDcaij9tAN2mfgtrt0M,1647
- euroeval/dataset_configs/finnish.py,sha256=OyveLgyii0hOlo6HZsqAq4rwDrj8tl2qstRfQKugURo,2342
- euroeval/dataset_configs/french.py,sha256=DKKZEtohWkw_ouBaxWcPzp-K6NhQNtvCKxj8NLbIpUc,2678
- euroeval/dataset_configs/german.py,sha256=3bfRgkqIGkAhcw4kwcJN9PKuJSmi1r6AFTJY-IWKgWM,2856
+ euroeval/dataset_configs/finnish.py,sha256=UZwy0_d17O2L-v2AKOu3OlDwFPcLGTZNAOt7ZKlr4K8,2679
+ euroeval/dataset_configs/french.py,sha256=Hei2M4bGIz8hVtaPKQlQATcmK-0bFBNEocEszR3gia0,3014
+ euroeval/dataset_configs/german.py,sha256=sRYtOl6CYf4kZkeINfff6xoKBG4OsDxb2b72lKwELGc,3192
  euroeval/dataset_configs/icelandic.py,sha256=g21IHjcwEZvf_yJ9PobeuBOqRiLOk0oCdEjY34g-UMk,4497
- euroeval/dataset_configs/italian.py,sha256=rHLMkSXT0kFoQlkwHODxO50WBRIfGtkAnW_C-sfIu74,2957
+ euroeval/dataset_configs/italian.py,sha256=4SEmdUyfGbbwMPhv_9nL3JNJtoDKHLAlWuvr7Ihmi9o,3294
  euroeval/dataset_configs/norwegian.py,sha256=-WvQM44xCwjrqBzlAy4rjf6v87fGera2JmZV_069TeQ,6003
  euroeval/dataset_configs/portuguese.py,sha256=3SqbwD0PNTILGALzh50pVoEwC-spRD75ZeE2NEj151E,2367
- euroeval/dataset_configs/spanish.py,sha256=VKfBIpBRR38ckuULw7Ftmc-0smsm6GshUAik2-Y1Npw,2855
- euroeval/dataset_configs/swedish.py,sha256=WpExi4TJqy_Ruwy4Kvde94jM605vT_88el_KKUzLV4E,3108
+ euroeval/dataset_configs/spanish.py,sha256=Bm0Z19Mh2qYXR0RIRlqEkzfVb5KiqJRectfuY7JLql4,3192
+ euroeval/dataset_configs/swedish.py,sha256=js4paNsuC0nQzPpf6_BzHBf7MT60XUpP1-qM2uxRtQs,3445
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
  euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
- euroeval/prompt_templates/multiple_choice.py,sha256=F9ItGQtnaaez15A8MQ1UCpKRDsLM-AZyRdYetGAofa0,5494
+ euroeval/prompt_templates/multiple_choice.py,sha256=wHnQCE5bv947L6hSK5zJitE37V-PbuNYAp156mWaIYA,5494
  euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
  euroeval/prompt_templates/reading_comprehension.py,sha256=3Nch-9zHfUDIwy-k5mP-TRhHQRQ9nad8HdhpJ1S8nGc,7072
  euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
  euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
- euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6zmoaPgYVyzMmOkNjr58,27284
- euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
- euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
- euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
- euroeval-15.13.0.dist-info/METADATA,sha256=HnDtAE2-sYFmSl4yM9PQhgUrfklR_OB5C5aXPOgz5U8,13478
- euroeval-15.13.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.13.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.13.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
- euroeval-15.13.0.dist-info/RECORD,,
+ euroeval/task_group_utils/question_answering.py,sha256=6jpiHukzA7IrJh4vVYyZDDyvD5Xc2GsxoXzpm_PHpXw,27503
+ euroeval/task_group_utils/sequence_classification.py,sha256=ihJO55f3Dy565d3ByYGMuSINasnjAADaTrM59LwZzA0,12977
+ euroeval/task_group_utils/text_to_text.py,sha256=go0y6X9QAv5iywlLAclb8cqFX_3QlAT-1-VNZ9zMWFA,4832
+ euroeval/task_group_utils/token_classification.py,sha256=BDqOfopdH5Bbj67HTEbZd9KZtNCDNket8NrCTfxZFzQ,17773
+ euroeval-15.15.0.dist-info/METADATA,sha256=ldIaYcwIlgDbuHPz_uHKrcYbmh-GLB9T239BjqYRalk,13377
+ euroeval-15.15.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.15.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.15.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+ euroeval-15.15.0.dist-info/RECORD,,