EuroEval 16.0.0-py3-none-any.whl → 16.0.1-py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

euroeval/__init__.py CHANGED
@@ -13,6 +13,7 @@ from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
 warnings.filterwarnings("ignore", category=UserWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
@@ -101,6 +102,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
+ # Use the FlashInfer flash-attention backend for vLLM
+ os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+
+
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
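Note: the new VLLM_ATTENTION_BACKEND variable only has an effect if it is in the environment before the vLLM engine is created, which is why it sits next to the other environment variables in euroeval/__init__.py; the backend itself is supplied by the flashinfer-python package added to the 'generative' and 'all' extras in this release (see the METADATA diff below). A minimal sketch of the same pattern (the guard against overriding a user-supplied value is an added assumption, not something the diff does):

    import os

    # Ask vLLM to use the FlashInfer flash-attention backend. This has to be set
    # before the vLLM engine is constructed.
    if "VLLM_ATTENTION_BACKEND" not in os.environ:
        os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"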
euroeval/benchmark_modules/vllm.py CHANGED
@@ -337,31 +337,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 if end_of_chat_token:
 stop_tokens.append(end_of_chat_token)
 
- structured_generation_schema = None
- if self.dataset_config.task.uses_structured_output:
- if self.generative_type == GenerativeType.REASONING:
- log_once(
- f"The model {self.model_config.model_id!r} is a reasoning model "
- "and thus does not support structured generation, so we do not "
- "enable it.",
- level=logging.DEBUG,
- )
- else:
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- answer_format_class = create_model(
- "AnswerFormat", **keys_and_their_types
- )
- structured_generation_schema = answer_format_class.model_json_schema()
- log_once(
- "Using structured generation with the JSON schema "
- f"{structured_generation_schema}",
- level=logging.DEBUG,
- )
-
 # Get the mapping from labels to the first token in the label. We call this each
 # time we generate a new dataset since the dataset config can change
 self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +357,29 @@ class VLLMModel(HuggingFaceEncoderModel):
 "error was. Skipping this evaluation."
 )
 
- # Define the guided decoding that we will use for structured generation
- if structured_generation_schema is not None:
+ structured_generation_schema = None
+ if (
+ self.dataset_config.task.uses_structured_output
+ or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+ ) and self.generative_type == GenerativeType.REASONING:
+ guided_decoding = None
+ logger.debug(
+ "The dataset uses structured output, but we are not using it as the "
+ "model is a reasoning model."
+ )
+ elif self.dataset_config.task.uses_structured_output:
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+ structured_generation_schema = answer_format_class.model_json_schema()
+ log_once(
+ "Using structured generation with the JSON schema: "
+ f"{json.dumps(structured_generation_schema)}",
+ level=logging.DEBUG,
+ )
 guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
 elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
 guided_decoding = GuidedDecodingParams(
@@ -392,8 +388,17 @@ class VLLMModel(HuggingFaceEncoderModel):
 for label in self.dataset_config.labels
 ]
 )
+ log_once(
+ "Using structured generation with the choices: "
+ f"{guided_decoding.choice!r}.",
+ level=logging.DEBUG,
+ )
 else:
 guided_decoding = None
+ log_once(
+ "Not using structured generation as the dataset does not require it.",
+ level=logging.DEBUG,
+ )
 
 # Define the parameters used for vLLM generation
 max_tokens: int = (
@@ -439,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 # Generate sequences using vLLM
 input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
 num_attempts = 3
+ truncation_attempts = 0
 for _ in range(num_attempts):
 try:
 raw_outputs = self._model.generate(
@@ -466,12 +472,19 @@ class VLLMModel(HuggingFaceEncoderModel):
 "Prompts are too long, so truncating them and trying again..."
 )
 logger.debug(f"The error message was: {str(e)}")
+
+ # If we have already tried truncating the prompts a few times, then
+ # we truncate a bit more aggressively
+ extra_truncation = 50 * truncation_attempts
+ truncation_attempts += 1
+
 tokenized_prompts = self._tokeniser(
 text=prompts,
 truncation=True,
 max_length=max(
 min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
- - max_tokens,
+ - max_tokens
+ - extra_truncation,
 0,
 ),
 )
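The refactored block above now decides between three cases: no guided decoding for reasoning models, a JSON schema for structured (NER-style) output, and a plain choice list for classification tasks that use logprobs. A condensed, hedged sketch of that decision with standalone variables in place of the self.* attributes (tag names and labels are made-up examples, and GuidedDecodingParams is assumed importable from vllm.sampling_params as in recent vLLM releases):

    import typing as t

    from pydantic import conlist, create_model
    from vllm.sampling_params import GuidedDecodingParams

    is_reasoning_model = False       # e.g. generative_type == GenerativeType.REASONING
    uses_structured_output = True    # e.g. dataset_config.task.uses_structured_output
    ner_tag_names = ["person", "location"]
    labels = ["a", "b", "c", "d"]

    if is_reasoning_model:
        # Reasoning models get no guided decoding at all.
        guided_decoding = None
    elif uses_structured_output:
        # One list-of-strings field per NER tag; the JSON schema of the resulting
        # pydantic model constrains generation.
        fields: dict[str, t.Any] = {
            tag: (conlist(str, max_length=5), ...) for tag in ner_tag_names
        }
        schema = create_model("AnswerFormat", **fields).model_json_schema()
        guided_decoding = GuidedDecodingParams(json=schema)
    else:
        # Classification with logprobs: constrain generation to the label strings.
        guided_decoding = GuidedDecodingParams(choice=labels)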
euroeval/constants.py CHANGED
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+ # The number of tokens we generate when evaluating generative models on classification
+ # tasks. We also use this to determine whether we should store logprobs in the model
+ # outputs (and cache).
+ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
euroeval/data_models.py CHANGED
@@ -125,6 +125,12 @@ class Task:
 A list of generative model types that are allowed to be evaluated on this
 task. If None, all generative model types are allowed. Only relevant if
 `allowed_model_types` includes generative models.
+ allow_invalid_model_outputs (optional):
+ Whether to allow invalid model outputs. This is only relevant for generative
+ models on classification tasks, where the model may generate an output
+ which is not one of the allowed labels. If True, the model output will be
+ mapped to the closest valid label. If False, the model output will be
+ considered incorrect and the evaluation will be aborted. Defaults to True.
 """
 
 name: str
@@ -148,6 +154,7 @@ class Task:
 GenerativeType.REASONING,
 ]
 )
+ allow_invalid_model_outputs: bool = True
 
 def __post_init__(self) -> None:
 """Post-initialisation checks."""
@@ -430,7 +437,6 @@ class DatasetConfig:
 if self._prompt_prefix is None
 else self._prompt_prefix
 )
- prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
 return prompt_prefix
 
 @property
@@ -443,7 +449,6 @@ class DatasetConfig:
 if self._prompt_template is None
 else self._prompt_template
 )
- prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
 return prompt_template
 
 @property
@@ -456,9 +461,6 @@ class DatasetConfig:
 if self._instruction_prompt is None
 else self._instruction_prompt
 )
- instruction_prompt = instruction_prompt.replace(
- "{labels_str}", self._labels_str
- )
 return instruction_prompt
 
 @property
@@ -519,15 +521,16 @@ class DatasetConfig:
 """Return a hash of the dataset configuration."""
 return hash(self.name)
 
- @property
- def _labels_str(self) -> str:
+ def get_labels_str(self, labels: list[str] | None = None) -> str:
 """Converts a set of labels to a natural string, in the specified language.
 
 If the task is NER, we separate using 'and' and use the mapped labels instead of
 the BIO NER labels.
 
 Args:
- language: The language to be used when converting the labels.
+ labels (optional):
+ The labels to convert to a natural string. If None, uses all the labels
+ in the dataset. Defaults to None.
 
 Returns:
 The natural string representation of the labels in specified language.
@@ -539,16 +542,17 @@ class DatasetConfig:
 else:
 sep_word = main_language.or_separator
 
- local_labels: list[str] = []
- for label in self.labels:
- if label not in self.prompt_label_mapping:
- continue
- local_label = self.prompt_label_mapping[label]
- if local_label not in local_labels:
- local_labels.append(local_label)
+ if labels is None:
+ labels = list()
+ for english_label in self.labels:
+ if english_label not in self.prompt_label_mapping:
+ continue
+ label = self.prompt_label_mapping[english_label]
+ if label not in labels:
+ labels.append(label)
 
 # Convert labels to single-quoted labels - and remove duplicates
- quoted_labels = [f"'{label}'" for label in local_labels]
+ quoted_labels = [f"'{label}'" for label in labels]
 
 if not quoted_labels:
 return ""
euroeval/dataset_configs/danish.py CHANGED
@@ -84,7 +84,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
 languages=[DA],
 splits=["test"],
 bootstrap_samples=False,
- _instruction_prompt="{text}",
 )
 
 
@@ -159,7 +158,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
 languages=[DA],
 splits=["test"],
 bootstrap_samples=False,
- _instruction_prompt="{text}",
 unofficial=True,
 )
 
@@ -172,6 +170,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
 languages=[DA],
 splits=["test"],
 bootstrap_samples=False,
- _instruction_prompt="{text}",
 unofficial=True,
 )
euroeval/generation_utils.py CHANGED
@@ -9,7 +9,7 @@ import typing as t
 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
 from .tokenization_utils import apply_chat_template
- from .utils import log_once
+ from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
 from datasets import DatasetDict
@@ -230,18 +230,49 @@ def apply_prompt(
 return dataset_config.prompt_template.format(**kwargs), ""
 
 match dataset_config.task.task_group:
- case (
- TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
- ):
+ case TaskGroup.SEQUENCE_CLASSIFICATION:
+ labels_str = dataset_config.get_labels_str()
+ few_shot_sections = [
+ create_prompt(
+ text=example["text"].replace("\n", " ").strip(),
+ label=example["label"].replace("\n", " ").strip(),
+ labels_str=labels_str,
+ )
+ for example in few_shot_examples
+ ]
+ new_sections = [
+ create_prompt(
+ text=text.replace("\n", " ").strip(),
+ label="",
+ labels_str=labels_str,
+ )
+ for text in examples["text"]
+ ]
+
+ case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
 few_shot_sections = [
 create_prompt(
 text=example["text"].replace("\n", " ").strip(),
 label=example["label"].replace("\n", " ").strip(),
+ labels_str=dataset_config.get_labels_str(
+ labels=extract_multiple_choice_labels(
+ prompt=example["text"],
+ candidate_labels=dataset_config.labels,
+ )
+ ),
 )
 for example in few_shot_examples
 ]
 new_sections = [
- create_prompt(text=text.replace("\n", " ").strip(), label="")
+ create_prompt(
+ text=text.replace("\n", " ").strip(),
+ label="",
+ labels_str=dataset_config.get_labels_str(
+ labels=extract_multiple_choice_labels(
+ prompt=text, candidate_labels=dataset_config.labels
+ )
+ ),
+ )
 for text in examples["text"]
 ]
 
@@ -259,6 +290,7 @@ def apply_prompt(
 ]
 
 case TaskGroup.TOKEN_CLASSIFICATION:
+ labels_str = dataset_config.get_labels_str()
 
 def create_label(example: dict) -> str:
 prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -280,12 +312,15 @@ def apply_prompt(
 create_prompt(
 text=" ".join(example["tokens"]).replace("\n", " ").strip(),
 label=create_label(example=example),
+ labels_str=labels_str,
 )
 for example in few_shot_examples
 ]
 new_sections = [
 create_prompt(
- text=" ".join(tokens).replace("\n", " ").strip(), label=""
+ text=" ".join(tokens).replace("\n", " ").strip(),
+ label="",
+ labels_str=labels_str,
 )
 for tokens in examples["tokens"]
 ]
@@ -375,4 +410,7 @@ def apply_prompt(
 for new_prompt, _ in new_sections
 ]
 
+ # Always add the final prompts without few-shot examples, too, for analysis
+ examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
 return examples
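For multiple-choice prompts, labels_str is now computed per sample from the options actually present in the prompt, via extract_multiple_choice_labels (defined in the euroeval/utils.py diff below). A small self-contained sketch of that extraction step; this is a simplified stand-in that omits the real helper's InvalidBenchmark error handling:

    import re

    def extract_multiple_choice_labels(prompt: str, candidate_labels: list[str]) -> list[str]:
        # Keep only the options that actually appear as "a. ", "b. ", ... in the prompt.
        return [
            label
            for label in candidate_labels
            if re.search(rf"\b{label}\. ", prompt, flags=re.IGNORECASE)
        ]

    prompt = "Question: ...\nOptions:\na. yes\nb. no\nAnswer:"
    print(extract_multiple_choice_labels(prompt, ["a", "b", "c", "d"]))  # ['a', 'b']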
euroeval/metrics/pipeline.py CHANGED
@@ -26,6 +26,27 @@ logger: logging.Logger = logging.getLogger("euroeval")
 T = t.TypeVar("T", bound=int | float | str | bool)
 
 
+ class PreprocessingFunction(t.Protocol):
+ """A protocol for a preprocessing function."""
+
+ def __call__(
+ self, predictions: c.Sequence[int], dataset: "Dataset"
+ ) -> c.Sequence[int]:
+ """Preprocess the model predictions before they are passed to the pipeline.
+
+ Args:
+ predictions:
+ The model predictions.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.
+
+ Returns:
+ The preprocessed model predictions.
+ """
+ ...
+
+
 class PipelineMetric(Metric):
 """Load a scikit-learn pipeline and use it to get scores from the predictions."""
 
@@ -36,7 +57,7 @@ class PipelineMetric(Metric):
 pipeline_repo: str,
 pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
 pipeline_file_name: str = "pipeline.pkl",
- preprocessing_fn: c.Callable[[c.Sequence[T]], c.Sequence[T]] = lambda x: x,
+ preprocessing_fn: PreprocessingFunction | None = None,
 postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
 ) -> None:
 """Initialise the pipeline transform metric.
@@ -101,7 +122,10 @@ class PipelineMetric(Metric):
 """
 if self.pipeline is None:
 self.pipeline = self._download_pipeline()
- predictions = self.preprocessing_fn(predictions)
+ if self.preprocessing_fn is not None:
+ predictions = self.preprocessing_fn(
+ predictions=predictions, dataset=dataset
+ )
 return self.pipeline_scoring_function(self.pipeline, predictions)
 
 def _download_pipeline(self) -> "Pipeline":
@@ -133,13 +157,18 @@ class PipelineMetric(Metric):
 ### European Values Metric ###
 
 
- def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence[int]:
+ def european_values_preprocessing_fn(
+ predictions: c.Sequence[int], dataset: "Dataset"
+ ) -> c.Sequence[int]:
 """Preprocess the model predictions for the European Values metric.
 
 Args:
 predictions:
 The model predictions, a sequence of integers representing the predicted
 choices for each question.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.
 
 Returns:
 The preprocessed model predictions, a sequence of integers representing the
@@ -154,6 +183,17 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
 num_questions = 53
 num_phrasings_per_question = 5
 
+ # Convert the predictions to integers
+ integer_predictions = []
+ for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+ idx_to_choice = {
+ int(idx): int(choice)
+ for idx, choice in idx_to_choice.items()
+ if choice is not None
+ }
+ integer_prediction = idx_to_choice[prediction]
+ integer_predictions.append(integer_prediction)
+
 assert len(predictions) % num_questions == 0, (
 f"The number of predictions ({len(predictions)}) is not a multiple of "
 f"{num_questions}, which is required for the European Values metric."
@@ -171,7 +211,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
 # Shape: (num_questions, num_phrasings_per_question)
 arr = np.array(
 [
- predictions[i : i + num_phrasings_per_question]
+ integer_predictions[i : i + num_phrasings_per_question]
 for i in range(0, len(predictions), num_phrasings_per_question)
 ]
 )
@@ -188,7 +228,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
 arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
 
 # Convert the array to a list
- predictions = arr.tolist()
+ integer_predictions = arr.tolist()
 
 # Some of the questions are categorical and we're only interested in whether the
 # model chooses a specific choice or not. This mapping takes the question index
@@ -208,11 +248,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
 }
 
 # Map the predictions to the choices we're interested in
- predictions = list(predictions)
+ integer_predictions = list(integer_predictions)
 for question_idx, choice in question_choices.items():
- predictions[question_idx] = 1 if predictions[question_idx] == choice else 0
+ integer_predictions[question_idx] = (
+ 1 if integer_predictions[question_idx] == choice else 0
+ )
 
- return predictions
+ return integer_predictions
 
 
 def european_values_scoring_function(
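PipelineMetric's preprocessing hook now also receives the evaluation dataset, formalised by the PreprocessingFunction protocol above; european_values_preprocessing_fn uses it to map each predicted option index back to the survey's integer choice via the dataset's idx_to_choice column. A minimal sketch of a conforming function (the identity behaviour and the t.Any dataset type are simplifications):

    import collections.abc as c
    import typing as t

    class PreprocessingFunction(t.Protocol):
        def __call__(
            self, predictions: c.Sequence[int], dataset: t.Any
        ) -> c.Sequence[int]: ...

    def passthrough(predictions: c.Sequence[int], dataset: t.Any) -> c.Sequence[int]:
        # No-op example; the real european_values_preprocessing_fn instead reads
        # dataset["idx_to_choice"] to convert letter indices into survey choices.
        return predictions

    fn: PreprocessingFunction = passthrough  # matches the protocol structurally
    print(fn(predictions=[1, 2, 3], dataset=None))  # [1, 2, 3]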
euroeval/model_cache.py CHANGED
@@ -10,7 +10,9 @@ from dataclasses import asdict
 
 from tqdm.auto import tqdm
 
+ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+ from .utils import log_once
 
 if t.TYPE_CHECKING:
 from pathlib import Path
@@ -189,10 +191,20 @@ class ModelCache:
 # the indices of the top scores, to save space. Further, we only store
 # the scores if the generated sequence is shorter than the maximum
 # length
- if model_output.scores is not None and self.max_generated_tokens < 8:
+ if (
+ model_output.scores is not None
+ and self.max_generated_tokens
+ <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+ ):
 assert model_output.scores is not None
 scores = model_output.scores[sample_idx]
 else:
+ if model_output.scores is not None:
+ log_once(
+ "The generated sequence is longer than the maximum "
+ "length for classification. Not caching the scores.",
+ level=logging.DEBUG,
+ )
 scores = None
 self[model_input] = SingleGenerativeModelOutput(
 sequence=model_output.sequences[sample_idx], scores=scores
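The cache now stores logprobs whenever the task generates at most NUM_GENERATION_TOKENS_FOR_CLASSIFICATION tokens, replacing the old hard-coded `< 8` check, and logs once when scores are dropped. A condensed sketch of the gate (the constant's value comes from the euroeval/constants.py diff above; the function name is illustrative):

    NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10

    def should_cache_scores(has_scores: bool, max_generated_tokens: int) -> bool:
        # Logprobs are only worth caching for short, classification-style generations.
        return has_scores and max_generated_tokens <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION

    print(should_cache_scores(True, 10))   # True; was False under the previous `< 8` rule
    print(should_cache_scores(True, 256))  # False, e.g. summarisation-length outputs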
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -126,7 +126,7 @@ def prepare_examples(
 ):
 choice_idxs.append(idx)
 
- choices = [sections[idx] for idx in choice_idxs]
+ choices = [sections[idx] for idx in reversed(choice_idxs)]
 
 # Check that the choices are present, and that all of them are at the end
 assert len(choices) > 0, "No choices found in the document."
@@ -146,7 +146,7 @@ def prepare_examples(
 )
 new_examples["label"] = [
 int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
- for letter, choice in zip("abcde", choices)
+ for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
 ]
 new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
 return new_examples
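The letter-to-choice pairing above now walks the full alphabet instead of just "abcde", so documents with more than five options still get a label per choice. A toy illustration of the zip; the choice texts and the correct letter are made up:

    import string

    choices = ["a. strongly agree", "b. agree", "c. neutral",
               "d. disagree", "e. strongly disagree", "f. don't know"]
    true_letter = "f"  # stands in for examples["label"][0]

    labels = [
        int(choice.startswith(f"{letter}. ") and letter == true_letter)
        for letter, choice in zip(string.ascii_lowercase, choices)
    ]
    print(labels)  # [0, 0, 0, 0, 0, 1]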
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -9,7 +9,11 @@ import numpy as np
 
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
- from ..utils import log_once, raise_if_model_output_contains_nan_values
+ from ..utils import (
+ extract_multiple_choice_labels,
+ log_once,
+ raise_if_model_output_contains_nan_values,
+ )
 
 if t.TYPE_CHECKING:
 from datasets.arrow_dataset import Dataset
@@ -128,6 +132,21 @@ def extract_labels_from_generation(
 or if the model outputted log probabilities but the first label token
 mapping is not provided.
 """
+ # Get the candidate labels, which are the labels that the model can predict
+ default_labels = [
+ dataset_config.prompt_label_mapping[lbl]
+ for lbl in dataset_config.id2label.values()
+ ]
+ if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+ sample_candidate_labels = [
+ extract_multiple_choice_labels(
+ prompt=prompt, candidate_labels=default_labels
+ )
+ for prompt in input_batch["prompt"]
+ ]
+ else:
+ sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
 if model_output.scores is not None:
 if first_label_token_mapping is False:
 raise InvalidBenchmark(
@@ -136,8 +155,8 @@ def extract_labels_from_generation(
 )
 labels = get_closest_logprobs_labels(
 generation_logprobs=model_output.scores,
- dataset_config=dataset_config,
 first_label_token_mapping=first_label_token_mapping,
+ candidate_labels=sample_candidate_labels,
 )
 if labels is not None:
 return labels
@@ -147,31 +166,8 @@ def extract_labels_from_generation(
 "does not seem to be able to do that. Skipping the evaluation."
 )
 
- # Get the candidate labels, which are the labels that the model can predict
- candidate_labels = [
- dataset_config.prompt_label_mapping[lbl]
- for lbl in dataset_config.id2label.values()
- ]
-
 new_predicted_labels: list[str] = list()
 for idx, predicted_label in enumerate(model_output.sequences):
- # Special case if we are doing multiple choice classification: we in this case
- # dynamically change the candidate labels to the labels mentioned in the prompt
- if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
- prompt = input_batch["text"][idx]
- sample_candidate_labels = [
- candidate_label
- for candidate_label in candidate_labels
- if re.search(
- pattern=rf"\b{candidate_label}. ",
- string=prompt,
- flags=re.IGNORECASE,
- )
- is not None
- ]
- else:
- sample_candidate_labels = candidate_labels
-
 # If the prediction includes a boxed answer, use that instead of the full
 # generation
 if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
@@ -192,22 +188,43 @@ def extract_labels_from_generation(
 s2=candidate_label.lower(),
 weights=(insertion_weight, deletion_weight, substitution_weight),
 )
- for candidate_label in sample_candidate_labels
+ for candidate_label in sample_candidate_labels[idx]
+ ]
+
+ best_candidate_label = sample_candidate_labels[idx][
+ np.argmin(edit_distances).item()
 ]
 
- # If no candidate labels were found, we assume that something is wrong with the
- # model output, and we raise an error
+ # If no candidate labels were found, we either pick the label with the smallest
+ # word edit distance to the predicted label (if invalid model outputs are
+ # allowed), or we raise an error
 if min(edit_distances) > 100:
- raise InvalidBenchmark(
- "No candidate labels found for the predicted label "
- f"{predicted_label!r}, out of the candidate labels "
- f"{sample_candidate_labels}. This likely means that the model output "
- "is completely off, and we cannot extract any labels from it. Please "
- "check the model output and the candidate labels."
- )
+ if dataset_config.task.allow_invalid_model_outputs:
+ logger.warning(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, but since invalid model outputs are "
+ "allowed for this task, we will use the closest candidate label "
+ f"({best_candidate_label})) as the output label. If you see this "
+ "warning very often, please report this issue to the EuroEval "
+ "team at github.com/EuroEval/EuroEval/issues."
+ )
+ logger.debug(
+ "The candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
+ else:
+ raise InvalidBenchmark(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, and we cannot extract any labels from "
+ "it. Please check the model output and the candidate labels. The "
+ "candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
 
- # Pick the label with the smallest word edit distance to the predicted label
- best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
 new_predicted_labels.append(best_candidate_label)
 
 return new_predicted_labels
@@ -215,8 +232,8 @@ def extract_labels_from_generation(
 
 def get_closest_logprobs_labels(
 generation_logprobs: list[list[list[tuple[str, float]]]],
- dataset_config: "DatasetConfig",
 first_label_token_mapping: dict[str, str] | t.Literal[True],
+ candidate_labels: list[list[str]],
 ) -> list[str] | None:
 """Get the labels with the highest predicted logprob value.
 
@@ -229,11 +246,11 @@ def get_closest_logprobs_labels(
 generation_logprobs:
 The logprobs of the generated tokens, for all samples in the batch. Of shape
 (batch_size, num_tokens, num_logprobs).
- dataset_config:
- The configuration of the dataset.
 first_label_token_mapping:
 A mapping from labels to the first token in each label, or alternatively a
 `True` value indicating that the model should output logprobs.
+ candidate_labels:
+ The candidate labels for each sample in the batch.
 
 Returns:
 The predicted labels, or None if labels could not be extracted.
@@ -242,12 +259,8 @@ def get_closest_logprobs_labels(
 InvalidBenchmark:
 If no candidate label can be found for any of the generated labels.
 """
- english_labels = list(dataset_config.id2label.values())
- english2local = dataset_config.prompt_label_mapping
- candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
 output_labels: list[str] = list()
- for sample in generation_logprobs:
+ for idx, sample in enumerate(generation_logprobs):
 for logprob_list in sample:
 generated_labels = [
 re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
@@ -265,7 +278,7 @@ def get_closest_logprobs_labels(
 if isinstance(first_label_token_mapping, dict):
 if any(
 candidate_label not in first_label_token_mapping
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
 ):
 raise InvalidBenchmark(
 "There is a label not present in the first label token "
@@ -276,13 +289,13 @@ def get_closest_logprobs_labels(
 
 candidate_output_labels = {
 candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
 if generated_label == first_label_token_mapping[candidate_label]
 }
 else:
 candidate_output_labels = {
 candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
 if candidate_label.startswith(generated_label)
 }
 
@@ -328,7 +341,7 @@ def get_closest_logprobs_labels(
 elif len(candidate_output_labels) == 0:
 candidate_output_labels_starting_with_generated_label = [
 candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
 if candidate_label.startswith(generated_label)
 ]
 if candidate_output_labels_starting_with_generated_label:
@@ -364,18 +377,18 @@ def get_closest_logprobs_labels(
 if len(sample) == 0:
 log_once(
 "The model outputted an empty string, so no candidate labels could "
- f"be determined. Using the first label, {candidate_labels[0]!r}, "
- "as the output label.",
+ "be determined. Using the first label, "
+ f"{candidate_labels[idx][0]!r}, as the output label.",
 level=logging.INFO,
 )
 else:
 log_once(
 "Could not find a candidate label for any of the generated "
 f"labels in the sample {sample}. Using the first label, "
- f"{candidate_labels[0]!r}, as the output label.",
+ f"{candidate_labels[idx][0]!r}, as the output label.",
 level=logging.INFO,
 )
- output_labels.append(candidate_labels[0])
+ output_labels.append(candidate_labels[idx][0])
 
 assert len(output_labels) == len(generation_logprobs)
 return output_labels
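When no candidate label is within the edit-distance threshold, the behaviour above now depends on Task.allow_invalid_model_outputs: warn and fall back to the closest candidate, or raise. A simplified, self-contained sketch of that decision, using difflib similarity in place of the weighted Levenshtein distances and the InvalidBenchmark exception used in the real code:

    import difflib

    def pick_label(predicted: str, candidates: list[str], allow_invalid: bool) -> str:
        # Rank candidates by similarity to the model output (higher is closer).
        scores = [
            difflib.SequenceMatcher(a=predicted.lower(), b=cand.lower()).ratio()
            for cand in candidates
        ]
        best = candidates[max(range(len(candidates)), key=scores.__getitem__)]
        if max(scores) < 0.3:  # nothing is even close to the model output
            if not allow_invalid:
                raise ValueError(f"No candidate label found for {predicted!r}")
            print(f"Warning: falling back to the closest candidate {best!r}")
        return best

    print(pick_label("positivee", ["positive", "negative"], allow_invalid=True))  # positive
    print(pick_label("garbage!!", ["a", "b", "c"], allow_invalid=True))  # warns, returns 'a'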
euroeval/task_group_utils/token_classification.py CHANGED
@@ -215,6 +215,20 @@ def extract_labels_from_generation(
 
 prompt_label_mapping = dataset_config.prompt_label_mapping
 for prompt_tag_name, named_entities in prediction_dict.items():
+ if not isinstance(named_entities, list):
+ logger.debug(
+ "The model produced an invalid format for the named entities. "
+ f"Expected a list but got {type(named_entities)}. Skipping."
+ )
+ continue
+ try:
+ named_entities = [str(ne) for ne in named_entities]
+ except Exception:
+ logger.debug(
+ "The model produced an invalid format for the named entities. "
+ f"Expected a list of strings but got {named_entities}. Skipping."
+ )
+ continue
 try:
 tag_name = [
 tag[2:]
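The token-classification parser above now tolerates malformed values from the model: anything that is not a list is skipped, and list items are coerced to strings before the tag lookup. A small sketch of that defensive loop over a made-up prediction dict:

    prediction_dict = {
        "person": ["Alice", "Bob"],
        "location": "Paris",           # invalid: not a list, gets skipped
        "organisation": [42, "ACME"],  # coerced to ["42", "ACME"]
    }

    cleaned: dict[str, list[str]] = {}
    for tag_name, named_entities in prediction_dict.items():
        if not isinstance(named_entities, list):
            print(f"Skipping {tag_name!r}: expected a list, got {type(named_entities)}")
            continue
        cleaned[tag_name] = [str(ne) for ne in named_entities]

    print(cleaned)  # {'person': ['Alice', 'Bob'], 'organisation': ['42', 'ACME']}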
euroeval/tasks.py CHANGED
@@ -1,6 +1,7 @@
 """All benchmarks tasks used in EuroEval."""
 
 from . import metrics as m
+ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
@@ -28,7 +29,7 @@ LA = Task(
 template_dict=LA_TEMPLATES,
 metrics=[m.mcc_metric, m.macro_f1_metric],
 default_num_few_shot_examples=12,
- default_max_generated_tokens=10,
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
 default_labels=["correct", "incorrect"],
 uses_logprobs=True,
 )
@@ -73,7 +74,7 @@ SENT = Task(
 template_dict=SENT_TEMPLATES,
 metrics=[m.mcc_metric, m.macro_f1_metric],
 default_num_few_shot_examples=12,
- default_max_generated_tokens=10,
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
 default_labels=["positive", "neutral", "negative"],
 uses_logprobs=True,
 )
@@ -97,7 +98,7 @@ KNOW = Task(
 template_dict=MULTIPLE_CHOICE_TEMPLATES,
 metrics=[m.mcc_metric, m.accuracy_metric],
 default_num_few_shot_examples=5,
- default_max_generated_tokens=10,
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
 default_labels=["a", "b", "c", "d"],
 uses_logprobs=True,
 )
@@ -109,7 +110,7 @@ MCRC = Task(
 template_dict=MULTIPLE_CHOICE_TEMPLATES,
 metrics=[m.mcc_metric, m.accuracy_metric],
 default_num_few_shot_examples=5,
- default_max_generated_tokens=10,
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
 default_labels=["a", "b", "c", "d"],
 uses_logprobs=True,
 )
@@ -121,7 +122,7 @@ COMMON_SENSE = Task(
 template_dict=MULTIPLE_CHOICE_TEMPLATES,
 metrics=[m.mcc_metric, m.accuracy_metric],
 default_num_few_shot_examples=5,
- default_max_generated_tokens=10,
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
 default_labels=["a", "b", "c", "d"],
 uses_logprobs=True,
 )
@@ -133,8 +134,8 @@ EUROPEAN_VALUES = Task(
 template_dict=MULTIPLE_CHOICE_TEMPLATES,
 metrics=[m.european_values_metric],
 default_num_few_shot_examples=0,
- default_max_generated_tokens=10,
- default_labels=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+ default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
 allowed_model_types=[ModelType.GENERATIVE],
 allowed_generative_types=[
 GenerativeType.INSTRUCTION_TUNED,
@@ -142,6 +143,7 @@ EUROPEAN_VALUES = Task(
 ],
 requires_zero_shot=True,
 uses_logprobs=True,
+ allow_invalid_model_outputs=False,
 )
 
 
euroeval/tokenization_utils.py CHANGED
@@ -7,9 +7,8 @@ import typing as t
 import torch
 from transformers import MistralCommonTokenizer
 
- from euroeval.exceptions import InvalidModel
-
 from .enums import GenerativeType
+ from .exceptions import InvalidModel
 from .utils import log_once
 
 if t.TYPE_CHECKING:
euroeval/utils.py CHANGED
@@ -25,7 +25,7 @@ from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
 from transformers import logging as tf_logging
 
- from .exceptions import NaNValueInModelOutput
+ from .exceptions import InvalidBenchmark, NaNValueInModelOutput
 
 if t.TYPE_CHECKING:
 from types import TracebackType
@@ -457,3 +457,34 @@ def get_hf_token(api_key: str | None) -> str | bool:
 level=logging.DEBUG,
 )
 return False
+
+
+ def extract_multiple_choice_labels(
+ prompt: str, candidate_labels: list[str]
+ ) -> list[str]:
+ """Extract multiple choice labels from a prompt.
+
+ Args:
+ prompt:
+ The prompt to extract the labels from.
+ candidate_labels:
+ The candidate labels to look for in the prompt.
+
+ Returns:
+ The extracted labels.
+ """
+ sample_candidate_labels: list[str] = list()
+ for candidate_label in candidate_labels:
+ candidate_label_match = re.search(
+ pattern=rf"\b{candidate_label}\. ", string=prompt, flags=re.IGNORECASE
+ )
+ if candidate_label_match is not None:
+ sample_candidate_labels.append(candidate_label)
+ if not sample_candidate_labels:
+ raise InvalidBenchmark(
+ "Could not extract any candidate labels from the prompt. Please ensure "
+ "that the candidate labels are present in the prompt, each followed by a "
+ "dot and a space (e.g., 'a. '). The candidate labels are: "
+ f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
+ )
+ return sample_candidate_labels
euroeval-16.0.0.dist-info/METADATA → euroeval-16.0.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
- Version: 16.0.0
+ Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
euroeval-16.0.0.dist-info/RECORD → euroeval-16.0.1.dist-info/RECORD CHANGED
@@ -1,34 +1,34 @@
- euroeval/__init__.py,sha256=MgFG1amMgiTJmK_hcQ7nnX-o4KFhlD1P5xKUBTloPCQ,3564
+ euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
 euroeval/benchmark_config_factory.py,sha256=ZKzGkWr-Mr4wEMYNXUHsYkd2R-dxnNyETZJJ-Fq-my0,11386
 euroeval/benchmarker.py,sha256=YNqhl2QchqzbGMGu8QoJAG_mnYbcJ46ksfaS0x78fiw,49847
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=RR45NiHMI9hphqBJ7Xopde-C18Be9JgJxgg6eYPFVMM,8594
- euroeval/constants.py,sha256=HWJ3PJRS-ZbAMXTvujiK8QP7IiS4RHkjnegv3oi52w0,2499
+ euroeval/constants.py,sha256=imy-YwofbAwTbjk_vgynYf3zaK5kKV349oXZl99DVyM,2742
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
- euroeval/data_models.py,sha256=NdzD1ER3GHJp51UXLGTW8iTYwzZlITH2nO0vanTkEWU,24272
+ euroeval/data_models.py,sha256=UGyqPAYFImrR1gi4ctQdCVb0rjVkEmyf4Lc1a7_6t6E,24663
 euroeval/enums.py,sha256=V73E8FTL1aRz74OKcxokTYLnO7Q8HGs2QI0JPZI4qQo,3032
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
 euroeval/generation.py,sha256=wm2u8fDGDgtWxCReG3N6v4_lLvo0OHTpR88ThGSRH7A,12139
- euroeval/generation_utils.py,sha256=vU-j9kjFDuPlSizEaRByx_XJyyAVpE8PdGOm9i--9zQ,14613
+ euroeval/generation_utils.py,sha256=w3hfiJfUPDjf2xSKdDrhlpfuxZlztF0_0h2sFPB2hT0,16212
 euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
- euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
+ euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=gJ7DSQVyE2_8qZxJPuUJcFk7Byj2D7nevE23kd4XMbA,3004
 euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
- euroeval/tasks.py,sha256=jl8HicriMSN_LfHANokVGFqzgV53QcJ5dmzb297xI04,4173
- euroeval/tokenization_utils.py,sha256=icEfttWReKRC5MbREOuxTHOPpuVvH6uHhnqz1w7qIyA,20565
+ euroeval/tasks.py,sha256=fwmDKnIexmWbm8HueLUilYzqdNRfo0rFxX-tjZ53Nbg,4503
+ euroeval/tokenization_utils.py,sha256=66nip9llPw3XBEzGY0TE1DrejLV2WvdSA1p1euXC6Bg,20556
 euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
- euroeval/utils.py,sha256=O4JIROPfbA7MD9SbOY0CifoCckYjmdNjXYjOxDwBnwM,14149
+ euroeval/utils.py,sha256=ITvT-JxXosrDuElNV7cbASfxzDWSBz9mJWAZHiTOiZY,15304
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=vYW97bnlzqxxcIq6lY-zd0o6zxyDRMhT85jOhdKnoYE,11482
 euroeval/benchmark_modules/fresh.py,sha256=_iRTHt9qUkq7jPOlgwx7IwZG48dK4mjMrh7KiEHeUjE,10462
 euroeval/benchmark_modules/hf.py,sha256=HDXuVwt0kZUyL9x3aG5pEjSdGCRfzegqT0xKZYprjU0,43843
 euroeval/benchmark_modules/litellm.py,sha256=M6ct5ppcYfO-Il5VMRm3PuyAeQ-rtS22UKyRStLnqfM,59210
- euroeval/benchmark_modules/vllm.py,sha256=dTwGGOFQ7wqYXg7x2YBUJNQcO6OwqjTMBfUf5OveXNk,41289
+ euroeval/benchmark_modules/vllm.py,sha256=ckWLA9maDP5TLAfLhEXzkOYJBngb5BQR7X7RLKPl64A,41824
 euroeval/dataset_configs/__init__.py,sha256=lEOr4kJzgtUymeNBVhd-VwdUK0YTUZ3GjUMlLz5fGWk,2010
- euroeval/dataset_configs/danish.py,sha256=3n9e0r-hYRI2hPOgLDMQsO8bPgZKjw7OcFCUsCvdmk4,5294
+ euroeval/dataset_configs/danish.py,sha256=Pb43E-xfgQk9uaxq8ooznvf8okdX8KAYFEPHt1CG_TQ,5192
 euroeval/dataset_configs/dutch.py,sha256=tY7FDw7BmhXxNfI1hqfasxQXP0QbYTqknokTZ7gqdRY,5079
 euroeval/dataset_configs/english.py,sha256=Y4yc3AQu8WojqENj0sy4-rIlx1LhPnsCQ0DeonqDsVs,4128
 euroeval/dataset_configs/estonian.py,sha256=o13P_XkrdhLFCz9l8LJy-TSY3JIN7XmByxesEDiagnc,2879
@@ -47,7 +47,7 @@ euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,
 euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
 euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
 euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
- euroeval/metrics/pipeline.py,sha256=T65p2sxPnwh2WgCjqsqzvE3XOzizNY7rlSm8KPR7sCk,8883
+ euroeval/metrics/pipeline.py,sha256=a09Um3tnNdyQhzyDa9k-seYQXriYiJRQ5vyHK2lrKcg,10276
 euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=9ZIyv_hfI2Aj20Uy9SY1izq5OBRV844PXPiZCNCOoEY,8207
@@ -57,13 +57,13 @@ euroeval/prompt_templates/reading_comprehension.py,sha256=eRMN-kCT3wuImbuFXzZYfo
 euroeval/prompt_templates/sentiment_classification.py,sha256=eIXn-aAY7LKeXqxzMKoqdVbihA2f1RaNQk7DhceuQdQ,8887
 euroeval/prompt_templates/summarization.py,sha256=GvnKuYJKbJ_2QkdtSWp_h4RhfOXdq-7_yYeClJSPaTY,6137
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
- euroeval/task_group_utils/multiple_choice_classification.py,sha256=lNEOWi3ckLBnMP1QoSTxNxT-s6kBz2XH17mrmjQlv5s,7075
+ euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
 euroeval/task_group_utils/question_answering.py,sha256=vdEbcZy7BE6ICA7kWkPYmPW4eVuIiZ_4uJRLUexDhwY,27750
- euroeval/task_group_utils/sequence_classification.py,sha256=K_hFWY6D5WR8-uy6ZikCq3ighHNHSyzW7A62vwDkwDs,16512
+ euroeval/task_group_utils/sequence_classification.py,sha256=ZIXcYo6ins9VUv8TT4aupWrfUQoWGBlgU8a1hYATOYM,17249
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
- euroeval/task_group_utils/token_classification.py,sha256=6bN9soT1kLthutCpqUT-jDmZZw9Mt7H3tjI4zVvE4BY,16469
- euroeval-16.0.0.dist-info/METADATA,sha256=uvzi8Bkgab8rKhgKavqFnv8rpL0KntFIYMZ7f1Joa0U,13544
- euroeval-16.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-16.0.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
- euroeval-16.0.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
- euroeval-16.0.0.dist-info/RECORD,,
+ euroeval/task_group_utils/token_classification.py,sha256=sNl0rhkXI9g5zKsJujrWX-9jWbYYK2iaKA1AcUg0xW4,17118
+ euroeval-16.0.1.dist-info/METADATA,sha256=toyIiyjwyl4Oty2YsD-P6r95hN0Si3BkBNBMOfmiwBA,13729
+ euroeval-16.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-16.0.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+ euroeval-16.0.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+ euroeval-16.0.1.dist-info/RECORD,,