EuroEval 16.0.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_modules/vllm.py +41 -28
- euroeval/constants.py +6 -0
- euroeval/data_models.py +20 -16
- euroeval/dataset_configs/danish.py +0 -3
- euroeval/generation_utils.py +44 -6
- euroeval/metrics/pipeline.py +50 -8
- euroeval/model_cache.py +13 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/sequence_classification.py +66 -53
- euroeval/task_group_utils/token_classification.py +14 -0
- euroeval/tasks.py +9 -7
- euroeval/tokenization_utils.py +1 -2
- euroeval/utils.py +32 -1
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +3 -1
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/RECORD +19 -19
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -13,6 +13,7 @@ from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
 warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
@@ -101,6 +102,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
+# Use the FlashInfer flash-attention backend for vLLM
+os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+
+
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -337,31 +337,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 if end_of_chat_token:
     stop_tokens.append(end_of_chat_token)
 
-structured_generation_schema = None
-if self.dataset_config.task.uses_structured_output:
-    if self.generative_type == GenerativeType.REASONING:
-        log_once(
-            f"The model {self.model_config.model_id!r} is a reasoning model "
-            "and thus does not support structured generation, so we do not "
-            "enable it.",
-            level=logging.DEBUG,
-        )
-    else:
-        ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-        keys_and_their_types: dict[str, t.Any] = {
-            tag_name: (conlist(str, max_length=5), ...)
-            for tag_name in ner_tag_names
-        }
-        answer_format_class = create_model(
-            "AnswerFormat", **keys_and_their_types
-        )
-        structured_generation_schema = answer_format_class.model_json_schema()
-        log_once(
-            "Using structured generation with the JSON schema "
-            f"{structured_generation_schema}",
-            level=logging.DEBUG,
-        )
-
 # Get the mapping from labels to the first token in the label. We call this each
 # time we generate a new dataset since the dataset config can change
 self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +357,29 @@ class VLLMModel(HuggingFaceEncoderModel):
         "error was. Skipping this evaluation."
     )
 
-
-if
+structured_generation_schema = None
+if (
+    self.dataset_config.task.uses_structured_output
+    or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+) and self.generative_type == GenerativeType.REASONING:
+    guided_decoding = None
+    logger.debug(
+        "The dataset uses structured output, but we are not using it as the "
+        "model is a reasoning model."
+    )
+elif self.dataset_config.task.uses_structured_output:
+    ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+    keys_and_their_types: dict[str, t.Any] = {
+        tag_name: (conlist(str, max_length=5), ...)
+        for tag_name in ner_tag_names
+    }
+    answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+    structured_generation_schema = answer_format_class.model_json_schema()
+    log_once(
+        "Using structured generation with the JSON schema: "
+        f"{json.dumps(structured_generation_schema)}",
+        level=logging.DEBUG,
+    )
     guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
 elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
     guided_decoding = GuidedDecodingParams(
@@ -392,8 +388,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             for label in self.dataset_config.labels
         ]
     )
+    log_once(
+        "Using structured generation with the choices: "
+        f"{guided_decoding.choice!r}.",
+        level=logging.DEBUG,
+    )
 else:
     guided_decoding = None
+    log_once(
+        "Not using structured generation as the dataset does not require it.",
+        level=logging.DEBUG,
+    )
 
 # Define the parameters used for vLLM generation
 max_tokens: int = (
@@ -439,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 # Generate sequences using vLLM
 input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
 num_attempts = 3
+truncation_attempts = 0
 for _ in range(num_attempts):
     try:
         raw_outputs = self._model.generate(
@@ -466,12 +472,19 @@ class VLLMModel(HuggingFaceEncoderModel):
     "Prompts are too long, so truncating them and trying again..."
 )
 logger.debug(f"The error message was: {str(e)}")
+
+# If we have already tried truncating the prompts a few times, then
+# we truncate a bit more aggressively
+extra_truncation = 50 * truncation_attempts
+truncation_attempts += 1
+
 tokenized_prompts = self._tokeniser(
     text=prompts,
     truncation=True,
     max_length=max(
         min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-        - max_tokens
+        - max_tokens
+        - extra_truncation,
         0,
     ),
 )
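For orientation (not part of the diff): the hunks above reorder when structured generation is enabled. Reasoning models now skip guided decoding even for logprob-based classification tasks, structured-output tasks get a JSON schema built with Pydantic, and plain classification tasks fall back to a choice list. Below is a rough, standalone sketch of that decision order; the TaskInfo container and the plain-dict return value are illustrative stand-ins and not the EuroEval or vLLM API.

    from dataclasses import dataclass, field

    from pydantic import conlist, create_model


    @dataclass
    class TaskInfo:
        """Illustrative stand-in for the dataset/task configuration used above."""
        uses_structured_output: bool
        uses_logprobs: bool
        labels: list[str] = field(default_factory=list)
        ner_tag_names: list[str] = field(default_factory=list)
        is_reasoning_model: bool = False


    def choose_guided_decoding(task: TaskInfo) -> dict | None:
        """Mirror the decision order in the diff: reasoning models opt out first."""
        wants_guidance = task.uses_structured_output or (
            task.uses_logprobs and bool(task.labels)
        )
        if wants_guidance and task.is_reasoning_model:
            return None  # reasoning models skip structured generation entirely
        if task.uses_structured_output:
            # Build a JSON schema with one list-of-strings field per NER tag
            fields = {tag: (conlist(str, max_length=5), ...) for tag in task.ner_tag_names}
            schema = create_model("AnswerFormat", **fields).model_json_schema()
            return {"json": schema}
        if task.uses_logprobs and task.labels:
            return {"choice": task.labels}  # constrain output to the label set
        return None


    if __name__ == "__main__":
        ner_task = TaskInfo(
            uses_structured_output=True,
            uses_logprobs=False,
            ner_tag_names=["person", "location"],
        )
        print(choose_guided_decoding(ner_task))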
euroeval/constants.py
CHANGED
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
euroeval/data_models.py
CHANGED
@@ -125,6 +125,12 @@ class Task:
         A list of generative model types that are allowed to be evaluated on this
         task. If None, all generative model types are allowed. Only relevant if
         `allowed_model_types` includes generative models.
+    allow_invalid_model_outputs (optional):
+        Whether to allow invalid model outputs. This is only relevant for generative
+        models on classification tasks, where the model may generate an output
+        which is not one of the allowed labels. If True, the model output will be
+        mapped to the closest valid label. If False, the model output will be
+        considered incorrect and the evaluation will be aborted. Defaults to True.
 """
 
 name: str
@@ -148,6 +154,7 @@ class Task:
         GenerativeType.REASONING,
     ]
 )
+allow_invalid_model_outputs: bool = True
 
 def __post_init__(self) -> None:
     """Post-initialisation checks."""
@@ -430,7 +437,6 @@ class DatasetConfig:
     if self._prompt_prefix is None
     else self._prompt_prefix
 )
-prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
 return prompt_prefix
 
 @property
@@ -443,7 +449,6 @@ class DatasetConfig:
     if self._prompt_template is None
     else self._prompt_template
 )
-prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
 return prompt_template
 
 @property
@@ -456,9 +461,6 @@ class DatasetConfig:
     if self._instruction_prompt is None
     else self._instruction_prompt
 )
-instruction_prompt = instruction_prompt.replace(
-    "{labels_str}", self._labels_str
-)
 return instruction_prompt
 
 @property
@@ -519,15 +521,16 @@ class DatasetConfig:
     """Return a hash of the dataset configuration."""
     return hash(self.name)
 
-
-def _labels_str(self) -> str:
+def get_labels_str(self, labels: list[str] | None = None) -> str:
     """Converts a set of labels to a natural string, in the specified language.
 
     If the task is NER, we separate using 'and' and use the mapped labels instead of
     the BIO NER labels.
 
     Args:
-
+        labels (optional):
+            The labels to convert to a natural string. If None, uses all the labels
+            in the dataset. Defaults to None.
 
     Returns:
         The natural string representation of the labels in specified language.
@@ -539,16 +542,17 @@ class DatasetConfig:
 else:
     sep_word = main_language.or_separator
 
-
-
-
-
-
-
-
+if labels is None:
+    labels = list()
+    for english_label in self.labels:
+        if english_label not in self.prompt_label_mapping:
+            continue
+        label = self.prompt_label_mapping[english_label]
+        if label not in labels:
+            labels.append(label)
 
 # Convert labels to single-quoted labels - and remove duplicates
-quoted_labels = [f"'{label}'" for label in
+quoted_labels = [f"'{label}'" for label in labels]
 
 if not quoted_labels:
     return ""
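For orientation (not part of the diff): the new get_labels_str method accepts an optional label subset and otherwise falls back to the deduplicated, prompt-mapped labels of the dataset. A simplified, self-contained sketch of that fallback follows; the separator word, the final joining format, and the example mapping are assumptions for illustration only.

    def labels_to_natural_string(
        labels: list[str] | None,
        all_english_labels: list[str],
        prompt_label_mapping: dict[str, str],
        sep_word: str = "or",  # illustrative; EuroEval picks this per language
    ) -> str:
        """Rough re-implementation of the get_labels_str fallback logic."""
        if labels is None:
            labels = []
            for english_label in all_english_labels:
                if english_label not in prompt_label_mapping:
                    continue
                mapped = prompt_label_mapping[english_label]
                if mapped not in labels:
                    labels.append(mapped)  # preserve order, drop duplicates
        quoted = [f"'{label}'" for label in labels]
        if not quoted:
            return ""
        if len(quoted) == 1:
            return quoted[0]
        return f"{', '.join(quoted[:-1])} {sep_word} {quoted[-1]}"


    # e.g. "'positiv', 'neutral' or 'negativ'" for a Danish-style sentiment mapping
    print(labels_to_natural_string(
        labels=None,
        all_english_labels=["positive", "neutral", "negative"],
        prompt_label_mapping={
            "positive": "positiv", "neutral": "neutral", "negative": "negativ"
        },
    ))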
euroeval/dataset_configs/danish.py
CHANGED
@@ -84,7 +84,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )
 
 
@@ -159,7 +158,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
 
@@ -172,6 +170,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
euroeval/generation_utils.py
CHANGED
@@ -9,7 +9,7 @@ import typing as t
 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
 from .tokenization_utils import apply_chat_template
-from .utils import log_once
+from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -230,18 +230,49 @@ def apply_prompt(
         return dataset_config.prompt_template.format(**kwargs), ""
 
     match dataset_config.task.task_group:
-        case
-
-
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
             few_shot_sections = [
                 create_prompt(
                     text=example["text"].replace("\n", " ").strip(),
                     label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
-                create_prompt(
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                 for text in examples["text"]
             ]
 
@@ -259,6 +290,7 @@ def apply_prompt(
             ]
 
         case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
 
             def create_label(example: dict) -> str:
                 prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -280,12 +312,15 @@ def apply_prompt(
                 create_prompt(
                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                     label=create_label(example=example),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
                 create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                 )
                 for tokens in examples["tokens"]
             ]
@@ -375,4 +410,7 @@ def apply_prompt(
         for new_prompt, _ in new_sections
     ]
 
+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
     return examples
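For orientation (not part of the diff): multiple-choice prompts now get a per-example labels_str computed from whichever option letters actually appear in that example's text. A hedged sketch of that per-sample filtering is below; create_prompt here is a toy formatter and not EuroEval's template-based helper.

    import re


    def extract_choice_letters(prompt: str, candidate_labels: list[str]) -> list[str]:
        """Keep only the option letters that occur as 'x. ' in the prompt."""
        return [
            label
            for label in candidate_labels
            if re.search(rf"\b{label}\. ", prompt, flags=re.IGNORECASE)
        ]


    def create_prompt(text: str, label: str, labels_str: str) -> str:
        """Toy stand-in for EuroEval's template-based prompt builder."""
        return f"Question: {text}\nAnswer with {labels_str}: {label}".rstrip()


    prompt_text = "What is 2+2?\na. 3\nb. 4\nc. 5"
    letters = extract_choice_letters(prompt_text, ["a", "b", "c", "d"])
    labels_str = ", ".join(f"'{letter}'" for letter in letters)  # 'a', 'b', 'c'
    print(create_prompt(text=prompt_text.replace("\n", " "), label="", labels_str=labels_str))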
euroeval/metrics/pipeline.py
CHANGED
@@ -26,6 +26,27 @@ logger: logging.Logger = logging.getLogger("euroeval")
 T = t.TypeVar("T", bound=int | float | str | bool)
 
 
+class PreprocessingFunction(t.Protocol):
+    """A protocol for a preprocessing function."""
+
+    def __call__(
+        self, predictions: c.Sequence[int], dataset: "Dataset"
+    ) -> c.Sequence[int]:
+        """Preprocess the model predictions before they are passed to the pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+
+        Returns:
+            The preprocessed model predictions.
+        """
+        ...
+
+
 class PipelineMetric(Metric):
     """Load a scikit-learn pipeline and use it to get scores from the predictions."""
 
@@ -36,7 +57,7 @@ class PipelineMetric(Metric):
         pipeline_repo: str,
         pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
         pipeline_file_name: str = "pipeline.pkl",
-        preprocessing_fn:
+        preprocessing_fn: PreprocessingFunction | None = None,
         postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
     ) -> None:
         """Initialise the pipeline transform metric.
@@ -101,7 +122,10 @@ class PipelineMetric(Metric):
         """
         if self.pipeline is None:
             self.pipeline = self._download_pipeline()
-
+        if self.preprocessing_fn is not None:
+            predictions = self.preprocessing_fn(
+                predictions=predictions, dataset=dataset
+            )
         return self.pipeline_scoring_function(self.pipeline, predictions)
 
     def _download_pipeline(self) -> "Pipeline":
@@ -133,13 +157,18 @@ class PipelineMetric(Metric):
 ### European Values Metric ###
 
 
-def european_values_preprocessing_fn(
+def european_values_preprocessing_fn(
+    predictions: c.Sequence[int], dataset: "Dataset"
+) -> c.Sequence[int]:
     """Preprocess the model predictions for the European Values metric.
 
     Args:
         predictions:
             The model predictions, a sequence of integers representing the predicted
             choices for each question.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
     Returns:
         The preprocessed model predictions, a sequence of integers representing the
@@ -154,6 +183,17 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     num_questions = 53
     num_phrasings_per_question = 5
 
+    # Convert the predictions to integers
+    integer_predictions = []
+    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+        idx_to_choice = {
+            int(idx): int(choice)
+            for idx, choice in idx_to_choice.items()
+            if choice is not None
+        }
+        integer_prediction = idx_to_choice[prediction]
+        integer_predictions.append(integer_prediction)
+
     assert len(predictions) % num_questions == 0, (
         f"The number of predictions ({len(predictions)}) is not a multiple of "
         f"{num_questions}, which is required for the European Values metric."
@@ -171,7 +211,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     # Shape: (num_questions, num_phrasings_per_question)
     arr = np.array(
         [
-
+            integer_predictions[i : i + num_phrasings_per_question]
             for i in range(0, len(predictions), num_phrasings_per_question)
         ]
     )
@@ -188,7 +228,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
 
     # Convert the array to a list
-
+    integer_predictions = arr.tolist()
 
     # Some of the questions are categorical and we're only interested in whether the
     # model chooses a specific choice or not. This mapping takes the question index
@@ -208,11 +248,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     }
 
     # Map the predictions to the choices we're interested in
-
+    integer_predictions = list(integer_predictions)
     for question_idx, choice in question_choices.items():
-
+        integer_predictions[question_idx] = (
+            1 if integer_predictions[question_idx] == choice else 0
+        )
 
-    return
+    return integer_predictions
 
 
 def european_values_scoring_function(
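For orientation (not part of the diff): the new PreprocessingFunction protocol means any callable that accepts predictions and dataset keyword arguments can be plugged into PipelineMetric. A minimal sketch of a conforming callable follows; the dataset type is simplified to a plain mapping here, and the drop_unanswerable function and its "is_answerable" column are hypothetical.

    import collections.abc as c
    import typing as t


    class PreprocessingFunction(t.Protocol):
        def __call__(
            self, predictions: c.Sequence[int], dataset: t.Mapping[str, list]
        ) -> c.Sequence[int]:
            ...


    def drop_unanswerable(
        predictions: c.Sequence[int], dataset: t.Mapping[str, list]
    ) -> c.Sequence[int]:
        """Hypothetical preprocessing: zero out predictions flagged in the dataset."""
        flags = dataset["is_answerable"]
        return [pred if flag else 0 for pred, flag in zip(predictions, flags)]


    fn: PreprocessingFunction = drop_unanswerable  # structurally matches the protocol
    print(fn(predictions=[2, 3, 1], dataset={"is_answerable": [True, False, True]}))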
euroeval/model_cache.py
CHANGED
@@ -10,7 +10,9 @@ from dataclasses import asdict
 
 from tqdm.auto import tqdm
 
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+from .utils import log_once
 
 if t.TYPE_CHECKING:
     from pathlib import Path
@@ -189,10 +191,20 @@ class ModelCache:
 # the indices of the top scores, to save space. Further, we only store
 # the scores if the generated sequence is shorter than the maximum
 # length
-if
+if (
+    model_output.scores is not None
+    and self.max_generated_tokens
+    <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+):
     assert model_output.scores is not None
     scores = model_output.scores[sample_idx]
 else:
+    if model_output.scores is not None:
+        log_once(
+            "The generated sequence is longer than the maximum "
+            "length for classification. Not caching the scores.",
+            level=logging.DEBUG,
+        )
     scores = None
 self[model_input] = SingleGenerativeModelOutput(
     sequence=model_output.sequences[sample_idx], scores=scores
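For orientation (not part of the diff): the cache now stores logprob scores only when the task's generation budget is at or below NUM_GENERATION_TOKENS_FOR_CLASSIFICATION, keeping classification-style outputs cheap to cache while skipping long free-form generations. A rough sketch of that gate, under the assumption that scores are simply dropped otherwise:

    NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10  # mirrors the new constant


    def scores_to_cache(scores: list | None, max_generated_tokens: int) -> list | None:
        """Return the scores to store, or None when caching them would be wasteful."""
        if (
            scores is not None
            and max_generated_tokens <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
        ):
            return scores
        # Long generations: drop the scores before writing to the cache
        return None


    print(scores_to_cache(scores=[[("yes", -0.1)]], max_generated_tokens=10))   # kept
    print(scores_to_cache(scores=[[("yes", -0.1)]], max_generated_tokens=256))  # dropped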
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED
@@ -126,7 +126,7 @@ def prepare_examples(
     ):
         choice_idxs.append(idx)
 
-    choices = [sections[idx] for idx in choice_idxs]
+    choices = [sections[idx] for idx in reversed(choice_idxs)]
 
     # Check that the choices are present, and that all of them are at the end
     assert len(choices) > 0, "No choices found in the document."
@@ -146,7 +146,7 @@ def prepare_examples(
     )
     new_examples["label"] = [
         int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
     ]
     new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
     return new_examples
euroeval/task_group_utils/sequence_classification.py
CHANGED
@@ -9,7 +9,11 @@ import numpy as np
 
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
-from ..utils import
+from ..utils import (
+    extract_multiple_choice_labels,
+    log_once,
+    raise_if_model_output_contains_nan_values,
+)
 
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
@@ -128,6 +132,21 @@ def extract_labels_from_generation(
             or if the model outputted log probabilities but the first label token
             mapping is not provided.
     """
+    # Get the candidate labels, which are the labels that the model can predict
+    default_labels = [
+        dataset_config.prompt_label_mapping[lbl]
+        for lbl in dataset_config.id2label.values()
+    ]
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        sample_candidate_labels = [
+            extract_multiple_choice_labels(
+                prompt=prompt, candidate_labels=default_labels
+            )
+            for prompt in input_batch["prompt"]
+        ]
+    else:
+        sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
     if model_output.scores is not None:
         if first_label_token_mapping is False:
             raise InvalidBenchmark(
@@ -136,8 +155,8 @@ def extract_labels_from_generation(
             )
         labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
-            dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
+            candidate_labels=sample_candidate_labels,
         )
         if labels is not None:
             return labels
@@ -147,31 +166,8 @@ def extract_labels_from_generation(
             "does not seem to be able to do that. Skipping the evaluation."
         )
 
-    # Get the candidate labels, which are the labels that the model can predict
-    candidate_labels = [
-        dataset_config.prompt_label_mapping[lbl]
-        for lbl in dataset_config.id2label.values()
-    ]
-
     new_predicted_labels: list[str] = list()
     for idx, predicted_label in enumerate(model_output.sequences):
-        # Special case if we are doing multiple choice classification: we in this case
-        # dynamically change the candidate labels to the labels mentioned in the prompt
-        if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
-            prompt = input_batch["text"][idx]
-            sample_candidate_labels = [
-                candidate_label
-                for candidate_label in candidate_labels
-                if re.search(
-                    pattern=rf"\b{candidate_label}. ",
-                    string=prompt,
-                    flags=re.IGNORECASE,
-                )
-                is not None
-            ]
-        else:
-            sample_candidate_labels = candidate_labels
-
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
@@ -192,22 +188,43 @@ def extract_labels_from_generation(
                 s2=candidate_label.lower(),
                 weights=(insertion_weight, deletion_weight, substitution_weight),
             )
-            for candidate_label in sample_candidate_labels
+            for candidate_label in sample_candidate_labels[idx]
+        ]
+
+        best_candidate_label = sample_candidate_labels[idx][
+            np.argmin(edit_distances).item()
         ]
 
-        # If no candidate labels were found, we
-        #
+        # If no candidate labels were found, we either pick the label with the smallest
+        # word edit distance to the predicted label (if invalid model outputs are
+        # allowed), or we raise an error
         if min(edit_distances) > 100:
-
-
-
-
-
-
-
+            if dataset_config.task.allow_invalid_model_outputs:
+                logger.warning(
+                    "No candidate labels found for the predicted label "
+                    f"{predicted_label!r}, out of the candidate labels "
+                    f"{sample_candidate_labels[idx]}. This likely means that the model "
+                    "output is completely off, but since invalid model outputs are "
+                    "allowed for this task, we will use the closest candidate label "
+                    f"({best_candidate_label})) as the output label. If you see this "
+                    "warning very often, please report this issue to the EuroEval "
+                    "team at github.com/EuroEval/EuroEval/issues."
+                )
+                logger.debug(
+                    "The candidate labels were extracted from the prompt: "
+                    f"{input_batch['text'][idx]!r}."
+                )
+            else:
+                raise InvalidBenchmark(
+                    "No candidate labels found for the predicted label "
+                    f"{predicted_label!r}, out of the candidate labels "
+                    f"{sample_candidate_labels[idx]}. This likely means that the model "
+                    "output is completely off, and we cannot extract any labels from "
+                    "it. Please check the model output and the candidate labels. The "
+                    "candidate labels were extracted from the prompt: "
+                    f"{input_batch['text'][idx]!r}."
+                )
 
-        # Pick the label with the smallest word edit distance to the predicted label
-        best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
         new_predicted_labels.append(best_candidate_label)
 
     return new_predicted_labels
@@ -215,8 +232,8 @@ def extract_labels_from_generation(
 
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
-    dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | t.Literal[True],
+    candidate_labels: list[list[str]],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.
 
@@ -229,11 +246,11 @@ def get_closest_logprobs_labels(
         generation_logprobs:
             The logprobs of the generated tokens, for all samples in the batch. Of shape
             (batch_size, num_tokens, num_logprobs).
-        dataset_config:
-            The configuration of the dataset.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             `True` value indicating that the model should output logprobs.
+        candidate_labels:
+            The candidate labels for each sample in the batch.
 
     Returns:
         The predicted labels, or None if labels could not be extracted.
@@ -242,12 +259,8 @@ def get_closest_logprobs_labels(
         InvalidBenchmark:
             If no candidate label can be found for any of the generated labels.
     """
-    english_labels = list(dataset_config.id2label.values())
-    english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
     output_labels: list[str] = list()
-    for sample in generation_logprobs:
+    for idx, sample in enumerate(generation_logprobs):
         for logprob_list in sample:
             generated_labels = [
                 re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
@@ -265,7 +278,7 @@ def get_closest_logprobs_labels(
             if isinstance(first_label_token_mapping, dict):
                 if any(
                     candidate_label not in first_label_token_mapping
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                 ):
                     raise InvalidBenchmark(
                         "There is a label not present in the first label token "
@@ -276,13 +289,13 @@ def get_closest_logprobs_labels(
 
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if generated_label == first_label_token_mapping[candidate_label]
                 }
             else:
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                    if candidate_label.startswith(generated_label)
                 }
 
@@ -328,7 +341,7 @@ def get_closest_logprobs_labels(
             elif len(candidate_output_labels) == 0:
                 candidate_output_labels_starting_with_generated_label = [
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if candidate_label.startswith(generated_label)
                 ]
                 if candidate_output_labels_starting_with_generated_label:
@@ -364,18 +377,18 @@ def get_closest_logprobs_labels(
         if len(sample) == 0:
             log_once(
                 "The model outputted an empty string, so no candidate labels could "
-
-                "as the output label.",
+                "be determined. Using the first label, "
+                f"{candidate_labels[idx][0]!r}, as the output label.",
                 level=logging.INFO,
             )
         else:
             log_once(
                 "Could not find a candidate label for any of the generated "
                 f"labels in the sample {sample}. Using the first label, "
-                f"{candidate_labels[0]!r}, as the output label.",
+                f"{candidate_labels[idx][0]!r}, as the output label.",
                 level=logging.INFO,
             )
-        output_labels.append(candidate_labels[0])
+        output_labels.append(candidate_labels[idx][0])
 
     assert len(output_labels) == len(generation_logprobs)
     return output_labels
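For orientation (not part of the diff): when a generated answer is nowhere near any candidate label (edit distance above the threshold), the behaviour now depends on allow_invalid_model_outputs: warn and keep the closest label, or abort. A condensed sketch follows; the 100 threshold is taken from the diff, while the unweighted distance function and the surrounding helper are illustrative rather than EuroEval's weighted variant.

    import logging

    logger = logging.getLogger("example")


    def edit_distance(s1: str, s2: str) -> int:
        """Plain Levenshtein distance (unweighted, unlike EuroEval's weighted variant)."""
        prev = list(range(len(s2) + 1))
        for i, ch1 in enumerate(s1, start=1):
            curr = [i]
            for j, ch2 in enumerate(s2, start=1):
                curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ch1 != ch2)))
            prev = curr
        return prev[-1]


    def pick_label(
        predicted: str, candidates: list[str], allow_invalid: bool, threshold: int = 100
    ) -> str:
        """Closest-label fallback mirroring the allow_invalid_model_outputs switch."""
        distances = [edit_distance(predicted.lower(), c.lower()) for c in candidates]
        best = candidates[min(range(len(candidates)), key=distances.__getitem__)]
        if min(distances) > threshold:
            if allow_invalid:
                logger.warning("Output %r matches no candidate; using %r", predicted, best)
            else:
                raise ValueError(f"No candidate label found for {predicted!r}")
        return best


    print(pick_label("positiv", ["positive", "negative"], allow_invalid=True))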
euroeval/task_group_utils/token_classification.py
CHANGED
@@ -215,6 +215,20 @@ def extract_labels_from_generation(
 
     prompt_label_mapping = dataset_config.prompt_label_mapping
     for prompt_tag_name, named_entities in prediction_dict.items():
+        if not isinstance(named_entities, list):
+            logger.debug(
+                "The model produced an invalid format for the named entities. "
+                f"Expected a list but got {type(named_entities)}. Skipping."
+            )
+            continue
+        try:
+            named_entities = [str(ne) for ne in named_entities]
+        except Exception:
+            logger.debug(
+                "The model produced an invalid format for the named entities. "
+                f"Expected a list of strings but got {named_entities}. Skipping."
+            )
+            continue
         try:
             tag_name = [
                 tag[2:]
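For orientation (not part of the diff): the added guards skip malformed NER predictions instead of crashing, dropping values that are not lists and coercing list items to strings. A small sketch of that validation on a raw JSON-like prediction dict (the tag names are made up):

    import logging

    logger = logging.getLogger("example")


    def clean_prediction_dict(prediction_dict: dict) -> dict[str, list[str]]:
        """Keep only tag -> list-of-strings entries, mirroring the new guards."""
        cleaned: dict[str, list[str]] = {}
        for tag_name, named_entities in prediction_dict.items():
            if not isinstance(named_entities, list):
                logger.debug(
                    "Expected a list for %r, got %s. Skipping.", tag_name, type(named_entities)
                )
                continue
            try:
                cleaned[tag_name] = [str(ne) for ne in named_entities]
            except Exception:
                logger.debug("Could not convert entities for %r to strings. Skipping.", tag_name)
                continue
        return cleaned


    print(clean_prediction_dict({"person": ["Alice", 42], "location": "not-a-list"}))
    # -> {'person': ['Alice', '42']}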
euroeval/tasks.py
CHANGED
@@ -1,6 +1,7 @@
 """All benchmarks tasks used in EuroEval."""
 
 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
@@ -28,7 +29,7 @@ LA = Task(
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
     uses_logprobs=True,
 )
@@ -73,7 +74,7 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
     uses_logprobs=True,
 )
@@ -97,7 +98,7 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -109,7 +110,7 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -121,7 +122,7 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -133,8 +134,8 @@ EUROPEAN_VALUES = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.european_values_metric],
     default_num_few_shot_examples=0,
-    default_max_generated_tokens=
-    default_labels=["
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
     allowed_model_types=[ModelType.GENERATIVE],
     allowed_generative_types=[
         GenerativeType.INSTRUCTION_TUNED,
@@ -142,6 +143,7 @@ EUROPEAN_VALUES = Task(
     ],
     requires_zero_shot=True,
     uses_logprobs=True,
+    allow_invalid_model_outputs=False,
 )
 
 
euroeval/tokenization_utils.py
CHANGED
@@ -7,9 +7,8 @@ import typing as t
 import torch
 from transformers import MistralCommonTokenizer
 
-from euroeval.exceptions import InvalidModel
-
 from .enums import GenerativeType
+from .exceptions import InvalidModel
 from .utils import log_once
 
 if t.TYPE_CHECKING:
euroeval/utils.py
CHANGED
@@ -25,7 +25,7 @@ from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
 from transformers import logging as tf_logging
 
-from .exceptions import NaNValueInModelOutput
+from .exceptions import InvalidBenchmark, NaNValueInModelOutput
 
 if t.TYPE_CHECKING:
     from types import TracebackType
@@ -457,3 +457,34 @@ def get_hf_token(api_key: str | None) -> str | bool:
             level=logging.DEBUG,
         )
         return False
+
+
+def extract_multiple_choice_labels(
+    prompt: str, candidate_labels: list[str]
+) -> list[str]:
+    """Extract multiple choice labels from a prompt.
+
+    Args:
+        prompt:
+            The prompt to extract the labels from.
+        candidate_labels:
+            The candidate labels to look for in the prompt.
+
+    Returns:
+        The extracted labels.
+    """
+    sample_candidate_labels: list[str] = list()
+    for candidate_label in candidate_labels:
+        candidate_label_match = re.search(
+            pattern=rf"\b{candidate_label}\. ", string=prompt, flags=re.IGNORECASE
+        )
+        if candidate_label_match is not None:
+            sample_candidate_labels.append(candidate_label)
+    if not sample_candidate_labels:
+        raise InvalidBenchmark(
+            "Could not extract any candidate labels from the prompt. Please ensure "
+            "that the candidate labels are present in the prompt, each followed by a "
+            "dot and a space (e.g., 'a. '). The candidate labels are: "
+            f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
+        )
+    return sample_candidate_labels
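For orientation (not part of the diff): the new extract_multiple_choice_labels helper scans a prompt for option letters followed by a dot and a space, and raises InvalidBenchmark if none are found. A quick usage sketch, assuming EuroEval is installed; the prompt text is invented.

    from euroeval.utils import extract_multiple_choice_labels

    prompt = (
        "Which city is the capital of Denmark?\n"
        "a. Copenhagen\n"
        "b. Aarhus\n"
        "c. Odense\n"
        "Answer:"
    )
    print(extract_multiple_choice_labels(prompt=prompt, candidate_labels=["a", "b", "c", "d"]))
    # -> ['a', 'b', 'c']  ("d" never appears in the prompt, so it is filtered out)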
{euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.0.
+Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
{euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/RECORD
CHANGED
@@ -1,34 +1,34 @@
-euroeval/__init__.py,sha256=
+euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
 euroeval/benchmark_config_factory.py,sha256=ZKzGkWr-Mr4wEMYNXUHsYkd2R-dxnNyETZJJ-Fq-my0,11386
 euroeval/benchmarker.py,sha256=YNqhl2QchqzbGMGu8QoJAG_mnYbcJ46ksfaS0x78fiw,49847
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=RR45NiHMI9hphqBJ7Xopde-C18Be9JgJxgg6eYPFVMM,8594
-euroeval/constants.py,sha256=
+euroeval/constants.py,sha256=imy-YwofbAwTbjk_vgynYf3zaK5kKV349oXZl99DVyM,2742
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=UGyqPAYFImrR1gi4ctQdCVb0rjVkEmyf4Lc1a7_6t6E,24663
 euroeval/enums.py,sha256=V73E8FTL1aRz74OKcxokTYLnO7Q8HGs2QI0JPZI4qQo,3032
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
 euroeval/generation.py,sha256=wm2u8fDGDgtWxCReG3N6v4_lLvo0OHTpR88ThGSRH7A,12139
-euroeval/generation_utils.py,sha256=
+euroeval/generation_utils.py,sha256=w3hfiJfUPDjf2xSKdDrhlpfuxZlztF0_0h2sFPB2hT0,16212
 euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
-euroeval/model_cache.py,sha256=
+euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=gJ7DSQVyE2_8qZxJPuUJcFk7Byj2D7nevE23kd4XMbA,3004
 euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
-euroeval/tasks.py,sha256=
-euroeval/tokenization_utils.py,sha256=
+euroeval/tasks.py,sha256=fwmDKnIexmWbm8HueLUilYzqdNRfo0rFxX-tjZ53Nbg,4503
+euroeval/tokenization_utils.py,sha256=66nip9llPw3XBEzGY0TE1DrejLV2WvdSA1p1euXC6Bg,20556
 euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=ITvT-JxXosrDuElNV7cbASfxzDWSBz9mJWAZHiTOiZY,15304
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=vYW97bnlzqxxcIq6lY-zd0o6zxyDRMhT85jOhdKnoYE,11482
 euroeval/benchmark_modules/fresh.py,sha256=_iRTHt9qUkq7jPOlgwx7IwZG48dK4mjMrh7KiEHeUjE,10462
 euroeval/benchmark_modules/hf.py,sha256=HDXuVwt0kZUyL9x3aG5pEjSdGCRfzegqT0xKZYprjU0,43843
 euroeval/benchmark_modules/litellm.py,sha256=M6ct5ppcYfO-Il5VMRm3PuyAeQ-rtS22UKyRStLnqfM,59210
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=ckWLA9maDP5TLAfLhEXzkOYJBngb5BQR7X7RLKPl64A,41824
 euroeval/dataset_configs/__init__.py,sha256=lEOr4kJzgtUymeNBVhd-VwdUK0YTUZ3GjUMlLz5fGWk,2010
-euroeval/dataset_configs/danish.py,sha256=
+euroeval/dataset_configs/danish.py,sha256=Pb43E-xfgQk9uaxq8ooznvf8okdX8KAYFEPHt1CG_TQ,5192
 euroeval/dataset_configs/dutch.py,sha256=tY7FDw7BmhXxNfI1hqfasxQXP0QbYTqknokTZ7gqdRY,5079
 euroeval/dataset_configs/english.py,sha256=Y4yc3AQu8WojqENj0sy4-rIlx1LhPnsCQ0DeonqDsVs,4128
 euroeval/dataset_configs/estonian.py,sha256=o13P_XkrdhLFCz9l8LJy-TSY3JIN7XmByxesEDiagnc,2879
@@ -47,7 +47,7 @@ euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,
 euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
 euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
 euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
-euroeval/metrics/pipeline.py,sha256=
+euroeval/metrics/pipeline.py,sha256=a09Um3tnNdyQhzyDa9k-seYQXriYiJRQ5vyHK2lrKcg,10276
 euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=9ZIyv_hfI2Aj20Uy9SY1izq5OBRV844PXPiZCNCOoEY,8207
@@ -57,13 +57,13 @@ euroeval/prompt_templates/reading_comprehension.py,sha256=eRMN-kCT3wuImbuFXzZYfo
 euroeval/prompt_templates/sentiment_classification.py,sha256=eIXn-aAY7LKeXqxzMKoqdVbihA2f1RaNQk7DhceuQdQ,8887
 euroeval/prompt_templates/summarization.py,sha256=GvnKuYJKbJ_2QkdtSWp_h4RhfOXdq-7_yYeClJSPaTY,6137
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
-euroeval/task_group_utils/multiple_choice_classification.py,sha256=
+euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
 euroeval/task_group_utils/question_answering.py,sha256=vdEbcZy7BE6ICA7kWkPYmPW4eVuIiZ_4uJRLUexDhwY,27750
-euroeval/task_group_utils/sequence_classification.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=ZIXcYo6ins9VUv8TT4aupWrfUQoWGBlgU8a1hYATOYM,17249
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
-euroeval/task_group_utils/token_classification.py,sha256=
-euroeval-16.0.
-euroeval-16.0.
-euroeval-16.0.
-euroeval-16.0.
-euroeval-16.0.
+euroeval/task_group_utils/token_classification.py,sha256=sNl0rhkXI9g5zKsJujrWX-9jWbYYK2iaKA1AcUg0xW4,17118
+euroeval-16.0.1.dist-info/METADATA,sha256=toyIiyjwyl4Oty2YsD-P6r95hN0Si3BkBNBMOfmiwBA,13729
+euroeval-16.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.0.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.0.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.0.1.dist-info/RECORD,,
File without changes
File without changes
File without changes