EuroEval 15.4.0__py3-none-any.whl → 15.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/generation.py +17 -3
- euroeval/task_utils/sequence_classification.py +27 -7
- {euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/METADATA +3 -3
- {euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/RECORD +7 -7
- {euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/WHEEL +0 -0
- {euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/licenses/LICENSE +0 -0
euroeval/generation.py
CHANGED
@@ -20,7 +20,12 @@ from .model_cache import (
 from .utils import clear_memory

 if t.TYPE_CHECKING:
-    from .data_models import
+    from .data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )

 logger = logging.getLogger("euroeval")

@@ -163,6 +168,7 @@ def generate_single_iteration(
     if benchmark_config.debug:
         debug_log(
             batch=batch,
+            model_output=model_output,
             extracted_labels=extracted_labels,  # type: ignore[arg-type]
             dataset_config=dataset_config,
         )
@@ -217,6 +223,7 @@ def generate_single_iteration(

 def debug_log(
     batch: dict[str, t.Any],
+    model_output: "GenerativeModelOutput",
     extracted_labels: list[dict | str | list[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
@@ -225,6 +232,8 @@ def debug_log(
     Args:
         batch:
             The batch of examples to evaluate on.
+        model_output:
+            The output of the model.
         extracted_labels:
             The extracted labels from the model output.
         dataset_config:
@@ -290,7 +299,12 @@ def debug_log(
     else:
         input_texts = batch["text"]

-    for input_text, prediction, label in zip(
+    for input_text, raw_output, prediction, label in zip(
+        input_texts, model_output.sequences, extracted_labels, labels
+    ):
         logger.info(
-            f"Input: '{input_text}'\
+            f"Input: '{input_text}'\n"
+            f"Raw outout: '{raw_output}'\n"
+            f"Prediction: '{prediction}'\n"
+            f"Label: '{label}'"
         )
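The generation.py change threads the raw GenerativeModelOutput into debug_log, so debug mode now logs the raw model output next to the input, the extracted prediction and the gold label. A minimal sketch of the resulting log shape, using a hypothetical stand-in dataclass rather than EuroEval's actual GenerativeModelOutput:

import logging
from dataclasses import dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("euroeval")


@dataclass
class FakeGenerativeModelOutput:
    """Hypothetical stand-in with the `sequences` attribute the diff relies on."""

    sequences: list[str]


batch = {"text": ["The film was a delight."]}
model_output = FakeGenerativeModelOutput(sequences=["positive, I would say"])
extracted_labels = ["positive"]
labels = ["positive"]

# Mirrors the updated loop in debug_log: the raw model output is now logged
# alongside the input, the extracted prediction and the gold label.
for input_text, raw_output, prediction, label in zip(
    batch["text"], model_output.sequences, extracted_labels, labels
):
    logger.info(
        f"Input: '{input_text}'\n"
        f"Raw output: '{raw_output}'\n"
        f"Prediction: '{prediction}'\n"
        f"Label: '{label}'"
    )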
euroeval/task_utils/sequence_classification.py
CHANGED
@@ -162,9 +162,8 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-
-
-    ] + english_labels
+    local_labels = [english2local[lbl].lower() for lbl in english_labels]
+    candidate_labels = local_labels + english_labels

     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -179,18 +178,39 @@ def get_closest_logprobs_labels(
         ]
         generated_labels = [label for label in generated_labels if label != ""]

-        # We want to use the first generated label which
+        # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
+        previously_generated_labels: list[str] = list()
         for generated_label in generated_labels:
+            generated_label = "".join(previously_generated_labels) + generated_label
+
+            # Get the candidate labels that contain the generated label
             candidate_output_labels = [
                 candidate_label
                 for candidate_label in candidate_labels
-                if candidate_label
+                if generated_label in candidate_label
             ]
+
+            # If we can uniquely determine the output label, we break the loop.
+            # Since we have both the original local labels as well as the English
+            # versions, we want to have 0 or 1 candidate labels from each set. This
+            # means that ["positive", "positiv"] is fine as they're both referencing
+            # the same label, but ["negativ", "neutral"] is not. In the bad case we
+            # cannot use the scores and we fall back to using the
+            # candidate label with the highest edit distance.
+            at_most_one_english_label = (
+                len(set(candidate_output_labels).intersection(english_labels)) <= 1
+            )
+            at_most_one_local_label = (
+                len(set(candidate_output_labels).intersection(local_labels)) <= 1
+            )
             if candidate_output_labels:
-
-
+                if at_most_one_english_label and at_most_one_local_label:
+                    output_label = candidate_output_labels[0]
+                    break
+                else:
+                    previously_generated_labels.append(generated_label)

         if output_label is not None:
             output_label = english2local.get(output_label, output_label)
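The sequence_classification.py change accumulates partially generated label fragments and only accepts a match once it points to at most one English and at most one local candidate label. A standalone sketch of that disambiguation rule, using assumed Danish-style sentiment labels and plain lists in place of EuroEval's config objects:

# Standalone sketch of the disambiguation rule from the diff, with assumed
# label sets (Danish-style sentiment labels used purely as an example).
english_labels = ["positive", "negative", "neutral"]
english2local = {"positive": "positiv", "negative": "negativ", "neutral": "neutral"}
local_labels = [english2local[lbl].lower() for lbl in english_labels]
candidate_labels = local_labels + english_labels


def pick_label(generated_labels: list[str]) -> str | None:
    """Return the first uniquely determined candidate label, if any."""
    output_label: str | None = None
    previously_generated_labels: list[str] = []
    for generated_label in generated_labels:
        # Re-join earlier fragments so partial tokens accumulate, as in the diff.
        generated_label = "".join(previously_generated_labels) + generated_label
        candidate_output_labels = [
            candidate for candidate in candidate_labels if generated_label in candidate
        ]
        # Accept only if at most one English and at most one local label match;
        # e.g. {"positive", "positiv"} is fine, {"negativ", "neutral"} is not.
        at_most_one_english = (
            len(set(candidate_output_labels).intersection(english_labels)) <= 1
        )
        at_most_one_local = (
            len(set(candidate_output_labels).intersection(local_labels)) <= 1
        )
        if candidate_output_labels:
            if at_most_one_english and at_most_one_local:
                output_label = candidate_output_labels[0]
                break
            previously_generated_labels.append(generated_label)
    return output_label


print(pick_label(["n"]))        # None: "n" matches negativ, negative and neutral
print(pick_label(["n", "eg"]))  # "negativ": the accumulated "neg" is unambiguous

With these labels, a lone "n" is ambiguous and is carried over to the next fragment, while the accumulated "neg" resolves to a single local/English label pair.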
{euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
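The only dependency change in METADATA tightens the vllm requirement to !=0.8.1,>=0.8.0 for the Linux-only 'all' and 'generative' extras, so the 0.8.1 release specifically is excluded. A quick way to inspect what the new specifier admits, using the packaging library (illustrative only, not part of EuroEval):

# Check which vllm versions satisfy the new specifier from the METADATA diff.
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("!=0.8.1,>=0.8.0")

for version in ["0.7.3", "0.8.0", "0.8.1", "0.8.2"]:
    print(version, version in spec)
# 0.7.3 False  -- below the minimum
# 0.8.0 True
# 0.8.1 False  -- explicitly excluded by the new pin
# 0.8.2 True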
{euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/RECORD
CHANGED
@@ -10,7 +10,7 @@ euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,9
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
-euroeval/generation.py,sha256=
+euroeval/generation.py,sha256=dohSPYc4eASm5tJhNKfBlpJnellKG7nVeyx8yXXxMlE,10721
 euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
 euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
 euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
@@ -30,11 +30,11 @@ euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3c
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=
+euroeval/task_utils/sequence_classification.py,sha256=bIsbAj123hEyW40QeSUW8Dpc2SyI3ZPCGexapr9qqjw,9826
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.
-euroeval-15.4.
-euroeval-15.4.
-euroeval-15.4.
-euroeval-15.4.
+euroeval-15.4.1.dist-info/METADATA,sha256=OdTP-FAbbF9vUV3OTeV5Y-B6P7FXN2bAalG903ny8hU,10740
+euroeval-15.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.1.dist-info/RECORD,,
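Each RECORD line stores a file's sha256 digest as urlsafe base64 without padding, followed by its size in bytes. A small sketch of how such an entry can be recomputed when verifying an unpacked wheel, assuming the file is available locally (the path below is illustrative):

# Recompute a RECORD-style entry for a local file (verification sketch only).
import base64
import hashlib
from pathlib import Path


def record_entry(path: Path) -> str:
    data = path.read_bytes()
    digest = hashlib.sha256(data).digest()
    # RECORD uses urlsafe base64 with the trailing '=' padding stripped.
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path.as_posix()},sha256={b64},{len(data)}"


print(record_entry(Path("euroeval/generation.py")))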
{euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/WHEEL
File without changes
{euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/entry_points.txt
File without changes
{euroeval-15.4.0.dist-info → euroeval-15.4.1.dist-info}/licenses/LICENSE
File without changes