EuroEval 15.4.0__py3-none-any.whl → 15.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

euroeval/generation.py CHANGED
@@ -20,7 +20,12 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
-    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+    from .data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
 
 logger = logging.getLogger("euroeval")
 
@@ -163,6 +168,7 @@ def generate_single_iteration(
     if benchmark_config.debug:
         debug_log(
             batch=batch,
+            model_output=model_output,
             extracted_labels=extracted_labels,  # type: ignore[arg-type]
             dataset_config=dataset_config,
         )
@@ -217,6 +223,7 @@ def generate_single_iteration(
 
 def debug_log(
     batch: dict[str, t.Any],
+    model_output: "GenerativeModelOutput",
     extracted_labels: list[dict | str | list[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
@@ -225,6 +232,8 @@ def debug_log(
     Args:
         batch:
             The batch of examples to evaluate on.
+        model_output:
+            The output of the model.
         extracted_labels:
             The extracted labels from the model output.
         dataset_config:
@@ -290,7 +299,12 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-    for input_text, prediction, label in zip(input_texts, extracted_labels, labels):
+    for input_text, raw_output, prediction, label in zip(
+        input_texts, model_output.sequences, extracted_labels, labels
+    ):
         logger.info(
-            f"Input: '{input_text}'\nPrediction: '{prediction}'\nLabel: '{label}'"
+            f"Input: '{input_text}'\n"
+            f"Raw outout: '{raw_output}'\n"
+            f"Prediction: '{prediction}'\n"
+            f"Label: '{label}'"
         )
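
The change above threads the raw model generations into debug_log, so debug runs log each example's raw output next to the extracted prediction and the gold label. Below is a minimal, self-contained sketch of the resulting logging loop; the simplified GenerativeModelOutput stand-in and the sample data are made up for illustration and are not taken from the package.

import logging
from dataclasses import dataclass, field

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("euroeval")


@dataclass
class GenerativeModelOutput:
    """Simplified stand-in for euroeval.data_models.GenerativeModelOutput."""

    sequences: list[str] = field(default_factory=list)


# Made-up sample data, for illustration only
input_texts = ["Review: 'Great film, would watch again.'"]
model_output = GenerativeModelOutput(sequences=["positive, I think"])
extracted_labels = ["positive"]
labels = ["positive"]

# Mirrors the new debug_log loop: the raw generation is logged next to the
# extracted prediction and the gold label
for input_text, raw_output, prediction, label in zip(
    input_texts, model_output.sequences, extracted_labels, labels
):
    logger.info(
        f"Input: '{input_text}'\n"
        f"Raw output: '{raw_output}'\n"
        f"Prediction: '{prediction}'\n"
        f"Label: '{label}'"
    )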

euroeval/task_utils/sequence_classification.py CHANGED
@@ -162,9 +162,8 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [
-        english2local[lbl].lower() for lbl in english_labels
-    ] + english_labels
+    local_labels = [english2local[lbl].lower() for lbl in english_labels]
+    candidate_labels = local_labels + english_labels
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -179,18 +178,39 @@ def get_closest_logprobs_labels(
         ]
         generated_labels = [label for label in generated_labels if label != ""]
 
-        # We want to use the first generated label which starts with a candidate
+        # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
+        previously_generated_labels: list[str] = list()
         for generated_label in generated_labels:
+            generated_label = "".join(previously_generated_labels) + generated_label
+
+            # Get the candidate labels that contain the generated label
             candidate_output_labels = [
                 candidate_label
                 for candidate_label in candidate_labels
-                if candidate_label.startswith(generated_label)
+                if generated_label in candidate_label
             ]
+
+            # If we can uniquely determine the output label, we break the loop.
+            # Since we have both the original local labels as well as the English
+            # versions, we want to have 0 or 1 candidate labels from each set. This
+            # means that ["positive", "positiv"] is fine as they're both referencing
+            # the same label, but ["negativ", "neutral"] is not. In the bad case we
+            # cannot use the scores and we fall back to using the
+            # candidate label with the highest edit distance.
+            at_most_one_english_label = (
+                len(set(candidate_output_labels).intersection(english_labels)) <= 1
+            )
+            at_most_one_local_label = (
+                len(set(candidate_output_labels).intersection(local_labels)) <= 1
+            )
             if candidate_output_labels:
-                output_label = candidate_output_labels[0]
-                break
+                if at_most_one_english_label and at_most_one_local_label:
+                    output_label = candidate_output_labels[0]
+                    break
+                else:
+                    previously_generated_labels.append(generated_label)
 
         if output_label is not None:
             output_label = english2local.get(output_label, output_label)
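
The reworked extraction accumulates generated fragments across iterations and only accepts a match when it points to at most one English and at most one local label, instead of taking the first candidate that merely starts with the fragment. Below is a rough, self-contained sketch of that matching idea; the label sets and the token stream are made-up examples, and the real function additionally works on logprob tuples and, per its comments, falls back to an edit-distance match when no unique candidate is found.

# Hypothetical label sets, for illustration only (not taken from a real dataset config)
english_labels = ["positive", "negative", "neutral"]
local_labels = ["positiv", "negativ", "neutral"]
candidate_labels = local_labels + english_labels

# Made-up generated fragments: a bare "n" is ambiguous, "n" + "eg" is not
generated_labels = ["n", "eg"]

output_label: str | None = None
previously_generated_labels: list[str] = []
for generated_label in generated_labels:
    # Accumulate earlier ambiguous fragments before matching again
    generated_label = "".join(previously_generated_labels) + generated_label

    # Candidate labels that contain the accumulated fragment
    candidate_output_labels = [
        candidate for candidate in candidate_labels if generated_label in candidate
    ]

    # Accept only if the fragment picks at most one label from each set
    at_most_one_english = len(set(candidate_output_labels) & set(english_labels)) <= 1
    at_most_one_local = len(set(candidate_output_labels) & set(local_labels)) <= 1
    if candidate_output_labels:
        if at_most_one_english and at_most_one_local:
            output_label = candidate_output_labels[0]
            break
        previously_generated_labels.append(generated_label)

print(output_label)  # "negativ": "n" alone matched several labels, "neg" is unique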

euroeval-15.4.0.dist-info/METADATA → euroeval-15.4.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test

euroeval-15.4.0.dist-info/RECORD → euroeval-15.4.1.dist-info/RECORD RENAMED
@@ -10,7 +10,7 @@ euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,9
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
-euroeval/generation.py,sha256=UZ9nmKl4rbNBhW41iwpgw_tqfsEfe1UhOnjGudz9GWs,10382
+euroeval/generation.py,sha256=dohSPYc4eASm5tJhNKfBlpJnellKG7nVeyx8yXXxMlE,10721
 euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
 euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
 euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
@@ -30,11 +30,11 @@ euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3c
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
+euroeval/task_utils/sequence_classification.py,sha256=bIsbAj123hEyW40QeSUW8Dpc2SyI3ZPCGexapr9qqjw,9826
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.0.dist-info/METADATA,sha256=HfNWsANdb8TJAyK__QPBhs7O5qsQp9G_gPlhVVNuK9c,10724
-euroeval-15.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.4.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.4.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.4.0.dist-info/RECORD,,
+euroeval-15.4.1.dist-info/METADATA,sha256=OdTP-FAbbF9vUV3OTeV5Y-B6P7FXN2bAalG903ny8hU,10740
+euroeval-15.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.1.dist-info/RECORD,,