EuroEval 15.7.2-py3-none-any.whl → 15.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

euroeval/benchmark_modules/litellm.py CHANGED
@@ -32,6 +32,7 @@ from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.types.utils import ChoiceLogprobs, ModelResponse
  from pydantic import conlist, create_model
  from requests.exceptions import RequestException
+ from tqdm.asyncio import tqdm as tqdm_async
  from tqdm.auto import tqdm
  from transformers.trainer import Trainer

@@ -66,7 +67,12 @@ from ..task_group_utils (
  )
  from ..tokenization_utils import get_first_label_token_mapping
  from ..types import ExtractLabelsFunction
- from ..utils import create_model_cache_dir, log_once
+ from ..utils import (
+ catch_coroutine_exception,
+ create_model_cache_dir,
+ log_once,
+ safe_run,
+ )
  from .base import BenchmarkModule
  from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer

@@ -159,9 +165,21 @@ class LiteLLMModel(BenchmarkModule):
  """A generative model from LiteLLM."""

  fresh_model = False
- batching_preference = BatchingPreference.SINGLE_SAMPLE
+ batching_preference = BatchingPreference.ALL_AT_ONCE
  high_priority = False

+ _handleable_exceptions = (
+ BadRequestError,
+ RateLimitError,
+ APIError,
+ APIConnectionError,
+ Timeout,
+ ServiceUnavailableError,
+ InternalServerError,
+ SystemError,
+ AuthenticationError,
+ )
+
  def __init__(
  self,
  model_config: ModelConfig,
@@ -233,10 +251,7 @@ class LiteLLMModel(BenchmarkModule):
  The generated model outputs.
  """
  assert "messages" in inputs, "The input must contain a 'messages' key."
- assert len(inputs["messages"]) == 1, (
- "API models only support single-sample batching."
- )
- messages = inputs["messages"][0]
+ messages = inputs["messages"]

  generation_kwargs: dict[str, t.Any] = dict(
  model=self.model_config.model_id,
@@ -267,9 +282,20 @@ class LiteLLMModel(BenchmarkModule):
  generation_kwargs["top_logprobs"] = MAX_LOGPROBS

  if self.dataset_config.task in TASKS_USING_JSON:
- assert "json" in messages[0]["content"].lower(), (
- "Prompt must contain 'json' for JSON tasks."
- )
+ for msg_list in messages:
+ # msg_list is a list of {'role':…, 'content':…} dicts
+ if not msg_list:
+ raise InvalidBenchmark(
+ "Encountered an empty message list in 'messages'."
+ )
+ last = msg_list[-1]
+ assert isinstance(last, dict), (
+ f"Expected dict message, got {type(last)}"
+ )
+ assert "json" in last["content"].lower(), (
+ "Prompt must contain 'json' for JSON tasks."
+ )
+
  if self.generative_type == GenerativeType.REASONING:
  log_once(
  f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -321,6 +347,76 @@ class LiteLLMModel(BenchmarkModule):
  # This drops generation kwargs that are not supported by the model
  litellm.drop_params = True

+ # Extract the generated sequences from the model response. Some APIs cannot
+ # handle using newlines as stop sequences, so we try both.
+ num_attempts = 10
+
+ all_responses = {}
+ all_failures = []
+ to_run = list(enumerate(messages))
+
+ for attempt in range(num_attempts):
+ if not to_run:
+ break
+
+ batch_indices, batch_msgs = zip(*to_run)
+ model_response, failures = safe_run(
+ self._generate_async(
+ messages=list(batch_msgs),
+ generation_kwargs=generation_kwargs,
+ max_retries=3,
+ max_reruns=15,
+ )
+ )
+
+ for orig_idx, response in zip(batch_indices, model_response):
+ all_responses[orig_idx] = response
+
+ if not failures:
+ to_run = []
+ break
+
+ all_failures.extend(failures)
+ to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
+ logger.debug(
+ f"Attempt {attempt + 1}/{num_attempts}: "
+ f"retrying {len(to_run)} failed message(s)"
+ )
+
+ for _, error in failures:
+ self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+ else:
+ raise InvalidBenchmark(
+ message=f"Failed to generate text, after {num_attempts} attempts."
+ )
+
+ if to_run:
+ raise InvalidBenchmark(
+ f"Failed to generate text after {num_attempts} attempts. "
+ f"Errors: {all_failures}"
+ )
+
+ ordered_responses = [all_responses[i] for i in range(len(messages))]
+ model_output = self._create_model_output(
+ model_responses=ordered_responses, model_id=self.model_config.model_id
+ )
+
+ return model_output
+
+ def _handle_exception(
+ self, error: Exception, generation_kwargs: dict[str, t.Any]
+ ) -> None:
+ """Handle an exception from the model.
+
+ Args:
+ error:
+ The exception to handle.
+ generation_kwargs:
+ The generation kwargs to pass to the model.
+ """
+ error_msg = str(error).lower()
+ model_id = self.model_config.model_id
+
  # Error messages that we want to catch and handle
  stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
  logprobs_messages = [
@@ -341,153 +437,238 @@ class LiteLLMModel(BenchmarkModule):
  max_items_messages = ["'maxItems' is not permitted."]
  no_json_schema_messages = ["Property keys should match pattern"]

- # Extract the generated sequences from the model response. Some APIs cannot
- # handle using newlines as stop sequences, so we try both.
- num_attempts = 10
- for _ in range(num_attempts):
- try:
- model_response = litellm.completion_with_retries(
- messages=messages, **generation_kwargs
- )
- break
- except (BadRequestError, RateLimitError) as e:
- if any(msg.lower() in str(e).lower() for msg in stop_messages):
- log_once(
- f"The model {self.model_config.model_id!r} does not support "
- "stop sequences, so disabling them.",
- level=logging.DEBUG,
- )
- generation_kwargs["stop"] = None
- elif (
- any(msg.lower() in str(e).lower() for msg in logprobs_messages)
- # Special case for Vertex AI models, since they have strict rate
- # limits on using logprobs. They also have a cap of 5 logprobs, but
- # we ignore this since the rate limiting makes it unusable anyway.
- or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
- ):
- log_once(
- f"The model {self.model_config.model_id!r} does not support "
- "logprobs, so disabling it.",
- level=logging.DEBUG,
- )
- generation_kwargs.pop("logprobs")
- generation_kwargs.pop("top_logprobs")
- elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
- log_once(
- f"The model {self.model_config.model_id!r} does not support "
- "temperature, so disabling it.",
- level=logging.DEBUG,
- )
- generation_kwargs.pop("temperature")
- elif any(
- msg.lower() in str(e).lower()
- for msg in temperature_must_be_one_messages
- ):
- log_once(
- f"The model {self.model_config.model_id!r} requires "
- "temperature to be set to 1, so setting it.",
- level=logging.DEBUG,
- )
- generation_kwargs["temperature"] = 1.0
- elif any(msg.lower() in str(e).lower() for msg in max_items_messages):
- log_once(
- f"The model {self.model_config.model_id!r} does not support "
- "maxItems in the JSON schema, so disabling it.",
- level=logging.DEBUG,
- )
- ner_tag_names = list(
- self.dataset_config.prompt_label_mapping.values()
- )
- keys_and_their_types = {
- tag_name: (list[str], ...) for tag_name in ner_tag_names
- }
- pydantic_class = create_model(
- "AnswerFormat", **keys_and_their_types
- )
- generation_kwargs["response_format"] = pydantic_class
- elif any(
- msg.lower() in str(e).lower() for msg in no_json_schema_messages
- ):
- log_once(
- f"The model {self.model_config.model_id!r} does not support "
- "JSON schemas, so using the vanilla JSON format.",
- level=logging.DEBUG,
- )
- generation_kwargs["response_format"] = dict(type="json_object")
- elif isinstance(e, RateLimitError):
- raise InvalidModel(
- "You have encountered your rate limit for model "
- f"{self.model_config.model_id!r}. Skipping."
- )
- else:
- raise InvalidBenchmark(
- f"Failed to generate text. The error message was: {e}"
- )
- except APIError as e:
- raise InvalidBenchmark(
- f"Failed to generate text. The error message was: {e}"
- )
- except (
+ if any(msg.lower() in error_msg for msg in stop_messages):
+ log_once(
+ f"The model {model_id!r} does not support "
+ "stop sequences, so disabling them.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["stop"] = None
+ return
+ elif (
+ any(msg.lower() in error_msg for msg in logprobs_messages)
+ # Special case for Vertex AI models, since they have strict rate
+ # limits on using logprobs. They also have a cap of 5 logprobs, but
+ # we ignore this since the rate limiting makes it unusable anyway.
+ or (isinstance(error, VertexAIError) and "logprobs" in error_msg)
+ ):
+ log_once(
+ f"The model {model_id!r} does not support logprobs, so disabling it.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs.pop("logprobs")
+ generation_kwargs.pop("top_logprobs")
+ return
+ elif any(msg.lower() in error_msg for msg in temperature_messages):
+ log_once(
+ f"The model {model_id!r} does not support "
+ "temperature, so disabling it.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs.pop("temperature")
+ return
+ elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
+ log_once(
+ f"The model {model_id!r} requires "
+ "temperature to be set to 1, so setting it.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["temperature"] = 1.0
+ return
+ elif any(msg.lower() in error_msg for msg in max_items_messages):
+ log_once(
+ f"The model {model_id!r} does not support "
+ "maxItems in the JSON schema, so disabling it.",
+ level=logging.DEBUG,
+ )
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types = {
+ tag_name: (list[str], ...) for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ generation_kwargs["response_format"] = pydantic_class
+ return
+ elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "JSON schemas, so using the vanilla JSON format.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["response_format"] = dict(type="json_object")
+ return
+ elif isinstance(
+ error,
+ (
  APIConnectionError,
  Timeout,
  ServiceUnavailableError,
  InternalServerError,
  SystemError,
- ) as e:
- logger.debug(
- f"Service temporarily unavailable. The error message was: {e}. "
- f"Retrying in 5 seconds..."
+ ),
+ ):
+ logger.debug(
+ f"Service temporarily unavailable. The error message was: {error}. "
+ f"Retrying in 5 seconds..."
+ )
+ sleep(5)
+ return
+
+ if isinstance(error, RateLimitError):
+ raise InvalidModel(
+ f"You have encountered your rate limit for model {model_id!r}. "
+ "Skipping."
+ )
+
+ if isinstance(error, AuthenticationError):
+ raise NeedsAdditionalArgument(
+ cli_argument="--api-key",
+ script_argument="api_key=<your-api-key>",
+ run_with_cli=self.benchmark_config.run_with_cli,
+ )
+
+ raise InvalidBenchmark(
+ f"Failed to generate text. The error message was: {error}"
+ )
+
+ async def _generate_async(
+ self,
+ messages: list[dict[str, t.Any]],
+ generation_kwargs: dict[str, t.Any],
+ max_retries: int,
+ max_reruns: int,
+ ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
+ """Generate outputs from the model asynchronously.
+
+ Args:
+ messages:
+ The messages to pass to the model.
+ generation_kwargs:
+ The generation kwargs to pass to the model.
+ max_retries:
+ The maximum number of retries to make.
+ max_reruns:
+ The maximum number of reruns to make.
+
+ Returns:
+ A tuple containing the successful responses and the failed responses.
+ """
+ success = []
+ all_failures = {}
+ to_run = list(enumerate(messages))
+ prev_fail_count = len(to_run)
+ rerun_count = 0
+
+ while to_run and rerun_count < max_reruns and prev_fail_count > 0:
+ requests = [
+ litellm.acompletion(
+ messages=msg, max_retries=max_retries, **generation_kwargs
  )
- sleep(5)
- except AuthenticationError:
- raise NeedsAdditionalArgument(
- cli_argument="--api-key",
- script_argument="api_key=<your-api-key>",
- run_with_cli=self.benchmark_config.run_with_cli,
+ for _, msg in to_run
+ ]
+ wrapped_requests = [
+ catch_coroutine_exception(request) for request in requests
+ ]
+ responses = await tqdm_async.gather(*wrapped_requests, leave=False)
+
+ next_to_run = []
+ current_fail_count = 0
+
+ for (orig_idx, _), response in zip(to_run, responses):
+ if isinstance(response, Exception):
+ current_fail_count += 1
+ all_failures[orig_idx] = response
+ next_to_run.append((orig_idx, messages[orig_idx]))
+ else:
+ success.append(response)
+
+ if current_fail_count >= prev_fail_count:
+ logger.warning(
+ "Retry loop aborting due to no progress: "
+ f"current_fail_count={current_fail_count}, "
+ f"prev_fail_count={prev_fail_count}"
  )
- else:
- raise InvalidBenchmark(
- message=f"Failed to generate text, after {num_attempts} attempts."
- )
+ break
+
+ prev_fail_count = current_fail_count
+ to_run = next_to_run
+ rerun_count += 1
+
+ failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
+ return success, failures
+
+ @staticmethod
+ def _create_model_output(
+ model_responses: list[ModelResponse], model_id: str
+ ) -> GenerativeModelOutput:
+ """Create a GenerativeModelOutput object from a list of ModelResponse objects.
+
+ Args:
+ model_responses:
+ The list of ModelResponse objects to create the GenerativeModelOutput
+ object from.
+ model_id:
+ The ID of the model.
+
+ Returns:
+ A GenerativeModelOutput object.
+ """
+ sequences = []
+ scores = []
+ for model_response in model_responses:
+ if not model_response.choices:
+ # This happens for reasoning models, when they don't finish thinking
+ # and run out of tokens. Happens quite rarely, but we need to handle it.
+ logger.warning(
+ f"The model {model_id!r} did not end up "
+ "generating any text. This is likely because the model ran "
+ "out of tokens while reasoning. Returning an empty string."
+ )
+ continue
+
+ model_response_choices = model_response.choices[0]
+ assert isinstance(model_response_choices, litellm.Choices)
+ generated_message: litellm.Message = model_response_choices.message
+ generation_output = generated_message.content or ""
+ generation_output = generation_output.strip()
+
+ # Structure the model output as a GenerativeModelOutput object
+ sequences.append(generation_output)
+ if hasattr(model_response_choices, "logprobs"):
+ logprobs_obj = model_response_choices.logprobs
+ if isinstance(logprobs_obj, ChoiceLogprobs):
+ logprobs_list: list[list[tuple[str, float]]] = [
+ [
+ (top_logprob.token, top_logprob.logprob)
+ for top_logprob in content.top_logprobs
+ ]
+ for content in model_response_choices.logprobs.content or list()
+ ]
+ scores.append(logprobs_list)
+ else:
+ log_once(
+ "The logprobs object is malformed, so we won't use logprobs to "
+ "determine the labels.",
+ level=logging.WARNING,
+ )

- assert isinstance(model_response, ModelResponse)
- if not model_response.choices:
- # This happens for reasoning models, when they don't finish thinking and run
- # out of tokens. Happens quite rarely, but we need to handle it.
+ if not sequences:
  logger.warning(
- f"The model {self.model_config.model_id!r} did not end up generating "
- "any text. This is likely because the model ran out of tokens while "
- "reasoning. Returning an empty string."
+ "No sequences were generated by the model "
+ f"{model_id!r}. This may be due to the "
+ "model running out of tokens or an issue with the input data. "
+ "Returning an empty GenerativeModelOutput."
  )
- return GenerativeModelOutput(sequences=[""])
-
- model_response_choices = model_response.choices[0]
- assert isinstance(model_response_choices, litellm.Choices)
- generated_message: litellm.Message = model_response_choices.message
- generation_output = generated_message.content or ""
- generation_output = generation_output.strip()
-
- # Structure the model output as a GenerativeModelOutput object
- model_output = GenerativeModelOutput(sequences=[generation_output])
- if hasattr(model_response_choices, "logprobs"):
- logprobs_obj = model_response_choices.logprobs
- if isinstance(logprobs_obj, ChoiceLogprobs):
- logprobs_list: list[list[tuple[str, float]]] = [
- [
- (top_logprob.token, top_logprob.logprob)
- for top_logprob in content.top_logprobs
- ]
- for content in model_response_choices.logprobs.content or list()
- ]
- model_output.scores = [logprobs_list]
- else:
- log_once(
- "The logprobs object is malformed, so we won't use logprobs to "
- "determine the labels.",
- level=logging.WARNING,
- )
+ return GenerativeModelOutput(sequences=[], scores=None)

- return model_output
+ if scores and len(sequences) != len(scores):
+ raise InvalidBenchmark(
+ "Sequences and scores must have the same length. "
+ f"Got {len(sequences)} sequences and {len(scores)} scores."
+ )
+
+ return GenerativeModelOutput(
+ sequences=sequences, scores=scores if scores else None
+ )

  @cached_property
  def num_params(self) -> int:
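
Note: the retry orchestration added above (an outer attempt loop that re-submits only the failed messages, with per-message exceptions captured instead of raised) can be illustrated with a small self-contained sketch. The fake_completion coroutine and the local catch helper below are stand-ins for litellm.acompletion and catch_coroutine_exception, and plain asyncio.gather stands in for tqdm_async.gather; none of these names come from the package itself.

import asyncio
import random

async def fake_completion(idx: int) -> str:
    """Stand-in for litellm.acompletion; fails randomly to exercise the retry path."""
    if random.random() < 0.3:
        raise RuntimeError(f"transient failure for request {idx}")
    return f"response-{idx}"

async def catch(coroutine):
    """Return exceptions as values, mirroring the catch_coroutine_exception helper."""
    try:
        return await coroutine
    except Exception as exc:
        return exc

async def generate_all(n_requests: int, max_reruns: int = 5) -> tuple[dict[int, str], list[int]]:
    """Gather all requests, then re-run only the indices that failed."""
    results: dict[int, str] = {}
    to_run = list(range(n_requests))
    for _ in range(max_reruns):
        if not to_run:
            break
        responses = await asyncio.gather(*(catch(fake_completion(i)) for i in to_run))
        failed = []
        for idx, response in zip(to_run, responses):
            if isinstance(response, Exception):
                failed.append(idx)
            else:
                results[idx] = response
        to_run = failed
    return results, to_run  # `to_run` now holds the indices that never succeeded

print(asyncio.run(generate_all(8)))
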
euroeval/benchmarker.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
  from shutil import rmtree
  from time import sleep

+ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
  from torch.distributed import destroy_process_group

  from .benchmark_config_factory import build_benchmark_config
@@ -27,7 +28,7 @@ from .model_loading import load_model
  from .scores import log_scores
  from .speed_benchmark import benchmark_speed
  from .tasks import SPEED
- from .utils import enforce_reproducibility
+ from .utils import enforce_reproducibility, get_package_version

  if t.TYPE_CHECKING:
  from .benchmark_modules import BenchmarkModule
@@ -164,6 +165,15 @@ class Benchmarker:
  if task is not None and dataset is not None:
  raise ValueError("Only one of `task` and `dataset` can be specified.")

+ # Bail early if hf_transfer is enabled but not installed.
+ if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
+ raise ImportError(
+ "Fast download using 'hf_transfer' is enabled "
+ "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
+ "package is not available in your environment. "
+ "Try installing it with `pip install hf_transfer`."
+ )
+
  self.benchmark_config_default_params = BenchmarkConfigParams(
  progress_bar=progress_bar,
  save_results=save_results,
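
Note: the early hf_transfer check added to Benchmarker.__init__ can be reproduced outside EuroEval with only huggingface_hub and importlib.metadata; a rough sketch under that assumption (the helper name is made up for illustration):

import importlib.metadata

from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER

def ensure_hf_transfer_available() -> None:
    """Fail fast when fast downloads are requested but the backend is missing."""
    if not HF_HUB_ENABLE_HF_TRANSFER:
        return
    try:
        importlib.metadata.version("hf_transfer")
    except importlib.metadata.PackageNotFoundError as exc:
        raise ImportError(
            "HF_HUB_ENABLE_HF_TRANSFER=1 but 'hf_transfer' is not installed; "
            "install it with `pip install hf_transfer`."
        ) from exc

ensure_hf_transfer_available()
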
euroeval/data_models.py CHANGED
@@ -529,12 +529,16 @@ class DatasetConfig:
  else:
  sep_word = main_language.or_separator

+ local_labels: list[str] = []
+ for label in self.labels:
+ if label not in self.prompt_label_mapping:
+ continue
+ local_label = self.prompt_label_mapping[label]
+ if local_label not in local_labels:
+ local_labels.append(local_label)
+
  # Convert labels to single-quoted labels - and remove duplicates
- quoted_labels = [
- f"'{self.prompt_label_mapping[label]}'"
- for label in set(self.labels)
- if label in self.prompt_label_mapping
- ]
+ quoted_labels = [f"'{label}'" for label in local_labels]

  if not quoted_labels:
  return ""
euroeval/dataset_configs/english.py CHANGED
@@ -79,7 +79,7 @@ ARC_CONFIG = DatasetConfig(
  )

  BELEBELE_CONFIG = DatasetConfig(
- name="belebele",
+ name="belebele-en",
  pretty_name="the English multiple choice reading comprehension dataset BeleBele",
  huggingface_id="EuroEval/belebele-mini",
  task=MCRC,
euroeval/dataset_configs/finnish.py CHANGED
@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import FI
- from ..tasks import LA, NER, RC, SENT, SUMM
+ from ..tasks import COMMON_SENSE, LA, MCRC, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -40,16 +40,14 @@ XLSUM_FI_CONFIG = DatasetConfig(
  languages=[FI],
  )

- # TODO: Include when this issue has been resolved:
- # https://github.com/EuroEval/EuroEval/issues/158#issuecomment-2846664885
- # HELLASWAG_FI_CONFIG = DatasetConfig(
- # name="hellaswag-fi",
- # pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
- # "HellaSwag-fi, translated from the English HellaSwag dataset",
- # huggingface_id="EuroEval/hellaswag-fi-mini",
- # task=COMMON_SENSE,
- # languages=[FI],
- # )
+ HELLASWAG_FI_CONFIG = DatasetConfig(
+ name="hellaswag-fi",
+ pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+ "HellaSwag-fi, translated from the English HellaSwag dataset",
+ huggingface_id="EuroEval/hellaswag-fi-mini",
+ task=COMMON_SENSE,
+ languages=[FI],
+ )

  SCALA_FI_CONFIG = DatasetConfig(
  name="scala-fi",
@@ -60,3 +58,13 @@ SCALA_FI_CONFIG = DatasetConfig(
  )

  ### Unofficial datasets ###
+
+ BELEBELE_FI_CONFIG = DatasetConfig(
+ name="belebele-fi",
+ pretty_name="the Finnish multiple choice reading comprehension dataset "
+ "BeleBele-fi, translated from the English BeleBele dataset",
+ huggingface_id="EuroEval/belebele-fi-mini",
+ task=MCRC,
+ languages=[FI],
+ unofficial=True,
+ )
euroeval/dataset_configs/italian.py CHANGED
@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import IT
- from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -79,3 +79,13 @@ WIKINEURAL_IT_CONFIG = DatasetConfig(
  languages=[IT],
  unofficial=True,
  )
+
+ BELEBELE_IT_CONFIG = DatasetConfig(
+ name="belebele-it",
+ pretty_name="the Italian multiple choice reading comprehension dataset "
+ "BeleBele-it, translated from the English BeleBele dataset",
+ huggingface_id="EuroEval/belebele-it-mini",
+ task=MCRC,
+ languages=[IT],
+ unofficial=True,
+ )
euroeval/dataset_configs/spanish.py CHANGED
@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import ES
- from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -76,3 +76,13 @@ XQUAD_ES_CONFIG = DatasetConfig(
  languages=[ES],
  unofficial=True,
  )
+
+ BELEBELE_ES_CONFIG = DatasetConfig(
+ name="belebele-es",
+ pretty_name="the Spanish multiple choice reading comprehension dataset "
+ "BeleBele-es, translated from the English BeleBele dataset",
+ huggingface_id="EuroEval/belebele-es-mini",
+ task=MCRC,
+ languages=[ES],
+ unofficial=True,
+ )
euroeval/finetuning.py CHANGED
@@ -103,7 +103,6 @@ def finetune(
  itr_scores = finetune_single_iteration(
  model=model if model_already_initialized else None,
  dataset=datasets[idx],
- iteration_idx=idx,
  training_args=training_args,
  model_config=model_config,
  dataset_config=dataset_config,
@@ -158,7 +157,6 @@
  def finetune_single_iteration(
  model: BenchmarkModule | None,
  dataset: DatasetDict,
- iteration_idx: int,
  training_args: TrainingArguments,
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
@@ -171,8 +169,6 @@ def finetune_single_iteration(
  The model to use in the benchmark. If None then a new model will be loaded.
  dataset:
  The dataset to use for training and evaluation.
- iteration_idx:
- The index of the iteration.
  training_args:
  The training arguments.
  model_config:
@@ -213,41 +209,42 @@

  trainer.log = no_logging

- # Re-block terminal output, as it gets unblocked by the `transformers`
- # package before training
+ # Re-block terminal output, as it gets unblocked by the `transformers` package
+ # before training
  block_terminal_output()

- # Sort out callbacks. We remove the callbacks that are producing unnecessary
- # output, to avoid cluttering the terminal output
+ # Sort out callbacks. We remove the callbacks that are producing unnecessary output,
+ # to avoid cluttering the terminal output
  if not benchmark_config.verbose:
  trainer.remove_callback(PrinterCallback)
  trainer.remove_callback(ProgressCallback)
  if benchmark_config.progress_bar:
  trainer.add_callback(NeverLeaveProgressCallback)

- try:
- trainer.train()
- with torch.inference_mode():
- try:
- test_scores = trainer.evaluate(
- eval_dataset=dataset["test"],
- orig_eval_dataset=dataset["original_test"],
- metric_key_prefix="test",
- )
- except TypeError:
- test_scores = trainer.evaluate(
- eval_dataset=dataset["test"], metric_key_prefix="test"
- )
- return test_scores
-
- except NaNValueInModelOutput as e:
- del trainer
- del model
- clear_memory()
- raise e
-
- except (RuntimeError, ValueError, IndexError) as e:
- raise InvalidBenchmark(str(e))
+ # Train the model
+ trainer.train()
+
+ # Evaluate the model
+ with torch.inference_mode():
+ try:
+ test_scores = trainer.evaluate(
+ eval_dataset=dataset["test"],
+ orig_eval_dataset=dataset["original_test"],
+ metric_key_prefix="test",
+ )
+ except TypeError:
+ test_scores = trainer.evaluate(
+ eval_dataset=dataset["test"], metric_key_prefix="test"
+ )
+ except NaNValueInModelOutput as e:
+ del trainer
+ del model
+ clear_memory()
+ raise e
+ except (RuntimeError, ValueError, IndexError) as e:
+ raise InvalidBenchmark(str(e))
+
+ return test_scores


  def get_training_args(
@@ -300,6 +297,7 @@ def get_training_args(
  save_total_limit=1,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
+ eval_accumulation_steps=32,
  optim=OptimizerNames.ADAMW_TORCH,
  learning_rate=2e-5,
  warmup_ratio=0.01,
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -144,9 +144,27 @@ def extract_labels_from_generation(
  )
  if labels is not None:
  return labels
- return get_closest_word_edit_labels(
- generated_sequences=model_output.sequences, dataset_config=dataset_config
- )
+
+ candidate_labels = [
+ dataset_config.prompt_label_mapping[lbl]
+ for lbl in dataset_config.id2label.values()
+ ]
+ new_predicted_labels: list[str] = list()
+ for predicted_label in model_output.sequences:
+ # If the prediction includes a boxed answer, use that instead of the full
+ # generation
+ if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
+ predicted_label = m.group(1)
+
+ # Pick the label with the smallest word edit distance to the predicted label
+ edit_distances = [
+ Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
+ for candidate_label in candidate_labels
+ ]
+ predicted_label = candidate_labels[np.argmin(edit_distances).item()]
+ new_predicted_labels.append(predicted_label)
+
+ return new_predicted_labels


  def get_closest_logprobs_labels(
@@ -305,32 +323,3 @@

  assert len(output_labels) == len(generation_logprobs)
  return output_labels
-
-
- def get_closest_word_edit_labels(
- generated_sequences: list[str], dataset_config: "DatasetConfig"
- ) -> list[str]:
- """Get the labels with the smallest edit distance to the predicted labels.
-
- Args:
- generated_sequences:
- The generated sequences from the model.
- dataset_config:
- The configuration of the dataset.
-
- Returns:
- The candidate labels with the smallest edit distance to the predicted labels.
- """
- candidate_labels = [
- dataset_config.prompt_label_mapping[lbl]
- for lbl in dataset_config.id2label.values()
- ]
- new_predicted_labels: list[str] = list()
- for predicted_label in generated_sequences:
- edit_distances = [
- Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
- for candidate_label in candidate_labels
- ]
- closest_label = candidate_labels[np.argmin(edit_distances).item()]
- new_predicted_labels.append(closest_label)
- return new_predicted_labels
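
Note: the label-extraction change above folds the removed get_closest_word_edit_labels helper into extract_labels_from_generation and first checks for a boxed answer before edit-distance matching. A minimal sketch of that matching step, assuming the Levenshtein and numpy packages are installed and using a made-up candidate list and function name:

import re

import Levenshtein
import numpy as np

def closest_label(generation: str, candidate_labels: list[str]) -> str:
    """Map a free-form generation onto the candidate label with the smallest edit distance."""
    # Prefer a boxed answer such as "\boxed{positive}" over the full generation.
    if (m := re.search(r"boxed\{(.*?)\}", generation)) is not None:
        generation = m.group(1)
    distances = [
        Levenshtein.distance(generation.lower(), label.lower())
        for label in candidate_labels
    ]
    return candidate_labels[int(np.argmin(distances))]

print(closest_label(r"The answer is \boxed{positive}", ["positive", "negative", "neutral"]))
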
euroeval/tokenization_utils.py CHANGED
@@ -169,7 +169,7 @@ def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:

  vocab: dict[str, int] = tokenizer.get_vocab()

- candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
+ candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
  for candidate_bos_token in candidate_bos_tokens:
  if candidate_bos_token in vocab:
  bos_token = candidate_bos_token
@@ -200,7 +200,7 @@ def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:

  vocab: dict[str, int] = tokenizer.get_vocab()

- candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
+ candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
  for candidate_eos_token in candidate_eos_tokens:
  if candidate_eos_token in vocab:
  eos_token = candidate_eos_token
euroeval/utils.py CHANGED
@@ -1,5 +1,6 @@
  """Utility functions to be used in other scripts."""

+ import asyncio
  import gc
  import importlib
  import importlib.metadata
@@ -327,3 +328,43 @@ def get_package_version(package_name: str) -> str | None:
  return importlib.metadata.version(package_name)
  except importlib.metadata.PackageNotFoundError:
  return None
+
+
+ T = t.TypeVar("T", bound=object)
+
+
+ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
+ """Run a coroutine, ensuring that the event loop is always closed when we're done.
+
+ Args:
+ coroutine:
+ The coroutine to run.
+
+ Returns:
+ The result of the coroutine.
+ """
+ loop = asyncio.new_event_loop()
+ try:
+ asyncio.set_event_loop(loop)
+ return loop.run_until_complete(coroutine)
+ finally:
+ loop.close()
+ asyncio.set_event_loop(None)
+
+
+ async def catch_coroutine_exception(
+ coroutine: t.Coroutine[t.Any, t.Any, T],
+ ) -> T | Exception:
+ """Run a coroutine, catching any exceptions and returning them.
+
+ Args:
+ coroutine:
+ The coroutine to run.
+
+ Returns:
+ The result of the coroutine, or the exception if it was raised.
+ """
+ try:
+ return await coroutine
+ except Exception as exc:
+ return exc
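
Note: a small usage sketch of the two helpers added to euroeval/utils.py, assuming EuroEval 15.8.1 is installed; the flaky coroutine is hypothetical and exists only to show that failures come back as values rather than aborting the gather.

import asyncio

from euroeval.utils import catch_coroutine_exception, safe_run

async def flaky(n: int) -> int:
    """Hypothetical coroutine: fails on odd inputs."""
    if n % 2:
        raise ValueError(f"odd input: {n}")
    return n * 10

async def gather_with_caught_errors() -> list:
    """Exceptions are returned in place, so one failure does not cancel the batch."""
    return await asyncio.gather(
        *(catch_coroutine_exception(flaky(n)) for n in range(4))
    )

results = safe_run(gather_with_caught_errors())
print(results)  # e.g. [0, ValueError('odd input: 1'), 20, ValueError('odd input: 3')]
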
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.7.2
+ Version: 15.8.1
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -1,14 +1,14 @@
  euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
  euroeval/benchmark_config_factory.py,sha256=RDYotoLcfNr3xU8Cw-G-Y8wLe6RSlJD1Ok9C97lWfOs,12553
- euroeval/benchmarker.py,sha256=4tCrs0CvKvQcMpJRtaonxELEDXkmY95stCGwht6wTGE,48649
+ euroeval/benchmarker.py,sha256=EHoYilZ2Xx0-6_aEBlG84MsZbomJSiHNHc4wKOVVBB8,49199
  euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
  euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
  euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
- euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
+ euroeval/data_models.py,sha256=59ca6gxmHwMdeIIU6f-gGXOVIXXDQPOt7m5nCXHK86E,23166
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
- euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
+ euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
  euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
  euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
  euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
@@ -19,27 +19,27 @@ euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,223
  euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
  euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
  euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
- euroeval/tokenization_utils.py,sha256=RYTYbzCM9cryZ_w-_CzyN9Sbt47DbaGU5ukm-H38sHI,13871
+ euroeval/tokenization_utils.py,sha256=kghOIZMM3H0P9YDv0VBSNI7drzgJXlkRtMwt3Cgeev8,13907
  euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
- euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
+ euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
  euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
  euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
- euroeval/benchmark_modules/litellm.py,sha256=_32H-M1L_TfW-opyaMLJFPxx0iOG8A8Zfq7uVGFKZdA,43005
+ euroeval/benchmark_modules/litellm.py,sha256=dd7OqBvWA75zNrsEHtC3cx3rNpNJ-1QOL2arV_CqYG0,48231
  euroeval/benchmark_modules/vllm.py,sha256=DJyla0jr-DVMPPs4RBguxq1Xn5YguvyuAnIlgIOfFaw,39394
  euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
  euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
- euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
+ euroeval/dataset_configs/english.py,sha256=-N85DiNVrZFqpahNUTfxaWy4vvdOWC8Bi0G4uAO4uDw,2326
  euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
- euroeval/dataset_configs/finnish.py,sha256=lZA2bY_ul9qh3uGFrTNe7q15WyZ04EL9OYmrkcNjygY,1857
+ euroeval/dataset_configs/finnish.py,sha256=_8YWIlZNpO8Qi233bH7cKwm3tq3WETLfC_6mzg7LLog,2045
  euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
  euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
  euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
- euroeval/dataset_configs/italian.py,sha256=5yYMMBbxkfSDpLgJ9IH_pgkpzEp-74vMMvx-dT8x4WY,2345
+ euroeval/dataset_configs/italian.py,sha256=KNjCvTzsEqH_EEk3At8slKqNwWWiIdbv_t5ke7n9nZI,2660
  euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada4qwE7tw0,5181
- euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
+ euroeval/dataset_configs/spanish.py,sha256=NviL-FzJ5jq1bLTRvbtZBiGrAmZjxyijZNpKZFrnT-M,2527
  euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
  euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
@@ -51,11 +51,11 @@ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5w
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
  euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
- euroeval/task_group_utils/sequence_classification.py,sha256=MCdO5h3v_LWTkrvKAeefPq7rl1H5mFed50nAL4uZq0E,13837
+ euroeval/task_group_utils/sequence_classification.py,sha256=Yqx0pUhuHYmSkv1ZUfOndSLTvpr0lWCk19oYITfSjV4,13555
  euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
  euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
- euroeval-15.7.2.dist-info/METADATA,sha256=nCF9GI8kOoKP3Up_KgPSxe4pnomawC1rQqRGlYoEsIA,13669
- euroeval-15.7.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.7.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.7.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
- euroeval-15.7.2.dist-info/RECORD,,
+ euroeval-15.8.1.dist-info/METADATA,sha256=Fdzj20PR6wWZUx_7f_bhPh8S4DF6ghZwMIDrJ4ozxFE,13669
+ euroeval-15.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.8.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.8.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.8.1.dist-info/RECORD,,