EuroEval 15.7.1__py3-none-any.whl → 15.8.0__py3-none-any.whl

This diff shows the changes between the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of EuroEval has been flagged as potentially problematic.

@@ -238,7 +238,7 @@ def prepare_languages(
             The default language codes of the languages to include.
 
     Returns:
-        The prepared model or dataset languages.
+        The prepared dataset languages.
     """
     # Create a dictionary that maps languages to their associated language objects
     language_mapping = get_all_languages()
@@ -32,6 +32,7 @@ from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
+from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 from transformers.trainer import Trainer
 
@@ -66,7 +67,12 @@ from ..task_group_utils import (
 )
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
-from ..utils import create_model_cache_dir, log_once
+from ..utils import (
+    catch_coroutine_exception,
+    create_model_cache_dir,
+    log_once,
+    safe_run,
+)
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
 
@@ -159,9 +165,21 @@ class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
 
     fresh_model = False
-    batching_preference = BatchingPreference.SINGLE_SAMPLE
+    batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False
 
+    _handleable_exceptions = (
+        BadRequestError,
+        RateLimitError,
+        APIError,
+        APIConnectionError,
+        Timeout,
+        ServiceUnavailableError,
+        InternalServerError,
+        SystemError,
+        AuthenticationError,
+    )
+
     def __init__(
         self,
        model_config: ModelConfig,
@@ -233,10 +251,7 @@ class LiteLLMModel(BenchmarkModule):
             The generated model outputs.
         """
         assert "messages" in inputs, "The input must contain a 'messages' key."
-        assert len(inputs["messages"]) == 1, (
-            "API models only support single-sample batching."
-        )
-        messages = inputs["messages"][0]
+        messages = inputs["messages"]
 
         generation_kwargs: dict[str, t.Any] = dict(
             model=self.model_config.model_id,
@@ -267,9 +282,20 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs["top_logprobs"] = MAX_LOGPROBS
 
         if self.dataset_config.task in TASKS_USING_JSON:
-            assert "json" in messages[0]["content"].lower(), (
-                "Prompt must contain 'json' for JSON tasks."
-            )
+            for msg_list in messages:
+                # msg_list is a list of {'role':…, 'content':…} dicts
+                if not msg_list:
+                    raise InvalidBenchmark(
+                        "Encountered an empty message list in 'messages'."
+                    )
+                last = msg_list[-1]
+                assert isinstance(last, dict), (
+                    f"Expected dict message, got {type(last)}"
+                )
+                assert "json" in last["content"].lower(), (
+                    "Prompt must contain 'json' for JSON tasks."
+                )
+
         if self.generative_type == GenerativeType.REASONING:
             log_once(
                 f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -321,6 +347,76 @@ class LiteLLMModel(BenchmarkModule):
         # This drops generation kwargs that are not supported by the model
         litellm.drop_params = True
 
+        # Extract the generated sequences from the model response. Some APIs cannot
+        # handle using newlines as stop sequences, so we try both.
+        num_attempts = 10
+
+        all_responses = {}
+        all_failures = []
+        to_run = list(enumerate(messages))
+
+        for attempt in range(num_attempts):
+            if not to_run:
+                break
+
+            batch_indices, batch_msgs = zip(*to_run)
+            model_response, failures = safe_run(
+                self._generate_async(
+                    messages=list(batch_msgs),
+                    generation_kwargs=generation_kwargs,
+                    max_retries=3,
+                    max_reruns=15,
+                )
+            )
+
+            for orig_idx, response in zip(batch_indices, model_response):
+                all_responses[orig_idx] = response
+
+            if not failures:
+                to_run = []
+                break
+
+            all_failures.extend(failures)
+            to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
+            logger.debug(
+                f"Attempt {attempt + 1}/{num_attempts}: "
+                f"retrying {len(to_run)} failed message(s)"
+            )
+
+            for _, error in failures:
+                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+        else:
+            raise InvalidBenchmark(
+                message=f"Failed to generate text, after {num_attempts} attempts."
+            )
+
+        if to_run:
+            raise InvalidBenchmark(
+                f"Failed to generate text after {num_attempts} attempts. "
+                f"Errors: {all_failures}"
+            )
+
+        ordered_responses = [all_responses[i] for i in range(len(messages))]
+        model_output = self._create_model_output(
+            model_responses=ordered_responses, model_id=self.model_config.model_id
+        )
+
+        return model_output
+
+    def _handle_exception(
+        self, error: Exception, generation_kwargs: dict[str, t.Any]
+    ) -> None:
+        """Handle an exception from the model.
+
+        Args:
+            error:
+                The exception to handle.
+            generation_kwargs:
+                The generation kwargs to pass to the model.
+        """
+        error_msg = str(error).lower()
+        model_id = self.model_config.model_id
+
         # Error messages that we want to catch and handle
         stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
         logprobs_messages = [
@@ -341,153 +437,238 @@ class LiteLLMModel(BenchmarkModule):
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
 
-        # Extract the generated sequences from the model response. Some APIs cannot
-        # handle using newlines as stop sequences, so we try both.
-        num_attempts = 10
-        for _ in range(num_attempts):
-            try:
-                model_response = litellm.completion_with_retries(
-                    messages=messages, **generation_kwargs
-                )
-                break
-            except (BadRequestError, RateLimitError) as e:
-                if any(msg.lower() in str(e).lower() for msg in stop_messages):
-                    log_once(
-                        f"The model {self.model_config.model_id!r} does not support "
-                        "stop sequences, so disabling them.",
-                        level=logging.DEBUG,
-                    )
-                    generation_kwargs["stop"] = None
-                elif (
-                    any(msg.lower() in str(e).lower() for msg in logprobs_messages)
-                    # Special case for Vertex AI models, since they have strict rate
-                    # limits on using logprobs. They also have a cap of 5 logprobs, but
-                    # we ignore this since the rate limiting makes it unusable anyway.
-                    or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
-                ):
-                    log_once(
-                        f"The model {self.model_config.model_id!r} does not support "
-                        "logprobs, so disabling it.",
-                        level=logging.DEBUG,
-                    )
-                    generation_kwargs.pop("logprobs")
-                    generation_kwargs.pop("top_logprobs")
-                elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
-                    log_once(
-                        f"The model {self.model_config.model_id!r} does not support "
-                        "temperature, so disabling it.",
-                        level=logging.DEBUG,
-                    )
-                    generation_kwargs.pop("temperature")
-                elif any(
-                    msg.lower() in str(e).lower()
-                    for msg in temperature_must_be_one_messages
-                ):
-                    log_once(
-                        f"The model {self.model_config.model_id!r} requires "
-                        "temperature to be set to 1, so setting it.",
-                        level=logging.DEBUG,
-                    )
-                    generation_kwargs["temperature"] = 1.0
-                elif any(msg.lower() in str(e).lower() for msg in max_items_messages):
-                    log_once(
-                        f"The model {self.model_config.model_id!r} does not support "
-                        "maxItems in the JSON schema, so disabling it.",
-                        level=logging.DEBUG,
-                    )
-                    ner_tag_names = list(
-                        self.dataset_config.prompt_label_mapping.values()
-                    )
-                    keys_and_their_types = {
-                        tag_name: (list[str], ...) for tag_name in ner_tag_names
-                    }
-                    pydantic_class = create_model(
-                        "AnswerFormat", **keys_and_their_types
-                    )
-                    generation_kwargs["response_format"] = pydantic_class
-                elif any(
-                    msg.lower() in str(e).lower() for msg in no_json_schema_messages
-                ):
-                    log_once(
-                        f"The model {self.model_config.model_id!r} does not support "
-                        "JSON schemas, so using the vanilla JSON format.",
-                        level=logging.DEBUG,
-                    )
-                    generation_kwargs["response_format"] = dict(type="json_object")
-                elif isinstance(e, RateLimitError):
-                    raise InvalidModel(
-                        "You have encountered your rate limit for model "
-                        f"{self.model_config.model_id!r}. Skipping."
-                    )
-                else:
-                    raise InvalidBenchmark(
-                        f"Failed to generate text. The error message was: {e}"
-                    )
-            except APIError as e:
-                raise InvalidBenchmark(
-                    f"Failed to generate text. The error message was: {e}"
-                )
-            except (
+        if any(msg.lower() in error_msg for msg in stop_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "stop sequences, so disabling them.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["stop"] = None
+            return
+        elif (
+            any(msg.lower() in error_msg for msg in logprobs_messages)
+            # Special case for Vertex AI models, since they have strict rate
+            # limits on using logprobs. They also have a cap of 5 logprobs, but
+            # we ignore this since the rate limiting makes it unusable anyway.
+            or (isinstance(error, VertexAIError) and "logprobs" in error_msg)
+        ):
+            log_once(
+                f"The model {model_id!r} does not support logprobs, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("logprobs")
+            generation_kwargs.pop("top_logprobs")
+            return
+        elif any(msg.lower() in error_msg for msg in temperature_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "temperature, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("temperature")
+            return
+        elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
+            log_once(
+                f"The model {model_id!r} requires "
+                "temperature to be set to 1, so setting it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["temperature"] = 1.0
+            return
+        elif any(msg.lower() in error_msg for msg in max_items_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "maxItems in the JSON schema, so disabling it.",
+                level=logging.DEBUG,
+            )
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types = {
+                tag_name: (list[str], ...) for tag_name in ner_tag_names
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
+            return
+        elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
+            log_once(
+                f"The model {self.model_config.model_id!r} does not support "
+                "JSON schemas, so using the vanilla JSON format.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["response_format"] = dict(type="json_object")
+            return
+        elif isinstance(
+            error,
+            (
                 APIConnectionError,
                 Timeout,
                 ServiceUnavailableError,
                 InternalServerError,
                 SystemError,
-            ) as e:
-                logger.debug(
-                    f"Service temporarily unavailable. The error message was: {e}. "
-                    f"Retrying in 5 seconds..."
+            ),
+        ):
+            logger.debug(
+                f"Service temporarily unavailable. The error message was: {error}. "
+                f"Retrying in 5 seconds..."
+            )
+            sleep(5)
+            return
+
+        if isinstance(error, RateLimitError):
+            raise InvalidModel(
+                f"You have encountered your rate limit for model {model_id!r}. "
+                "Skipping."
+            )
+
+        if isinstance(error, AuthenticationError):
+            raise NeedsAdditionalArgument(
+                cli_argument="--api-key",
+                script_argument="api_key=<your-api-key>",
+                run_with_cli=self.benchmark_config.run_with_cli,
+            )
+
+        raise InvalidBenchmark(
+            f"Failed to generate text. The error message was: {error}"
+        )
+
+    async def _generate_async(
+        self,
+        messages: list[dict[str, t.Any]],
+        generation_kwargs: dict[str, t.Any],
+        max_retries: int,
+        max_reruns: int,
+    ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
+        """Generate outputs from the model asynchronously.
+
+        Args:
+            messages:
+                The messages to pass to the model.
+            generation_kwargs:
+                The generation kwargs to pass to the model.
+            max_retries:
+                The maximum number of retries to make.
+            max_reruns:
+                The maximum number of reruns to make.
+
+        Returns:
+            A tuple containing the successful responses and the failed responses.
+        """
+        success = []
+        all_failures = {}
+        to_run = list(enumerate(messages))
+        prev_fail_count = len(to_run)
+        rerun_count = 0
+
+        while to_run and rerun_count < max_reruns and prev_fail_count > 0:
+            requests = [
+                litellm.acompletion(
+                    messages=msg, max_retries=max_retries, **generation_kwargs
                 )
-                sleep(5)
-            except AuthenticationError:
-                raise NeedsAdditionalArgument(
-                    cli_argument="--api-key",
-                    script_argument="api_key=<your-api-key>",
-                    run_with_cli=self.benchmark_config.run_with_cli,
+                for _, msg in to_run
+            ]
+            wrapped_requests = [
+                catch_coroutine_exception(request) for request in requests
+            ]
+            responses = await tqdm_async.gather(*wrapped_requests, leave=False)
+
+            next_to_run = []
+            current_fail_count = 0
+
+            for (orig_idx, _), response in zip(to_run, responses):
+                if isinstance(response, Exception):
+                    current_fail_count += 1
+                    all_failures[orig_idx] = response
+                    next_to_run.append((orig_idx, messages[orig_idx]))
+                else:
+                    success.append(response)
+
+            if current_fail_count >= prev_fail_count:
+                logger.warning(
+                    "Retry loop aborting due to no progress: "
+                    f"current_fail_count={current_fail_count}, "
+                    f"prev_fail_count={prev_fail_count}"
                 )
-        else:
-            raise InvalidBenchmark(
-                message=f"Failed to generate text, after {num_attempts} attempts."
-            )
+                break
+
+            prev_fail_count = current_fail_count
+            to_run = next_to_run
+            rerun_count += 1
+
+        failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
+        return success, failures
 
-        assert isinstance(model_response, ModelResponse)
-        if not model_response.choices:
-            # This happens for reasoning models, when they don't finish thinking and run
-            # out of tokens. Happens quite rarely, but we need to handle it.
+    @staticmethod
+    def _create_model_output(
+        model_responses: list[ModelResponse], model_id: str
+    ) -> GenerativeModelOutput:
+        """Create a GenerativeModelOutput object from a list of ModelResponse objects.
+
+        Args:
+            model_responses:
+                The list of ModelResponse objects to create the GenerativeModelOutput
+                object from.
+            model_id:
+                The ID of the model.
+
+        Returns:
+            A GenerativeModelOutput object.
+        """
+        sequences = []
+        scores = []
+        for model_response in model_responses:
+            if not model_response.choices:
+                # This happens for reasoning models, when they don't finish thinking
+                # and run out of tokens. Happens quite rarely, but we need to handle it.
+                logger.warning(
+                    f"The model {model_id!r} did not end up "
+                    "generating any text. This is likely because the model ran "
+                    "out of tokens while reasoning. Returning an empty string."
+                )
+                continue
+
+            model_response_choices = model_response.choices[0]
+            assert isinstance(model_response_choices, litellm.Choices)
+            generated_message: litellm.Message = model_response_choices.message
+            generation_output = generated_message.content or ""
+            generation_output = generation_output.strip()
+
+            # Structure the model output as a GenerativeModelOutput object
+            sequences.append(generation_output)
+            if hasattr(model_response_choices, "logprobs"):
+                logprobs_obj = model_response_choices.logprobs
+                if isinstance(logprobs_obj, ChoiceLogprobs):
+                    logprobs_list: list[list[tuple[str, float]]] = [
+                        [
+                            (top_logprob.token, top_logprob.logprob)
+                            for top_logprob in content.top_logprobs
+                        ]
+                        for content in model_response_choices.logprobs.content or list()
+                    ]
+                    scores.append(logprobs_list)
+                else:
+                    log_once(
+                        "The logprobs object is malformed, so we won't use logprobs to "
+                        "determine the labels.",
+                        level=logging.WARNING,
+                    )
+
+        if not sequences:
             logger.warning(
-                f"The model {self.model_config.model_id!r} did not end up generating "
-                "any text. This is likely because the model ran out of tokens while "
-                "reasoning. Returning an empty string."
+                "No sequences were generated by the model "
+                f"{model_id!r}. This may be due to the "
+                "model running out of tokens or an issue with the input data. "
+                "Returning an empty GenerativeModelOutput."
             )
-            return GenerativeModelOutput(sequences=[""])
-
-        model_response_choices = model_response.choices[0]
-        assert isinstance(model_response_choices, litellm.Choices)
-        generated_message: litellm.Message = model_response_choices.message
-        generation_output = generated_message.content or ""
-        generation_output = generation_output.strip()
-
-        # Structure the model output as a GenerativeModelOutput object
-        model_output = GenerativeModelOutput(sequences=[generation_output])
-        if hasattr(model_response_choices, "logprobs"):
-            logprobs_obj = model_response_choices.logprobs
-            if isinstance(logprobs_obj, ChoiceLogprobs):
-                logprobs_list: list[list[tuple[str, float]]] = [
-                    [
-                        (top_logprob.token, top_logprob.logprob)
-                        for top_logprob in content.top_logprobs
-                    ]
-                    for content in model_response_choices.logprobs.content or list()
-                ]
-                model_output.scores = [logprobs_list]
-            else:
-                log_once(
-                    "The logprobs object is malformed, so we won't use logprobs to "
-                    "determine the labels.",
-                    level=logging.WARNING,
-                )
+            return GenerativeModelOutput(sequences=[], scores=None)
 
-        return model_output
+        if scores and len(sequences) != len(scores):
+            raise InvalidBenchmark(
+                "Sequences and scores must have the same length. "
+                f"Got {len(sequences)} sequences and {len(scores)} scores."
+            )
+
+        return GenerativeModelOutput(
+            sequences=sequences, scores=scores if scores else None
+        )
 
     @cached_property
     def num_params(self) -> int:
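Editor's note: the new generation path above fires the whole batch concurrently via `_generate_async` and `safe_run`, keeps per-item failures, and resubmits only the failed items on the next attempt. The following is a minimal, self-contained sketch of that retry-over-failures pattern using plain `asyncio`; the names `flaky_call`, `run_batch` and `generate_all` are illustrative and not part of EuroEval.

import asyncio

ATTEMPTS: dict[int, int] = {}

async def flaky_call(i: int) -> str:
    """Stand-in for an API call that fails the first time it sees an odd item."""
    if i % 2 == 1 and ATTEMPTS.get(i, 0) == 0:
        ATTEMPTS[i] = 1
        raise RuntimeError(f"transient error for item {i}")
    return f"response-{i}"

async def run_batch(items: list[int]) -> list[str | BaseException]:
    # return_exceptions=True plays the role of catch_coroutine_exception here
    return await asyncio.gather(*(flaky_call(i) for i in items), return_exceptions=True)

def generate_all(items: list[int], num_attempts: int = 3) -> dict[int, str]:
    """Fire all items concurrently and retry only the ones that failed."""
    responses: dict[int, str] = {}
    to_run = list(items)
    for _ in range(num_attempts):
        if not to_run:
            break
        results = asyncio.run(run_batch(to_run))
        failed: list[int] = []
        for item, result in zip(to_run, results):
            if isinstance(result, BaseException):
                failed.append(item)       # resubmit on the next attempt
            else:
                responses[item] = result  # keep successes keyed by original item
        to_run = failed
    return responses

print(generate_all([0, 1, 2, 3]))  # all four items succeed within two attempts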
@@ -1007,6 +1188,10 @@ def try_download_ollama_model(model_id: str) -> bool:
 
     Returns:
         Whether the model was downloaded successfully.
+
+    Raises:
+        InvalidModel:
+            If Ollama is not running or the model cannot be downloaded.
     """
     if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
         return False
@@ -1021,11 +1206,17 @@ def try_download_ollama_model(model_id: str) -> bool:
             level=logging.WARNING,
         )
 
-    downloaded_ollama_models: list[str] = [
-        model_obj.model
-        for model_obj in ollama.list().models
-        if model_obj.model is not None
-    ]
+    try:
+        downloaded_ollama_models: list[str] = [
+            model_obj.model
+            for model_obj in ollama.list().models
+            if model_obj.model is not None
+        ]
+    except ConnectionError:
+        raise InvalidModel(
+            "Ollama does not seem to be running, so we cannot evaluate the model "
+            f"{model_id!r}. Please make sure that Ollama is running and try again."
+        )
 
     ollama_model_id = "/".join(model_id.split("/")[1:])
     if ollama_model_id not in downloaded_ollama_models:
@@ -797,7 +797,7 @@ def load_model_and_tokenizer(
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
         )
-    except (ValueError, OSError) as e:
+    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
            raise InvalidModel(
                f"The model {model_id!r} is awaiting a review from the repository "
euroeval/benchmarker.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
 from shutil import rmtree
 from time import sleep
 
+from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
@@ -27,7 +28,7 @@ from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
 from .tasks import SPEED
-from .utils import enforce_reproducibility
+from .utils import enforce_reproducibility, get_package_version
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -164,6 +165,15 @@ class Benchmarker:
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
+        # Bail early if hf_transfer is enabled but not installed.
+        if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
+            raise ImportError(
+                "Fast download using 'hf_transfer' is enabled "
+                "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
+                "package is not available in your environment. "
+                "Try installing it with `pip install hf_transfer`."
+            )
+
         self.benchmark_config_default_params = BenchmarkConfigParams(
             progress_bar=progress_bar,
             save_results=save_results,
@@ -372,15 +382,7 @@ class Benchmarker:
 
         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            try:
-                model_config = get_model_config(
-                    model_id=model_id, benchmark_config=benchmark_config
-                )
-            except InvalidModel as e:
-                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
-                continue
-
+            model_config: ModelConfig | None = None
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +396,22 @@ class Benchmarker:
                 ):
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it "
-                        "has already been benchmarked."
+                        f"{dataset_config.pretty_name}, as it has already been "
+                        "benchmarked."
                     )
                     num_finished_benchmarks += 1
                     continue
 
+                if model_config is None:
+                    try:
+                        model_config = get_model_config(
+                            model_id=model_id, benchmark_config=benchmark_config
+                        )
+                    except InvalidModel as e:
+                        logger.info(e.message)
+                        num_finished_benchmarks += len(dataset_configs)
+                        continue
+
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
                     dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
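Editor's note: the new early check in `Benchmarker.__init__` follows a common pattern: read the feature toggle, then verify the optional dependency is actually importable before any download starts. A minimal standalone version of that check, using only the standard library and reading the toggle directly from the environment instead of from `huggingface_hub.constants` (a simplification; huggingface_hub accepts other truthy values as well), might look like this:

import importlib.metadata
import os

def ensure_hf_transfer_available() -> None:
    """Fail fast when fast downloads are requested but hf_transfer is missing."""
    if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "0") != "1":
        return  # fast downloads not requested, nothing to verify
    try:
        importlib.metadata.version("hf_transfer")
    except importlib.metadata.PackageNotFoundError:
        raise ImportError(
            "HF_HUB_ENABLE_HF_TRANSFER=1 is set but the 'hf_transfer' package is "
            "not installed. Install it with `pip install hf_transfer`."
        ) from None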
@@ -7,6 +7,7 @@ from .danish import * # noqa: F403
 from .dutch import * # noqa: F403
 from .english import * # noqa: F403
 from .faroese import * # noqa: F403
+from .finnish import * # noqa: F403
 from .french import * # noqa: F403
 from .german import * # noqa: F403
 from .icelandic import * # noqa: F403
@@ -79,7 +79,7 @@ ARC_CONFIG = DatasetConfig(
 )
 
 BELEBELE_CONFIG = DatasetConfig(
-    name="belebele",
+    name="belebele-en",
     pretty_name="the English multiple choice reading comprehension dataset BeleBele",
     huggingface_id="EuroEval/belebele-mini",
     task=MCRC,
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import FI
-from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -58,3 +58,13 @@ SCALA_FI_CONFIG = DatasetConfig(
 )
 
 ### Unofficial datasets ###
+
+BELEBELE_FI_CONFIG = DatasetConfig(
+    name="belebele-fi",
+    pretty_name="the Finnish multiple choice reading comprehension dataset "
+    "BeleBele-fi, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-fi-mini",
+    task=MCRC,
+    languages=[FI],
+    unofficial=True,
+)
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import IT
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -79,3 +79,13 @@ WIKINEURAL_IT_CONFIG = DatasetConfig(
     languages=[IT],
     unofficial=True,
 )
+
+BELEBELE_IT_CONFIG = DatasetConfig(
+    name="belebele-it",
+    pretty_name="the Italian multiple choice reading comprehension dataset "
+    "BeleBele-it, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-it-mini",
+    task=MCRC,
+    languages=[IT],
+    unofficial=True,
+)
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import ES
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -76,3 +76,13 @@ XQUAD_ES_CONFIG = DatasetConfig(
     languages=[ES],
     unofficial=True,
 )
+
+BELEBELE_ES_CONFIG = DatasetConfig(
+    name="belebele-es",
+    pretty_name="the Spanish multiple choice reading comprehension dataset "
+    "BeleBele-es, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-es-mini",
+    task=MCRC,
+    languages=[ES],
+    unofficial=True,
+)
euroeval/finetuning.py CHANGED
@@ -103,7 +103,6 @@ def finetune(
         itr_scores = finetune_single_iteration(
             model=model if model_already_initialized else None,
             dataset=datasets[idx],
-            iteration_idx=idx,
             training_args=training_args,
             model_config=model_config,
             dataset_config=dataset_config,
@@ -158,7 +157,6 @@ def finetune(
 def finetune_single_iteration(
     model: BenchmarkModule | None,
     dataset: DatasetDict,
-    iteration_idx: int,
     training_args: TrainingArguments,
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
@@ -171,8 +169,6 @@ def finetune_single_iteration(
             The model to use in the benchmark. If None then a new model will be loaded.
         dataset:
             The dataset to use for training and evaluation.
-        iteration_idx:
-            The index of the iteration.
         training_args:
             The training arguments.
         model_config:
@@ -213,41 +209,42 @@ def finetune_single_iteration(
 
     trainer.log = no_logging
 
-    # Re-block terminal output, as it gets unblocked by the `transformers`
-    # package before training
+    # Re-block terminal output, as it gets unblocked by the `transformers` package
+    # before training
    block_terminal_output()
 
-    # Sort out callbacks. We remove the callbacks that are producing unnecessary
-    # output, to avoid cluttering the terminal output
+    # Sort out callbacks. We remove the callbacks that are producing unnecessary output,
+    # to avoid cluttering the terminal output
    if not benchmark_config.verbose:
        trainer.remove_callback(PrinterCallback)
        trainer.remove_callback(ProgressCallback)
    if benchmark_config.progress_bar:
        trainer.add_callback(NeverLeaveProgressCallback)
 
-    try:
-        trainer.train()
-        with torch.inference_mode():
-            try:
-                test_scores = trainer.evaluate(
-                    eval_dataset=dataset["test"],
-                    orig_eval_dataset=dataset["original_test"],
-                    metric_key_prefix="test",
-                )
-            except TypeError:
-                test_scores = trainer.evaluate(
-                    eval_dataset=dataset["test"], metric_key_prefix="test"
-                )
-        return test_scores
-
-    except NaNValueInModelOutput as e:
-        del trainer
-        del model
-        clear_memory()
-        raise e
-
-    except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(str(e))
+    # Train the model
+    trainer.train()
+
+    # Evaluate the model
+    with torch.inference_mode():
+        try:
+            test_scores = trainer.evaluate(
+                eval_dataset=dataset["test"],
+                orig_eval_dataset=dataset["original_test"],
+                metric_key_prefix="test",
+            )
+        except TypeError:
+            test_scores = trainer.evaluate(
+                eval_dataset=dataset["test"], metric_key_prefix="test"
+            )
+        except NaNValueInModelOutput as e:
+            del trainer
+            del model
+            clear_memory()
+            raise e
+        except (RuntimeError, ValueError, IndexError) as e:
+            raise InvalidBenchmark(str(e))
+
+    return test_scores
 
 
 def get_training_args(
@@ -300,6 +297,7 @@ def get_training_args(
         save_total_limit=1,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=batch_size,
+        eval_accumulation_steps=32,
         optim=OptimizerNames.ADAMW_TORCH,
         learning_rate=2e-5,
         warmup_ratio=0.01,
euroeval/languages.py CHANGED
@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
 DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
 NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
 EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
 FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
 FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
 DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
 ET = Language(code="et", name="Estonian")
 EE = Language(code="ee", name="Ewe")
 FJ = Language(code="fj", name="Fijian")
-FI = Language(code="fi", name="Finnish")
 FY = Language(code="fy", name="Western Frisian")
 FF = Language(code="ff", name="Fulah")
 GD = Language(code="gd", name="Gaelic")
@@ -132,6 +132,11 @@ def extract_labels_from_generation(
         The predicted labels.
     """
     if model_output.scores is not None:
+        if first_label_token_mapping is False:
+            raise InvalidBenchmark(
+                "The model outputted logprobs, but the first label token mapping is "
+                "not provided. This means that the model should not output logprobs."
+            )
         labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
             dataset_config=dataset_config,
@@ -147,7 +152,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
-    first_label_token_mapping: dict[str, str] | bool,
+    first_label_token_mapping: dict[str, str] | t.Literal[True],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.
 
@@ -164,8 +169,7 @@ def get_closest_logprobs_labels(
             The configuration of the dataset.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
-            Boolean value indicating whether the model should output scores (if the
-            mapping is outputted then the model will always output scores).
+            `True` value indicating that the model should output logprobs.
 
     Returns:
         The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@ def get_closest_logprobs_labels(
         # label, as the output label
         output_label: str | None = None
         for generated_label in generated_labels:
-            # Get the candidate labels that starts with the generated label
+            # Get the candidate labels. If we have a first label token mapping, we
+            # use it to get the candidate labels. Otherwise, we check if any of the
+            # labels start with the generated label.
             if isinstance(first_label_token_mapping, dict):
                 if any(
                     candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@ def get_closest_logprobs_labels(
                     )
                     return None
 
-            # If no candidate label is found, we ignore the generated label, as it
-            # basically means that the model is just really bad at generating
-            # labels.
+            # If no candidate label is found, we first check if any of the labels
+            # start with the generated label. This could be the case if the labels
+            # in the first token mapping is inaccurate or incomplete, for instance
+            # if 'pos' is in the first label token mapping, but the model outputted
+            # 'posit'. If this is the case then we cannot trust the first label
+            # token mapping, and we fall back to using word edit distance.
+            # Otherwise, the generated label is just bad, and we skip to the next
+            # generated label.
            elif len(candidate_output_labels) == 0:
-                logger.debug(
-                    f"No candidate label found for the generated label "
-                    f"{generated_label!r}. The generated label is thus ignored."
-                )
+                candidate_output_labels_starting_with_generated_label = [
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                ]
+                if candidate_output_labels_starting_with_generated_label:
+                    log_once(
+                        f"No candidate label found for the generated label "
+                        f"{generated_label!r}. This means that using logprobs to "
+                        "extract the labels is not reliable, and we will instead "
+                        "fall back to extracting the labels using word edit "
+                        "distance.",
+                        level=logging.DEBUG,
+                    )
+                    return None
+
+        # If we did not find any candidate label for any of the generated labels, we
+        # assume that something is wrong with the model output, and we fall back to
+        # using word edit distance to extract the labels
+        else:
+            log_once(
+                f"No candidate label found for any of the generated labels "
+                f"{generated_labels}. This means that using logprobs to extract "
+                "the labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
+                level=logging.DEBUG,
+            )
+            return None
 
         if output_label is not None:
             output_labels.append(output_label)
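Editor's note: the new fallback in `get_closest_logprobs_labels` is easiest to see on a concrete case. If the first-label-token mapping only knows the token 'pos' but the model's top logprob token is 'posit', no mapping entry matches, yet a real label ('positive') does start with the generated token, so logprob extraction is abandoned in favour of word-edit-distance matching. A condensed illustration of that decision, with hypothetical labels and a hypothetical helper name:

def should_fall_back(
    generated_label: str,
    candidate_labels: list[str],
    first_token_mapping: dict[str, str],
) -> bool:
    """Return True when the first-token mapping cannot resolve the generated token."""
    # A mapping entry matches when the generated token equals a label's first token
    if any(tok == generated_label for tok in first_token_mapping.values()):
        return False  # the mapping resolves the token, keep using logprobs
    # The mapping failed, but a real label still starts with the generated token,
    # so the mapping is incomplete and word edit distance is used instead
    return any(label.startswith(generated_label) for label in candidate_labels)

labels = ["positive", "negative"]
mapping = {"positive": "pos", "negative": "neg"}
assert should_fall_back("pos", labels, mapping) is False
assert should_fall_back("posit", labels, mapping) is True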
@@ -169,7 +169,7 @@ def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
 
     vocab: dict[str, int] = tokenizer.get_vocab()
 
-    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
+    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
     for candidate_bos_token in candidate_bos_tokens:
         if candidate_bos_token in vocab:
             bos_token = candidate_bos_token
@@ -200,7 +200,7 @@ def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
 
     vocab: dict[str, int] = tokenizer.get_vocab()
 
-    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
+    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
     for candidate_eos_token in candidate_eos_tokens:
         if candidate_eos_token in vocab:
             eos_token = candidate_eos_token
@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
         for label in dataset_config.labels
    ]
 
-    # Get the first token of each label, where we add a prefix space if needed
-    add_prefix_space = (
-        should_prefix_space_be_added_to_labels(
+    # Tokenize some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: list[list[str]]
+    if tokenizer.chat_template is None:
+        add_prefix_space = should_prefix_space_be_added_to_labels(
            labels_to_be_generated=local_labels, tokenizer=tokenizer
        )
-        and tokenizer.chat_template is None
-    )
-    first_tokens = [
-        tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
-        for label in local_labels
-    ]
-    first_tokens = [
-        re.sub(
-            pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
-        )
-        for token in first_tokens
+        all_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+            for label in local_labels
+        ]
+    else:
+        all_tokens = [
+            tokenizer.convert_ids_to_tokens(
+                ids=tokenizer.apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=True,
+                )
+            )
+            for label in local_labels
+        ]
+
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
    ]
 
+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok)
+        ]
+        if not matching_tokens:
+            log_once(
+                f"No matching token found in token_list for label '{label}', so "
+                "we will not output scores.",
+                level=logging.DEBUG,
+            )
+            return False
+        first_tokens.append(matching_tokens[0])
+
     # Build a mapping from labels to the first token in each label if the first
     # tokens are distinct
     if len(first_tokens) == len(set(first_tokens)):
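Editor's note: the reworked `get_first_label_token_mapping` keeps the whole token list per label and picks the first token that the label itself starts with, giving up on logprob scoring when no such token exists, and only building the mapping when the first tokens are distinct. A toy illustration of that extraction logic with hand-written token lists (everything below is made up for the example; real usage goes through a Hugging Face tokenizer, and the non-distinct case is handled here by returning False purely for the sketch):

import re

def first_label_tokens(
    labels: list[str], token_lists: list[list[str]]
) -> dict[str, str] | bool:
    """Map each label to its first matching token, or False if that is not possible."""
    cleaned = [
        [re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", tok.lower()) for tok in toks]
        for toks in token_lists
    ]
    first_tokens: list[str] = []
    for toks, label in zip(cleaned, labels):
        matching = [tok for tok in toks if tok and label.startswith(tok)]
        if not matching:
            return False  # no token anchors this label, so don't rely on logprobs
        first_tokens.append(matching[0])
    if len(first_tokens) != len(set(first_tokens)):
        return False  # two labels share a first token; the mapping would be ambiguous
    return dict(zip(labels, first_tokens))

# e.g. a chat-template tokenisation wrapping each label in special tokens
print(first_label_tokens(
    labels=["positive", "negative"],
    token_lists=[
        ["<assistant>", "pos", "itive", "</s>"],
        ["<assistant>", "neg", "ative", "</s>"],
    ],
))  # {'positive': 'pos', 'negative': 'neg'}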
euroeval/utils.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions to be used in other scripts."""
 
+import asyncio
 import gc
 import importlib
 import importlib.metadata
@@ -327,3 +328,43 @@ def get_package_version(package_name: str) -> str | None:
         return importlib.metadata.version(package_name)
     except importlib.metadata.PackageNotFoundError:
         return None
+
+
+T = t.TypeVar("T", bound=object)
+
+
+def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
+    """Run a coroutine, ensuring that the event loop is always closed when we're done.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine.
+    """
+    loop = asyncio.new_event_loop()
+    try:
+        asyncio.set_event_loop(loop)
+        return loop.run_until_complete(coroutine)
+    finally:
+        loop.close()
+        asyncio.set_event_loop(None)
+
+
+async def catch_coroutine_exception(
+    coroutine: t.Coroutine[t.Any, t.Any, T],
+) -> T | Exception:
+    """Run a coroutine, catching any exceptions and returning them.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine, or the exception if it was raised.
+    """
+    try:
+        return await coroutine
+    except Exception as exc:
+        return exc
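Editor's note: these two helpers are used together in the LiteLLM module above: `catch_coroutine_exception` turns a failure into a return value so one bad request does not cancel the whole gather, and `safe_run` drives the resulting coroutine from synchronous code on a throwaway event loop. A small usage sketch, assuming EuroEval 15.8.0 is installed; the `fetch` and `fetch_all` coroutines are hypothetical:

import asyncio

from euroeval.utils import catch_coroutine_exception, safe_run

async def fetch(i: int) -> str:
    """Hypothetical request coroutine; item 2 always fails."""
    await asyncio.sleep(0)
    if i == 2:
        raise ValueError("boom")
    return f"ok-{i}"

async def fetch_all(items: list[int]) -> list:
    # Each failure is returned as an exception object instead of cancelling the batch
    wrapped = [catch_coroutine_exception(fetch(i)) for i in items]
    return await asyncio.gather(*wrapped)

results = safe_run(fetch_all([1, 2, 3]))
print(results)  # ['ok-1', ValueError('boom'), 'ok-3']: the error is returned, not raised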
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.7.1
+Version: 15.8.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -1,6 +1,6 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
-euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
-euroeval/benchmarker.py,sha256=OnjGVblWW20wSmA7Tr2c-qE3g8FIjxW6wTJySAcGxVk,48492
+euroeval/benchmark_config_factory.py,sha256=RDYotoLcfNr3xU8Cw-G-Y8wLe6RSlJD1Ok9C97lWfOs,12553
+euroeval/benchmarker.py,sha256=EHoYilZ2Xx0-6_aEBlG84MsZbomJSiHNHc4wKOVVBB8,49199
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
@@ -8,38 +8,38 @@ euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
 euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
-euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
+euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
-euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
+euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
-euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487
+euroeval/tokenization_utils.py,sha256=kghOIZMM3H0P9YDv0VBSNI7drzgJXlkRtMwt3Cgeev8,13907
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
-euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
+euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=v_rbCm2FiTMqcUui_09k3E1-s5uOmbfAvSy2c7Mm0_E,42636
-euroeval/benchmark_modules/vllm.py,sha256=Q-3vtZz5XxQQImJxOiF0XDrQ4T_p0bkgdPw1Jobgu3s,39380
-euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
+euroeval/benchmark_modules/litellm.py,sha256=dd7OqBvWA75zNrsEHtC3cx3rNpNJ-1QOL2arV_CqYG0,48231
+euroeval/benchmark_modules/vllm.py,sha256=DJyla0jr-DVMPPs4RBguxq1Xn5YguvyuAnIlgIOfFaw,39394
+euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
-euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
+euroeval/dataset_configs/english.py,sha256=-N85DiNVrZFqpahNUTfxaWy4vvdOWC8Bi0G4uAO4uDw,2326
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
-euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730
+euroeval/dataset_configs/finnish.py,sha256=_8YWIlZNpO8Qi233bH7cKwm3tq3WETLfC_6mzg7LLog,2045
 euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
 euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
-euroeval/dataset_configs/italian.py,sha256=5yYMMBbxkfSDpLgJ9IH_pgkpzEp-74vMMvx-dT8x4WY,2345
+euroeval/dataset_configs/italian.py,sha256=KNjCvTzsEqH_EEk3At8slKqNwWWiIdbv_t5ke7n9nZI,2660
 euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada4qwE7tw0,5181
-euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
+euroeval/dataset_configs/spanish.py,sha256=NviL-FzJ5jq1bLTRvbtZBiGrAmZjxyijZNpKZFrnT-M,2527
 euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
@@ -51,11 +51,11 @@ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5w
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
 euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
-euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
+euroeval/task_group_utils/sequence_classification.py,sha256=MCdO5h3v_LWTkrvKAeefPq7rl1H5mFed50nAL4uZq0E,13837
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.7.1.dist-info/METADATA,sha256=Fj6QejwQCK0zGuP_DHSQ7sul195ivUqOUCT5AVxgLSI,13669
-euroeval-15.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.7.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.7.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.7.1.dist-info/RECORD,,
+euroeval-15.8.0.dist-info/METADATA,sha256=-GcGBuEnlAPmpT9ItDAmS0psT__jwbVoNkTYOiSeRzA,13669
+euroeval-15.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.8.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.8.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.8.0.dist-info/RECORD,,