EuroEval 15.5.0-py3-none-any.whl → 15.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (53)
  1. euroeval/benchmark_modules/base.py +3 -2
  2. euroeval/benchmark_modules/fresh.py +8 -6
  3. euroeval/benchmark_modules/hf.py +33 -31
  4. euroeval/benchmark_modules/litellm.py +120 -56
  5. euroeval/benchmark_modules/vllm.py +41 -26
  6. euroeval/benchmarker.py +23 -21
  7. euroeval/callbacks.py +2 -2
  8. euroeval/constants.py +1 -1
  9. euroeval/data_models.py +261 -42
  10. euroeval/dataset_configs/__init__.py +61 -0
  11. euroeval/dataset_configs/danish.py +120 -0
  12. euroeval/dataset_configs/dutch.py +123 -0
  13. euroeval/dataset_configs/english.py +88 -0
  14. euroeval/dataset_configs/faroese.py +54 -0
  15. euroeval/dataset_configs/french.py +83 -0
  16. euroeval/dataset_configs/german.py +91 -0
  17. euroeval/dataset_configs/icelandic.py +148 -0
  18. euroeval/dataset_configs/italian.py +81 -0
  19. euroeval/dataset_configs/norwegian.py +178 -0
  20. euroeval/dataset_configs/spanish.py +78 -0
  21. euroeval/dataset_configs/swedish.py +100 -0
  22. euroeval/exceptions.py +10 -10
  23. euroeval/finetuning.py +6 -10
  24. euroeval/generation.py +1 -0
  25. euroeval/human_evaluation.py +2 -2
  26. euroeval/languages.py +20 -13
  27. euroeval/model_cache.py +1 -1
  28. euroeval/model_loading.py +1 -12
  29. euroeval/prompt_templates/__init__.py +8 -0
  30. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  31. euroeval/prompt_templates/multiple_choice.py +97 -0
  32. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  33. euroeval/prompt_templates/reading_comprehension.py +118 -0
  34. euroeval/prompt_templates/sentiment_classification.py +137 -0
  35. euroeval/prompt_templates/summarization.py +97 -0
  36. euroeval/speed_benchmark.py +1 -1
  37. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  38. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  39. euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
  40. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  41. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  42. euroeval/tasks.py +54 -0
  43. euroeval/tokenization_utils.py +343 -0
  44. euroeval/types.py +3 -1
  45. euroeval/utils.py +2 -347
  46. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA +31 -9
  47. euroeval-15.6.1.dist-info/RECORD +59 -0
  48. euroeval/dataset_configs.py +0 -2408
  49. euroeval-15.5.0.dist-info/RECORD +0 -40
  50. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  51. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/WHEEL +0 -0
  52. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/entry_points.txt +0 -0
  53. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/licenses/LICENSE +0 -0
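
The headline change in 15.6.1 is a structural refactor: the monolithic `euroeval/dataset_configs.py` (removed, −2408 lines) is split into a per-language `euroeval/dataset_configs/` package, prompt templates move into a new `euroeval/prompt_templates/` package, `task_utils` is renamed to `task_group_utils`, and roughly 350 lines of tokenizer helpers leave `euroeval/utils.py` for the new `euroeval/tokenization_utils.py`. A minimal sketch of what the new layout means for imports; the module paths are taken from the file list above, but the public symbols inside them are not visible in this diff, so only modules are imported:

```python
# Sketch only: module paths come from the 15.6.1 file list; importing modules
# rather than symbols, since the diff does not show the modules' contents.
from euroeval.dataset_configs import danish, swedish      # was euroeval/dataset_configs.py
from euroeval.prompt_templates import multiple_choice     # new in 15.6.1
from euroeval.task_group_utils import question_answering  # renamed from task_utils
from euroeval import tokenization_utils                   # helpers moved out of utils.py
```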
euroeval/utils.py CHANGED
@@ -7,7 +7,6 @@ import importlib.util
 import logging
 import os
 import random
-import re
 import sys
 import typing as t
 import warnings
@@ -22,7 +21,7 @@ from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
 from transformers import logging as tf_logging
 
-from .exceptions import InvalidModel, NaNValueInModelOutput
+from .exceptions import NaNValueInModelOutput
 
 if importlib.util.find_spec("ray") is not None:
     import ray
@@ -30,9 +29,6 @@ if importlib.util.find_spec("ray") is not None:
 if t.TYPE_CHECKING:
     from types import TracebackType
 
-    from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
-
-    from .data_models import DatasetConfig
     from .types import Predictions
 
 
@@ -120,6 +116,7 @@ def block_terminal_output() -> None:
     logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
     logging.getLogger("httpx").setLevel(logging.CRITICAL)
     logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
+    logging.getLogger("ray._private.services").setLevel(logging.CRITICAL)
     logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
     logging.getLogger("accelerate").setLevel(logging.CRITICAL)
     logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
@@ -197,19 +194,6 @@ def get_min_cuda_compute_capability() -> float | None:
     return float(f"{major}.{minor}")
 
 
-def kebab_to_pascal(kebab_string: str) -> str:
-    """Converts a kebab-case string to PascalCase.
-
-    Args:
-        kebab_string:
-            The kebab-case string.
-
-    Returns:
-        The PascalCase string.
-    """
-    return "".join(word.title() for word in kebab_string.split("-"))
-
-
 def internet_connection_available() -> bool:
     """Checks if internet connection is available by pinging google.com.
 
@@ -223,58 +207,6 @@ def internet_connection_available() -> bool:
     return False
 
 
-def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
-    """Get the special token metadata for a tokenizer.
-
-    Args:
-        tokenizer:
-            The tokenizer.
-
-    Returns:
-        The special token metadata.
-    """
-    # Create some test input IDs, to check if the tokenizer is adding special tokens
-    test_input_ids = tokenizer("Test").input_ids
-
-    # Extract the CLS token IDs from the tokenizer, if it's using them
-    has_cls_token = True
-    if tokenizer.cls_token_id in test_input_ids:
-        cls_token_id = tokenizer.cls_token_id
-        cls_token = tokenizer.cls_token
-    elif tokenizer.bos_token_id in test_input_ids:
-        cls_token_id = tokenizer.bos_token_id
-        cls_token = tokenizer.bos_token
-    elif tokenizer.cls_token is not None:
-        cls_token_id = tokenizer.cls_token_id
-        cls_token = tokenizer.cls_token
-        has_cls_token = False
-    else:
-        cls_token_id = tokenizer.bos_token_id
-        cls_token = tokenizer.bos_token
-        has_cls_token = False
-
-    # Extract the SEP token IDs from the tokenizer, if it's using them
-    has_sep_token = True
-    if tokenizer.sep_token_id in test_input_ids:
-        sep_token = tokenizer.sep_token
-    elif tokenizer.eos_token_id in test_input_ids:
-        sep_token = tokenizer.eos_token
-    elif tokenizer.sep_token is not None:
-        sep_token = tokenizer.sep_token
-        has_sep_token = False
-    else:
-        sep_token = tokenizer.eos_token
-        has_sep_token = False
-
-    return dict(
-        cls_token_id=cls_token_id,
-        cls_token=cls_token,
-        sep_token=sep_token,
-        has_cls_token=has_cls_token,
-        has_sep_token=has_sep_token,
-    )
-
-
 class HiddenPrints:
     """Context manager which removes all terminal output."""
 
@@ -320,190 +252,6 @@ def raise_if_model_output_contains_nan_values(model_output: "Predictions") -> None:
            raise NaNValueInModelOutput()
 
 
-def should_prompts_be_stripped(
-    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
-) -> bool:
-    """Determine if we should strip the prompts for few-shot evaluation.
-
-    This is the case if the tokenizer needs to include the space as part of the label
-    token. The strategy is thus to tokenize a label with a preceding colon (as in the
-    prompts), i.e., ": positive", and check if the tokenization starts with the tokens
-    of ": ". If this is the case, then we should not strip the prompts, since the
-    tokenizer produces the whitespace token separately.
-
-    Args:
-        labels_to_be_generated:
-            The labels that are to be generated.
-        tokenizer:
-            The tokenizer used to tokenize the labels.
-
-    Returns:
-        Whether we should strip the prompts.
-    """
-    strip_prompts = True
-    for label in labels_to_be_generated:
-        colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
-        label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
-
-        if isinstance(colon_tokens, torch.Tensor):
-            colon_tokens = list(colon_tokens.squeeze(0))
-        if isinstance(label_tokens, torch.Tensor):
-            label_tokens = list(label_tokens.squeeze(0))
-
-        label_tokens_start_with_colon_tokens = (
-            label_tokens[: len(colon_tokens)] == colon_tokens
-        )
-        if label_tokens_start_with_colon_tokens:
-            strip_prompts = False
-
-    return strip_prompts
-
-
-def should_prefix_space_be_added_to_labels(
-    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
-) -> bool:
-    """Determine if we should add a prefix space to the labels.
-
-    This is the case if the prompts are stripped and the tokenizer doesn't
-    automatically add prefix whitespaces to the labels.
-
-    Args:
-        labels_to_be_generated:
-            The labels that are to be generated.
-        tokenizer:
-            The tokenizer used to tokenize the labels.
-
-    Returns:
-        Whether we should add a prefix space to the labels.
-    """
-    if not should_prompts_be_stripped(
-        labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
-    ):
-        return False
-
-    whitespace_token = tokenizer.convert_ids_to_tokens(
-        ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
-    )[0]
-
-    add_prefix_space = True
-    for label in labels_to_be_generated:
-        label_tokens = tokenizer(label, add_special_tokens=False).input_ids
-        if isinstance(label_tokens, torch.Tensor):
-            label_tokens = list(label_tokens.squeeze(0))
-        first_label_token: int = int(label_tokens[0])
-        first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
-        has_prefix_space = first_character_of_label == whitespace_token
-        if has_prefix_space:
-            add_prefix_space = False
-            break
-
-    return add_prefix_space
-
-
-def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
-    """Get the beginning-of-sequence token from a tokenizer.
-
-    Args:
-        tokenizer:
-            The tokenizer.
-
-    Returns:
-        A pair (token, token_id) representing the beginning-of-sequence token and its
-        token ID.
-    """
-    if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
-        return tokenizer.bos_token, tokenizer.bos_token_id
-
-    vocab: dict[str, int] = tokenizer.get_vocab()
-
-    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
-    for candidate_bos_token in candidate_bos_tokens:
-        if candidate_bos_token in vocab:
-            bos_token = candidate_bos_token
-            bos_token_id = vocab[bos_token]
-            break
-    else:
-        raise InvalidModel(
-            "The model does not have a beginning-of-sequence token. Please ensure that "
-            "this has been set in the tokenizer's configuration."
-        )
-
-    return bos_token, bos_token_id
-
-
-def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
-    """Get the end-of-sequence token from a tokenizer.
-
-    Args:
-        tokenizer:
-            The tokenizer.
-
-    Returns:
-        A pair (token, token_id) representing the end-of-sequence token and its token
-        ID.
-    """
-    if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
-        return tokenizer.eos_token, tokenizer.eos_token_id
-
-    vocab: dict[str, int] = tokenizer.get_vocab()
-
-    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
-    for candidate_eos_token in candidate_eos_tokens:
-        if candidate_eos_token in vocab:
-            eos_token = candidate_eos_token
-            eos_token_id = vocab[eos_token]
-            break
-    else:
-        raise InvalidModel(
-            "The model does not have an end-of-sequence token. Please ensure that this "
-            "has been set in the tokenizer's configuration."
-        )
-
-    return eos_token, eos_token_id
-
-
-def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
-    """Get the end token ID for chat models.
-
-    This is only relevant for tokenizers with a chat template.
-
-    Args:
-        tokenizer:
-            The tokenizer.
-
-    Returns:
-        The token IDs used to end chats, or None if the tokenizer does not have a chat
-        template.
-
-    Raises:
-        ValueError:
-            If the end-of-chat token could not be located.
-    """
-    if tokenizer.chat_template is None:
-        return None
-
-    user_message: dict[t.Literal["role", "content"], str] = dict()
-    user_message["role"] = "user"
-    user_message["content"] = "X"
-    token_ids = tokenizer.apply_chat_template(conversation=[user_message])
-    assert isinstance(token_ids, list)
-
-    for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
-        token_id = tokenizer.convert_tokens_to_ids(token)
-        assert isinstance(token_id, int)
-        token = tokenizer.decode([token_id])
-        if "X" in token:
-            x_token_index = idx
-            break
-    else:
-        raise ValueError("Could not locate the end-of-chat token for the model.")
-
-    end_of_chat_tokens = token_ids[x_token_index + 1 :]
-    if len(end_of_chat_tokens) == 0:
-        return None
-    return end_of_chat_tokens
-
-
 def scramble(text: str) -> str:
     """Scramble a string in a bijective manner.
 
@@ -579,96 +327,3 @@ def get_package_version(package_name: str) -> str | None:
         return importlib.metadata.version(package_name)
     except importlib.metadata.PackageNotFoundError:
         return None
-
-
-def get_first_label_token_mapping(
-    dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
-) -> dict[str, str] | bool:
-    """Check if the model should output scores.
-
-    Args:
-        dataset_config:
-            The dataset configuration.
-        tokenizer:
-            The tokenizer, or None if not available.
-
-    Returns:
-        A mapping from labels to the first token in each label, or alternatively a
-        Boolean value indicating whether the model should output scores (if the mapping
-        is outputted then the model will always output scores).
-    """
-    # Importing here to avoid circular imports
-    from .constants import TASK_GROUPS_USING_LOGPROBS
-
-    # If we do not have any tokenizer, then we cannot check if the model should output
-    # scores and we just assume it should if the dataset supports it
-    output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
-    if tokenizer is None:
-        if output_scores:
-            log_once(
-                "The model will output scores, since the dataset supports it and no "
-                "tokenizer is available.",
-                level=logging.DEBUG,
-            )
-        else:
-            log_once(
-                "The model will not output scores, since the dataset does not support "
-                "it and no tokenizer is available.",
-                level=logging.DEBUG,
-            )
-        return output_scores
-
-    # If there are labels associated with the dataset, and the first token of each
-    # label is distinct, then we can safely use the logprobs
-    if output_scores and dataset_config.labels:
-        local_labels = [
-            dataset_config.prompt_label_mapping[label].strip()
-            for label in dataset_config.labels
-        ]
-
-        # Get the first token of each label, where we add a prefix space if needed
-        add_prefix_space = (
-            should_prefix_space_be_added_to_labels(
-                labels_to_be_generated=local_labels, tokenizer=tokenizer
-            )
-            and tokenizer.chat_template is None
-        )
-        first_tokens = [
-            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
-            for label in local_labels
-        ]
-        first_tokens = [
-            re.sub(
-                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
-            )
-            for token in first_tokens
-        ]
-
-        # Build a mapping from labels to the first token in each label if the first
-        # tokens are distinct
-        if len(first_tokens) == len(set(first_tokens)):
-            log_once(
-                "The model will output scores, since the first tokens of the labels "
-                "are distinct.",
-                level=logging.DEBUG,
-            )
-            return {
-                label: first_token
-                for label, first_token in zip(local_labels, first_tokens)
-            }
-        else:
-            log_once(
-                "The model will not output scores, since the first tokens of the "
-                "labels are not distinct. The first tokens for the labels "
-                f"{local_labels} are {first_tokens}"
-            )
-            return False
-
-    # Otherwise, we assume that the model should not output scores, to avoid potential
-    # evaluation errors. This will force the label extraction to rely on word edit
-    # distance instead of logprobs.
-    log_once(
-        "The model will not output scores, since the dataset does not have labels.",
-        level=logging.DEBUG,
-    )
-    return False
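
Every tokenizer helper removed above (`get_special_token_metadata`, `should_prompts_be_stripped`, `should_prefix_space_be_added_to_labels`, `get_bos_token`, `get_eos_token`, `get_end_of_chat_token_ids`, `get_first_label_token_mapping`) disappears from `utils.py` in the same release that adds the 343-line `euroeval/tokenization_utils.py`, so this reads as a relocation rather than a deletion, although the new file's contents are not shown in this diff. As a self-contained illustration of the core check in `should_prompts_be_stripped`, here is a runnable sketch; the function name and example model are ours, not the package's:

```python
# Runnable sketch of the prompt-stripping check deleted from utils.py above.
# Requires `transformers`; "gpt2" is only an example model.
from transformers import AutoTokenizer

def should_strip_prompts(labels: list[str], tokenizer) -> bool:
    """True unless the tokenizer emits ': ' as standalone tokens before a label."""
    for label in labels:
        colon_ids = tokenizer(": ", add_special_tokens=False).input_ids
        label_ids = tokenizer(": " + label, add_special_tokens=False).input_ids
        # If ': positive' starts with the exact tokens of ': ', the whitespace is
        # tokenized separately and the few-shot prompts must not be stripped.
        if label_ids[: len(colon_ids)] == colon_ids:
            return False
    return True

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# The result depends on how the tokenizer's BPE handles the space before labels.
print(should_strip_prompts(["positive", "negative"], tokenizer))
```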
{euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.5.0
+Version: 15.6.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -35,7 +35,7 @@ Requires-Dist: click>=8.1.3
 Requires-Dist: datasets>=2.15.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
-Requires-Dist: huggingface-hub>=0.24.0
+Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
 Requires-Dist: litellm>=1.63.0
 Requires-Dist: more-itertools>=10.5.0
@@ -56,18 +56,18 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.50.0
+Requires-Dist: transformers>=4.51.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -89,7 +89,7 @@ ______________________________________________________________________
 [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
 [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-67%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
 
@@ -206,7 +206,9 @@ sentiment-classification`.
 
 
 ### Reproducing the datasets
-All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+All datasets used in this project are generated using the scripts located in the
+[src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+with the following command
 
 ```shell
 $ uv run src/scripts/<name-of-script>.py
@@ -218,8 +220,28 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
 $ uv run src/scripts/create_allocine.py
 ```
 
-
-## Special Thanks :pray:
+## Contributors :pray:
+
+A huge thank you to all the contributors who have helped make this project a success!
+
+<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+<a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+
+### Special Thanks
+- Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
   models on the leaderboards.
 - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
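
Besides the version bump, the METADATA hunks above tighten three dependency floors: `huggingface-hub>=0.30.1` (was 0.24.0), `transformers>=4.51.0` (was 4.50.0), and `vllm>=0.8.3` (was 0.8.0, Linux extras only). A quick sanity check against the two cross-platform floors; the snippet is ours, the version numbers come from the diff:

```python
# Verifies an installed environment against the 15.6.1 floors shown above.
# Requires `packaging`, a common transitive dependency of these packages.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

FLOORS = {"huggingface-hub": "0.30.1", "transformers": "4.51.0"}
for pkg, floor in FLOORS.items():
    try:
        ok = Version(version(pkg)) >= Version(floor)
    except PackageNotFoundError:
        ok = False
    print(f"{pkg}: {'OK' if ok else 'needs >=' + floor}")
```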
euroeval-15.6.1.dist-info/RECORD ADDED
@@ -0,0 +1,59 @@
+euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
+euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
+euroeval/benchmarker.py,sha256=7LVFr7zL7OeJPs7WVYwekNnEmiIKPXHydcbAkW99MUk,48080
+euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
+euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
+euroeval/constants.py,sha256=t2mAT8tE3Dn2lXWHTnaFoaOIaUcdiBjJTASCt7nSdkg,1984
+euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
+euroeval/data_models.py,sha256=oZLrGg1dhIIwbgtEzq4U_fu_ZbBsz35mrqsyizuZNPw,23138
+euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
+euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
+euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
+euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
+euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
+euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
+euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
+euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
+euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
+euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
+euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
+euroeval/tasks.py,sha256=VVXFDcEM250KTGXd1pxQb8vwdia4ZJxgTUY5Kdsa-ik,7070
+euroeval/tokenization_utils.py,sha256=PNuS-FTdVrL9TWNDGlq42MvUggKwmyYM0BnC5I37IO0,11876
+euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
+euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
+euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
+euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
+euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
+euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
+euroeval/benchmark_modules/litellm.py,sha256=wohdi1WoeJ-JEdQLgg2q3JbZJA77XO7yGZaTRvbRU4o,47575
+euroeval/benchmark_modules/vllm.py,sha256=FTpwal5WdrVsOpkjm_RXwf6-2PrNrrP1LO6BVGYb6GE,48086
+euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
+euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
+euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
+euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
+euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
+euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
+euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
+euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
+euroeval/dataset_configs/italian.py,sha256=5yYMMBbxkfSDpLgJ9IH_pgkpzEp-74vMMvx-dT8x4WY,2345
+euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada4qwE7tw0,5181
+euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
+euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
+euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=sx_WqLm7N6Thll6COUCCA0lXe9RMZ7WhoH6X498pixM,6232
+euroeval/prompt_templates/multiple_choice.py,sha256=H0CDQPs_WzgSJ7oI_FBzHs0TOF0Na2qZYJLhDC7S8tk,4710
+euroeval/prompt_templates/named_entity_recognition.py,sha256=T65oFEtVT8JRF9c7bq2nPm233rftPdEAGic0DU-toko,11835
+euroeval/prompt_templates/reading_comprehension.py,sha256=WbQoal_tjoTt7qsmSZXEWwlI77vgiANcZoZC1l1AZjc,6090
+euroeval/prompt_templates/sentiment_classification.py,sha256=LcFD89e5nMOv4u-Unj8_jHpNjKMmgKPEfz0-e39VbsM,6639
+euroeval/prompt_templates/summarization.py,sha256=eX0uUTf_5Xorm6f_TlBBNwLC9zKvR7YJkP0RSaLWgIw,4585
+euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
+euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
+euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
+euroeval/task_group_utils/sequence_classification.py,sha256=gqd0-l5o7vAY5QIpGSkSqwJwez3Y0r5SqOiywfPNW8A,12239
+euroeval/task_group_utils/text_to_text.py,sha256=QECnGdZ0YLjsbMc6LwXqVi4KMuITdiOjmJUNQtAAOW0,5712
+euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
+euroeval-15.6.1.dist-info/METADATA,sha256=4i98IBxn6yWh4ugBW-SnljmDfKEXBSfRGjZyf_dlOUs,13183
+euroeval-15.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.6.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.6.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.6.1.dist-info/RECORD,,
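
Each RECORD row has the form `path,sha256=<digest>,size`, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash; the RECORD file lists itself with both fields empty. A sketch of verifying one row against an installed tree (the helper name is ours):

```python
# Sketch: verify a single RECORD row. The digest format (unpadded urlsafe
# base64 of the SHA-256) follows the wheel RECORD convention used above.
import base64
import hashlib
from pathlib import Path

def verify_record_row(row: str, root: Path = Path(".")) -> bool:
    path, hash_field, size = row.rsplit(",", 2)
    if not hash_field:  # the RECORD file itself carries empty hash and size
        return True
    algorithm, _, expected = hash_field.partition("=")
    data = (root / path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.new(algorithm, data).digest())
    return digest.rstrip(b"=").decode() == expected and len(data) == int(size)
```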