EuroEval 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (51)
  1. euroeval/__init__.py +5 -0
  2. euroeval/benchmark_config_factory.py +6 -1
  3. euroeval/benchmark_modules/base.py +2 -0
  4. euroeval/benchmark_modules/fresh.py +7 -1
  5. euroeval/benchmark_modules/hf.py +26 -21
  6. euroeval/benchmark_modules/litellm.py +258 -131
  7. euroeval/benchmark_modules/vllm.py +120 -68
  8. euroeval/benchmarker.py +11 -2
  9. euroeval/cli.py +14 -1
  10. euroeval/constants.py +7 -1
  11. euroeval/data_models.py +95 -20
  12. euroeval/dataset_configs/__init__.py +1 -0
  13. euroeval/dataset_configs/danish.py +14 -3
  14. euroeval/dataset_configs/dutch.py +14 -0
  15. euroeval/dataset_configs/english.py +22 -0
  16. euroeval/dataset_configs/estonian.py +15 -7
  17. euroeval/dataset_configs/finnish.py +14 -0
  18. euroeval/dataset_configs/french.py +14 -0
  19. euroeval/dataset_configs/german.py +23 -0
  20. euroeval/dataset_configs/italian.py +14 -0
  21. euroeval/dataset_configs/latvian.py +14 -0
  22. euroeval/dataset_configs/norwegian.py +14 -0
  23. euroeval/dataset_configs/polish.py +126 -0
  24. euroeval/dataset_configs/portuguese.py +14 -0
  25. euroeval/dataset_configs/spanish.py +14 -0
  26. euroeval/dataset_configs/swedish.py +25 -0
  27. euroeval/enums.py +12 -0
  28. euroeval/generation.py +17 -8
  29. euroeval/generation_utils.py +102 -16
  30. euroeval/metrics/pipeline.py +51 -9
  31. euroeval/model_cache.py +13 -1
  32. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  33. euroeval/prompt_templates/multiple_choice.py +27 -1
  34. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  35. euroeval/prompt_templates/reading_comprehension.py +11 -0
  36. euroeval/prompt_templates/sentiment_classification.py +15 -0
  37. euroeval/prompt_templates/summarization.py +27 -1
  38. euroeval/scores.py +5 -0
  39. euroeval/task_group_utils/multiple_choice_classification.py +2 -2
  40. euroeval/task_group_utils/question_answering.py +29 -29
  41. euroeval/task_group_utils/sequence_classification.py +71 -81
  42. euroeval/task_group_utils/token_classification.py +17 -3
  43. euroeval/tasks.py +12 -10
  44. euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
  45. euroeval/utils.py +67 -3
  46. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
  47. euroeval-16.1.0.dist-info/RECORD +70 -0
  48. euroeval-16.0.0.dist-info/RECORD +0 -69
  49. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
  50. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
  51. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -215,6 +215,20 @@ def extract_labels_from_generation(
215
215
 
216
216
  prompt_label_mapping = dataset_config.prompt_label_mapping
217
217
  for prompt_tag_name, named_entities in prediction_dict.items():
218
+ if not isinstance(named_entities, list):
219
+ logger.debug(
220
+ "The model produced an invalid format for the named entities. "
221
+ f"Expected a list but got {type(named_entities)}. Skipping."
222
+ )
223
+ continue
224
+ try:
225
+ named_entities = [str(ne) for ne in named_entities]
226
+ except Exception:
227
+ logger.debug(
228
+ "The model produced an invalid format for the named entities. "
229
+ f"Expected a list of strings but got {named_entities}. Skipping."
230
+ )
231
+ continue
218
232
  try:
219
233
  tag_name = [
220
234
  tag[2:]
@@ -259,7 +273,7 @@ def tokenize_and_align_labels(
259
273
  Returns:
260
274
  A dictionary containing the tokenized data as well as labels.
261
275
  """
262
- # Tokenize the texts. We use the `is_split_into_words` argument here because
276
+ # Tokenise the texts. We use the `is_split_into_words` argument here because
263
277
  # the texts in our dataset are lists of words (with a label for each word)
264
278
  tokenized_inputs = tokeniser(
265
279
  examples["tokens"], is_split_into_words=True, truncation=True, padding=True
@@ -382,7 +396,7 @@ def handle_unk_tokens(
382
396
 
383
397
  Args:
384
398
  tokeniser:
385
- The tokeniser used to tokenize the words.
399
+ The tokeniser used to tokenise the words.
386
400
  tokens:
387
401
  The list of tokens.
388
402
  words:
@@ -409,7 +423,7 @@ def handle_unk_tokens(
409
423
  # Fetch the word
410
424
  word = words[word_idx]
411
425
 
412
- # Tokenize the word, which is now a list containing at least one UNK token
426
+ # Tokenise the word, which is now a list containing at least one UNK token
413
427
  tokens_with_unk = tokeniser.convert_ids_to_tokens(
414
428
  tokeniser.encode(word, add_special_tokens=False)
415
429
  )
euroeval/tasks.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """All benchmarks tasks used in EuroEval."""
2
2
 
3
3
  from . import metrics as m
4
+ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
4
5
  from .data_models import Task
5
6
  from .enums import GenerativeType, ModelType, TaskGroup
6
7
  from .prompt_templates import (
@@ -28,7 +29,7 @@ LA = Task(
28
29
  template_dict=LA_TEMPLATES,
29
30
  metrics=[m.mcc_metric, m.macro_f1_metric],
30
31
  default_num_few_shot_examples=12,
31
- default_max_generated_tokens=10,
32
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
32
33
  default_labels=["correct", "incorrect"],
33
34
  uses_logprobs=True,
34
35
  )
@@ -73,7 +74,7 @@ SENT = Task(
73
74
  template_dict=SENT_TEMPLATES,
74
75
  metrics=[m.mcc_metric, m.macro_f1_metric],
75
76
  default_num_few_shot_examples=12,
76
- default_max_generated_tokens=10,
77
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
77
78
  default_labels=["positive", "neutral", "negative"],
78
79
  uses_logprobs=True,
79
80
  )
@@ -87,7 +88,7 @@ SUMM = Task(
87
88
  default_num_few_shot_examples=1,
88
89
  default_max_generated_tokens=256,
89
90
  default_labels=[],
90
- allowed_model_types=[ModelType.GENERATIVE],
91
+ default_allowed_model_types=[ModelType.GENERATIVE],
91
92
  )
92
93
 
93
94
 
@@ -97,7 +98,7 @@ KNOW = Task(
97
98
  template_dict=MULTIPLE_CHOICE_TEMPLATES,
98
99
  metrics=[m.mcc_metric, m.accuracy_metric],
99
100
  default_num_few_shot_examples=5,
100
- default_max_generated_tokens=10,
101
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
101
102
  default_labels=["a", "b", "c", "d"],
102
103
  uses_logprobs=True,
103
104
  )
@@ -109,7 +110,7 @@ MCRC = Task(
109
110
  template_dict=MULTIPLE_CHOICE_TEMPLATES,
110
111
  metrics=[m.mcc_metric, m.accuracy_metric],
111
112
  default_num_few_shot_examples=5,
112
- default_max_generated_tokens=10,
113
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
113
114
  default_labels=["a", "b", "c", "d"],
114
115
  uses_logprobs=True,
115
116
  )
@@ -121,7 +122,7 @@ COMMON_SENSE = Task(
121
122
  template_dict=MULTIPLE_CHOICE_TEMPLATES,
122
123
  metrics=[m.mcc_metric, m.accuracy_metric],
123
124
  default_num_few_shot_examples=5,
124
- default_max_generated_tokens=10,
125
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
125
126
  default_labels=["a", "b", "c", "d"],
126
127
  uses_logprobs=True,
127
128
  )
@@ -133,15 +134,16 @@ EUROPEAN_VALUES = Task(
133
134
  template_dict=MULTIPLE_CHOICE_TEMPLATES,
134
135
  metrics=[m.european_values_metric],
135
136
  default_num_few_shot_examples=0,
136
- default_max_generated_tokens=10,
137
- default_labels=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
138
- allowed_model_types=[ModelType.GENERATIVE],
139
- allowed_generative_types=[
137
+ default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
138
+ default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
139
+ default_allowed_model_types=[ModelType.GENERATIVE],
140
+ default_allowed_generative_types=[
140
141
  GenerativeType.INSTRUCTION_TUNED,
141
142
  GenerativeType.REASONING,
142
143
  ],
143
144
  requires_zero_shot=True,
144
145
  uses_logprobs=True,
146
+ default_allow_invalid_model_outputs=False,
145
147
  )
146
148
 
147
149
 
@@ -1,4 +1,4 @@
1
- """Utility functions related to tokenization."""
1
+ """Utility functions related to tokenisation."""
2
2
 
3
3
  import logging
4
4
  import re
@@ -7,9 +7,8 @@ import typing as t
7
7
  import torch
8
8
  from transformers import MistralCommonTokenizer
9
9
 
10
- from euroeval.exceptions import InvalidModel
11
-
12
10
  from .enums import GenerativeType
11
+ from .exceptions import InvalidModel
13
12
  from .utils import log_once
14
13
 
15
14
  if t.TYPE_CHECKING:
@@ -80,8 +79,8 @@ def should_prompts_be_stripped(
80
79
  """Determine if we should strip the prompts for few-shot evaluation.
81
80
 
82
81
  This is the case if the tokeniser needs to include the space as part of the label
83
- token. The strategy is thus to tokenize a label with a preceeding colon (as in the
84
- prompts), i.e., ": positive", and check if the tokenization starts with the tokens
82
+ token. The strategy is thus to tokenise a label with a preceeding colon (as in the
83
+ prompts), i.e., ": positive", and check if the tokenisation starts with the tokens
85
84
  of ": ". If this is the case, then we should not strip the prompts, since the
86
85
  tokeniser produces the whitespace token separately.
87
86
 
@@ -89,7 +88,7 @@ def should_prompts_be_stripped(
89
88
  labels_to_be_generated:
90
89
  The labels that are to be generated.
91
90
  tokeniser:
92
- The tokeniser used to tokenize the labels.
91
+ The tokeniser used to tokenise the labels.
93
92
 
94
93
  Returns:
95
94
  Whether we should strip the prompts.
@@ -125,7 +124,7 @@ def should_prefix_space_be_added_to_labels(
125
124
  labels_to_be_generated:
126
125
  The labels that are to be generated.
127
126
  tokeniser:
128
- The tokeniser used to tokenize the labels.
127
+ The tokeniser used to tokenise the labels.
129
128
 
130
129
  Returns:
131
130
  Whether we should add a prefix space to the labels.
@@ -319,7 +318,9 @@ def get_pad_token(
319
318
  return pad_token, pad_token_id
320
319
 
321
320
 
322
- def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | None:
321
+ def get_end_of_chat_token_ids(
322
+ tokeniser: "PreTrainedTokenizer", generative_type: GenerativeType | None
323
+ ) -> list[int] | None:
323
324
  """Get the end token ID for chat models.
324
325
 
325
326
  This is only relevant for tokenisers with a chat template.
@@ -327,20 +328,23 @@ def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | N
327
328
  Args:
328
329
  tokeniser:
329
330
  The tokeniser.
331
+ generative_type:
332
+ The generative type, or None if not available.
330
333
 
331
334
  Returns:
332
335
  The token IDs used to end chats, or None if the tokeniser does not have a chat
333
336
  template or if no end-of-chat token could be found.
334
337
  """
335
- if not has_chat_template(tokeniser=tokeniser):
338
+ if generative_type == GenerativeType.BASE:
336
339
  return None
337
340
 
338
341
  user_message: dict[str, str] = dict(role="user", content="X")
339
342
  token_ids = apply_chat_template(
340
343
  conversation=[user_message],
341
344
  tokeniser=tokeniser,
342
- tokenize=True,
345
+ tokenise=True,
343
346
  add_generation_prompt=False,
347
+ enable_thinking=generative_type == GenerativeType.REASONING,
344
348
  )
345
349
  assert isinstance(token_ids, list)
346
350
 
@@ -421,7 +425,7 @@ def get_first_label_token_mapping(
421
425
  for label in dataset_config.labels
422
426
  ]
423
427
 
424
- # Tokenize some text containing each label, which we will use to extract the
428
+ # Tokenise some text containing each label, which we will use to extract the
425
429
  # first token of each label
426
430
  all_tokens: list[list[str]]
427
431
  if not has_chat_template(tokeniser=tokeniser):
@@ -440,11 +444,13 @@ def get_first_label_token_mapping(
440
444
  dict(role="user", content=""),
441
445
  dict(role="assistant", content=label),
442
446
  # Adding extra user message as Mistral tokenisers require
443
- # conversamtions to end with a user message
447
+ # conversations to end with a user message
444
448
  dict(role="user", content=""),
445
449
  ],
446
450
  tokeniser=tokeniser,
447
- tokenize=True,
451
+ tokenise=True,
452
+ add_generation_prompt=True,
453
+ enable_thinking=generative_type == GenerativeType.REASONING,
448
454
  )
449
455
  )
450
456
  for label in local_labels
@@ -538,9 +544,10 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
538
544
  def apply_chat_template(
539
545
  conversation: list[dict[str, str]],
540
546
  tokeniser: "PreTrainedTokenizer",
541
- tokenize: bool = False,
542
- add_generation_prompt: bool = True,
543
- **transformers_tokeniser_kwargs,
547
+ tokenise: bool,
548
+ add_generation_prompt: bool,
549
+ enable_thinking: bool,
550
+ **extra_kwargs,
544
551
  ) -> str | list[int]:
545
552
  """Apply the chat template to a prompt.
546
553
 
@@ -549,38 +556,47 @@ def apply_chat_template(
549
556
  The conversation to apply the chat template to.
550
557
  tokeniser:
551
558
  The tokeniser.
552
- tokenize:
553
- Whether to tokenize the resulting prompt, returning a list of token IDs
559
+ tokenise:
560
+ Whether to tokenise the resulting prompt, returning a list of token IDs
554
561
  instead of a string.
555
562
  add_generation_prompt:
556
563
  Whether to add a generation prompt at the end of the conversation. This is
557
564
  only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
558
565
  always add a generation prompt.
559
- **transformers_tokeniser_kwargs:
560
- Additional keyword arguments to pass to the tokeniser, in case the tokeniser
561
- is a regular Hugging Face tokeniser.
566
+ enable_thinking:
567
+ Whether to enable special handling for reasoning models, such as adding
568
+ special tokens for thinking. This is only relevant for regular Hugging
569
+ Face tokenisers, as Mistral tokenisers always handle reasoning models.
570
+ **extra_kwargs:
571
+ Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
572
+ method. Only relevant for regular Hugging Face tokenisers.
562
573
 
563
574
  Returns:
564
575
  The prompt with the chat template applied, either as a string or a list of
565
- token IDs, depending on the value of `tokenize`.
576
+ token IDs, depending on the value of `tokenise`.
566
577
 
567
578
  Raises:
568
579
  InvalidModel:
569
580
  If the tokeniser does not have a chat template.
570
581
  """
582
+ # Ensure that the first user message is not empty, as this can cause issues with
583
+ # Jinja2
584
+ conversation[0]["content"] = conversation[0]["content"] or " "
585
+
571
586
  if not has_chat_template(tokeniser=tokeniser):
572
587
  raise InvalidModel(
573
588
  "The tokeniser does not have a chat template, so cannot apply it."
574
589
  )
575
590
  elif isinstance(tokeniser, MistralCommonTokenizer):
576
591
  templated_prompt = tokeniser.apply_chat_template(
577
- conversation=conversation, tokenize=tokenize
592
+ conversation=conversation, tokenize=tokenise
578
593
  )
579
594
  else:
580
595
  templated_prompt = tokeniser.apply_chat_template(
581
596
  conversation=conversation,
582
597
  add_generation_prompt=add_generation_prompt,
583
- tokenize=tokenize,
584
- **transformers_tokeniser_kwargs,
598
+ tokenize=tokenise,
599
+ enable_thinking=enable_thinking,
600
+ **extra_kwargs,
585
601
  )
586
602
  return templated_prompt
euroeval/utils.py CHANGED
@@ -4,7 +4,6 @@ import asyncio
4
4
  import gc
5
5
  import importlib
6
6
  import importlib.metadata
7
- import importlib.util
8
7
  import logging
9
8
  import os
10
9
  import random
@@ -25,11 +24,12 @@ from datasets.utils import disable_progress_bar
25
24
  from requests.exceptions import RequestException
26
25
  from transformers import logging as tf_logging
27
26
 
28
- from .exceptions import NaNValueInModelOutput
27
+ from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
29
28
 
30
29
  if t.TYPE_CHECKING:
31
30
  from types import TracebackType
32
31
 
32
+ from .data_models import ModelIdComponents
33
33
  from .types import Predictions
34
34
 
35
35
 
@@ -347,7 +347,8 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
347
347
  loop = asyncio.new_event_loop()
348
348
  try:
349
349
  asyncio.set_event_loop(loop)
350
- return loop.run_until_complete(coroutine)
350
+ response = loop.run_until_complete(coroutine)
351
+ return response
351
352
  finally:
352
353
  loop.close()
353
354
  asyncio.set_event_loop(None)
@@ -457,3 +458,66 @@ def get_hf_token(api_key: str | None) -> str | bool:
457
458
  level=logging.DEBUG,
458
459
  )
459
460
  return False
461
+
462
+
463
+ def extract_multiple_choice_labels(
464
+ prompt: str, candidate_labels: list[str]
465
+ ) -> list[str]:
466
+ """Extract multiple choice labels from a prompt.
467
+
468
+ Args:
469
+ prompt:
470
+ The prompt to extract the labels from.
471
+ candidate_labels:
472
+ The candidate labels to look for in the prompt.
473
+
474
+ Returns:
475
+ The extracted labels.
476
+ """
477
+ sample_candidate_labels: list[str] = list()
478
+ for candidate_label in candidate_labels:
479
+ candidate_label_match = re.search(
480
+ pattern=rf"\b{candidate_label}\. ", string=prompt, flags=re.IGNORECASE
481
+ )
482
+ if candidate_label_match is not None:
483
+ sample_candidate_labels.append(candidate_label)
484
+ if not sample_candidate_labels:
485
+ raise InvalidBenchmark(
486
+ "Could not extract any candidate labels from the prompt. Please ensure "
487
+ "that the candidate labels are present in the prompt, each followed by a "
488
+ "dot and a space (e.g., 'a. '). The candidate labels are: "
489
+ f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
490
+ )
491
+ return sample_candidate_labels
492
+
493
+
494
+ def split_model_id(model_id: str) -> "ModelIdComponents":
495
+ """Split a model ID into its components.
496
+
497
+ Args:
498
+ model_id:
499
+ The model ID to split.
500
+
501
+ Returns:
502
+ The split model ID.
503
+
504
+ Raises:
505
+ If the model ID is not valid.
506
+ """
507
+ # Importing here to avoid circular imports
508
+ from .data_models import ModelIdComponents
509
+
510
+ # Attempt to extract the model ID, revision, and param using regex
511
+ model_id_match = re.match(pattern=r"^[^@#]+", string=model_id)
512
+ revision_match = re.search(pattern=r"@([^@#]+)", string=model_id)
513
+ param_match = re.search(pattern=r"#([^@#]+)", string=model_id)
514
+
515
+ # If we cannot extract the model ID, raise an error
516
+ if model_id_match is None:
517
+ raise InvalidModel(f"The model ID {model_id!r} is not valid.")
518
+ model_id = model_id_match.group()
519
+
520
+ # Extract the revision and param and return the result
521
+ revision = revision_match.group(1) if revision_match is not None else "main"
522
+ param = param_match.group(1) if param_match is not None else None
523
+ return ModelIdComponents(model_id=model_id, revision=revision, param=param)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 16.0.0
3
+ Version: 16.1.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
61
61
  Provides-Extra: all
62
62
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
64
65
  Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
65
66
  Provides-Extra: generative
66
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
67
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
68
70
  Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
69
71
  Description-Content-Type: text/markdown
70
72
 
@@ -0,0 +1,70 @@
1
+ euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
2
+ euroeval/benchmark_config_factory.py,sha256=NzNSiqix4hlVXk3xnyzdg2WDxomkectf97UWdVS3POo,11667
3
+ euroeval/benchmarker.py,sha256=JkhvYxhVpQPcWmDLzwnB8Yy6tTqj3yfDWTefklbI7RM,50355
4
+ euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
5
+ euroeval/cli.py,sha256=wUGetj9Ld4wkS872ZOfYqHIJMh58o8L2MDi78wU5nxI,9099
6
+ euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
7
+ euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
8
+ euroeval/data_models.py,sha256=S-PATp4F1wBwvra6wtjlJFXxZbZB_vEpJHXcdTTKA70,27593
9
+ euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
10
+ euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
+ euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
12
+ euroeval/generation.py,sha256=MSrd0oIkoqwKsCOaIkY2CFF_urXLOfNR1OO5nMvcCpY,12476
13
+ euroeval/generation_utils.py,sha256=OtEXLhI6L1vlbC768dH3xzj0qkokz43m0vswGKrRmBA,18061
14
+ euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
15
+ euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
16
+ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
17
+ euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
18
+ euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
19
+ euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
20
+ euroeval/tasks.py,sha256=3qEOBAMmfeqgXqlGkCKzQ-s0Yw-0-jPRgFZ97EZCFng,4535
21
+ euroeval/tokenisation_utils.py,sha256=jRIi9m8XmGh3LeZna47AWmJI9U9m4ojXQynQTe7kzWc,21344
22
+ euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
23
+ euroeval/utils.py,sha256=c0tFw1IXZIqgLU4EfY_k28iJ1ZlCZ_oFoKZH2sGCKYg,16499
24
+ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
+ euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
26
+ euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
27
+ euroeval/benchmark_modules/hf.py,sha256=oBjVumnSM9PW7ZocQwCGLKpbeGFWLN_71DBotxZo1aY,44038
28
+ euroeval/benchmark_modules/litellm.py,sha256=6EKjHnUoPCpuupISZHXqZsXLG8tyiA1-G12a5C6L8MM,64629
29
+ euroeval/benchmark_modules/vllm.py,sha256=sYFdVzB9CZX6_sGI4xghDyXoVn6I95_nbeFUWeSMXcc,43132
30
+ euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
31
+ euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
32
+ euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
33
+ euroeval/dataset_configs/english.py,sha256=7lS12Tj7FnMGkS4xj7UoZyymNX6PGXTVl5muPswIgAE,4737
34
+ euroeval/dataset_configs/estonian.py,sha256=tdnz0gmMR9yO5rm3SsIz-Wd0LmlCvi3UJ2M5r4VwkSE,3093
35
+ euroeval/dataset_configs/faroese.py,sha256=sFC25nwlPtnl6hwkPOxPkwVggPGTjw167YhSBnLl1EA,3039
36
+ euroeval/dataset_configs/finnish.py,sha256=esb5nu4HAEdqiP7F9klmME-tkjme01Qd89TOxTB1S20,4390
37
+ euroeval/dataset_configs/french.py,sha256=lZKhJcTpaG8n3y8u5KY61UfU9YzEHF9tIPKm8UakoBs,4720
38
+ euroeval/dataset_configs/german.py,sha256=gF0idcfDt5Iy89ozwgEXEYR_ukyYurdQSS1KITPz5aM,5130
39
+ euroeval/dataset_configs/icelandic.py,sha256=qX-szARxqzJ9l-h0k5iXirC5StpW_B3BOakZQ14zmpM,5797
40
+ euroeval/dataset_configs/italian.py,sha256=tJ_-OYRJ8wJX7ZCwdE4KJIScn1ijYigAXK3lDTZTA3E,5004
41
+ euroeval/dataset_configs/latvian.py,sha256=-zVftcd7Zl6MbrqL-zqBSixsIiPsbt5ZAqldE2wFOEI,2713
42
+ euroeval/dataset_configs/norwegian.py,sha256=ccLM2Zkf5eaFH1K1KyzqoMwkVNcXgjMQTxIhPf4tl_E,7745
43
+ euroeval/dataset_configs/polish.py,sha256=Z-9PT9KaopQUmBgFk5F85ve3pjQwTJqouG8IFgg5iqw,3672
44
+ euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuyaLKZit_9o,4089
45
+ euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
46
+ euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
47
+ euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
48
+ euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
49
+ euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
50
+ euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
51
+ euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
52
+ euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
53
+ euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
54
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=pRR1QBnYt5DnfxQp6dw1OYFZfIct-1R9pfdgPGpjoco,8667
55
+ euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
56
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=LT7J6Y9rUCJFimpnwujBZq_V5buSmXHJteIXbTOoaCE,16442
57
+ euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
58
+ euroeval/prompt_templates/sentiment_classification.py,sha256=BwnTpSdsAN_rL693ImgtKIRc5T_2G6ptWW0jCdC02NQ,9454
59
+ euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
60
+ euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
61
+ euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
62
+ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5Ei12cdRnrfq4pE-T7Y,27750
63
+ euroeval/task_group_utils/sequence_classification.py,sha256=qWUUrh4X4jK2XfUzP4aoPDoJhVJifrnDEaaw_F48hig,16080
64
+ euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
65
+ euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
66
+ euroeval-16.1.0.dist-info/METADATA,sha256=pYdW0IZwY8vatTA55EERxBK1kMaQuGhqzNys5xiSqsM,13729
67
+ euroeval-16.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
+ euroeval-16.1.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
+ euroeval-16.1.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
+ euroeval-16.1.0.dist-info/RECORD,,
@@ -1,69 +0,0 @@
1
- euroeval/__init__.py,sha256=MgFG1amMgiTJmK_hcQ7nnX-o4KFhlD1P5xKUBTloPCQ,3564
2
- euroeval/benchmark_config_factory.py,sha256=ZKzGkWr-Mr4wEMYNXUHsYkd2R-dxnNyETZJJ-Fq-my0,11386
3
- euroeval/benchmarker.py,sha256=YNqhl2QchqzbGMGu8QoJAG_mnYbcJ46ksfaS0x78fiw,49847
4
- euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
5
- euroeval/cli.py,sha256=RR45NiHMI9hphqBJ7Xopde-C18Be9JgJxgg6eYPFVMM,8594
6
- euroeval/constants.py,sha256=HWJ3PJRS-ZbAMXTvujiK8QP7IiS4RHkjnegv3oi52w0,2499
7
- euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
8
- euroeval/data_models.py,sha256=NdzD1ER3GHJp51UXLGTW8iTYwzZlITH2nO0vanTkEWU,24272
9
- euroeval/enums.py,sha256=V73E8FTL1aRz74OKcxokTYLnO7Q8HGs2QI0JPZI4qQo,3032
10
- euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
- euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
12
- euroeval/generation.py,sha256=wm2u8fDGDgtWxCReG3N6v4_lLvo0OHTpR88ThGSRH7A,12139
13
- euroeval/generation_utils.py,sha256=vU-j9kjFDuPlSizEaRByx_XJyyAVpE8PdGOm9i--9zQ,14613
14
- euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
15
- euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
16
- euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
17
- euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
18
- euroeval/scores.py,sha256=gJ7DSQVyE2_8qZxJPuUJcFk7Byj2D7nevE23kd4XMbA,3004
19
- euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
20
- euroeval/tasks.py,sha256=jl8HicriMSN_LfHANokVGFqzgV53QcJ5dmzb297xI04,4173
21
- euroeval/tokenization_utils.py,sha256=icEfttWReKRC5MbREOuxTHOPpuVvH6uHhnqz1w7qIyA,20565
22
- euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
23
- euroeval/utils.py,sha256=O4JIROPfbA7MD9SbOY0CifoCckYjmdNjXYjOxDwBnwM,14149
24
- euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
- euroeval/benchmark_modules/base.py,sha256=vYW97bnlzqxxcIq6lY-zd0o6zxyDRMhT85jOhdKnoYE,11482
26
- euroeval/benchmark_modules/fresh.py,sha256=_iRTHt9qUkq7jPOlgwx7IwZG48dK4mjMrh7KiEHeUjE,10462
27
- euroeval/benchmark_modules/hf.py,sha256=HDXuVwt0kZUyL9x3aG5pEjSdGCRfzegqT0xKZYprjU0,43843
28
- euroeval/benchmark_modules/litellm.py,sha256=M6ct5ppcYfO-Il5VMRm3PuyAeQ-rtS22UKyRStLnqfM,59210
29
- euroeval/benchmark_modules/vllm.py,sha256=dTwGGOFQ7wqYXg7x2YBUJNQcO6OwqjTMBfUf5OveXNk,41289
30
- euroeval/dataset_configs/__init__.py,sha256=lEOr4kJzgtUymeNBVhd-VwdUK0YTUZ3GjUMlLz5fGWk,2010
31
- euroeval/dataset_configs/danish.py,sha256=3n9e0r-hYRI2hPOgLDMQsO8bPgZKjw7OcFCUsCvdmk4,5294
32
- euroeval/dataset_configs/dutch.py,sha256=tY7FDw7BmhXxNfI1hqfasxQXP0QbYTqknokTZ7gqdRY,5079
33
- euroeval/dataset_configs/english.py,sha256=Y4yc3AQu8WojqENj0sy4-rIlx1LhPnsCQ0DeonqDsVs,4128
34
- euroeval/dataset_configs/estonian.py,sha256=o13P_XkrdhLFCz9l8LJy-TSY3JIN7XmByxesEDiagnc,2879
35
- euroeval/dataset_configs/faroese.py,sha256=sFC25nwlPtnl6hwkPOxPkwVggPGTjw167YhSBnLl1EA,3039
36
- euroeval/dataset_configs/finnish.py,sha256=7iXjjpJ23tupvtXwJF3TH1Tzwhxw0RFaoBv38HclsJc,3950
37
- euroeval/dataset_configs/french.py,sha256=9ofGQpnjw0j_lPB0SuWMvbuWVZXfOvROMqZ03d-EAHs,4281
38
- euroeval/dataset_configs/german.py,sha256=qsJO2YCND8Kuc_atSWXjkoD2itUQNbUsExiGk7P0OnE,4459
39
- euroeval/dataset_configs/icelandic.py,sha256=qX-szARxqzJ9l-h0k5iXirC5StpW_B3BOakZQ14zmpM,5797
40
- euroeval/dataset_configs/italian.py,sha256=xoS_oIFXnTraiV9PX2dBsE1GyodlAbma5dEB7yM_Q8A,4564
41
- euroeval/dataset_configs/latvian.py,sha256=tibwTbe-atsRZEBbegJ6nbr1Oh4RthUYhZoHPVVawq0,2273
42
- euroeval/dataset_configs/norwegian.py,sha256=eTX0KpjH60FyLGrUTfspvNvYaL-Ytfw3DTFftlriVM0,7295
43
- euroeval/dataset_configs/portuguese.py,sha256=x-Idrdo_EtmB_xoabwKivKG091DvFEQEbO6MTcjZVqs,3646
44
- euroeval/dataset_configs/spanish.py,sha256=5m3Qh328YPhbN8jFPIy9Sa7ZWob02ToCWzlDoT8IsSw,4462
45
- euroeval/dataset_configs/swedish.py,sha256=j_I7ba9a0nXzEPvpnPTuNFEkS51pnUPrnRwcqGh7tu0,4715
46
- euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
47
- euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
48
- euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
49
- euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
50
- euroeval/metrics/pipeline.py,sha256=T65p2sxPnwh2WgCjqsqzvE3XOzizNY7rlSm8KPR7sCk,8883
51
- euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
52
- euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
53
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=9ZIyv_hfI2Aj20Uy9SY1izq5OBRV844PXPiZCNCOoEY,8207
54
- euroeval/prompt_templates/multiple_choice.py,sha256=TCMKB0xS5IEa8f4YEUjsoifcUpaIv4yOL4FisVvPwok,6423
55
- euroeval/prompt_templates/named_entity_recognition.py,sha256=_ZRVDcnbXvTs_C2NXy78oMbCLFDtW9SuxmvSVg51Umo,15554
56
- euroeval/prompt_templates/reading_comprehension.py,sha256=eRMN-kCT3wuImbuFXzZYfo5WiVhCFWJkCYwRUDtpeWo,8208
57
- euroeval/prompt_templates/sentiment_classification.py,sha256=eIXn-aAY7LKeXqxzMKoqdVbihA2f1RaNQk7DhceuQdQ,8887
58
- euroeval/prompt_templates/summarization.py,sha256=GvnKuYJKbJ_2QkdtSWp_h4RhfOXdq-7_yYeClJSPaTY,6137
59
- euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
60
- euroeval/task_group_utils/multiple_choice_classification.py,sha256=lNEOWi3ckLBnMP1QoSTxNxT-s6kBz2XH17mrmjQlv5s,7075
61
- euroeval/task_group_utils/question_answering.py,sha256=vdEbcZy7BE6ICA7kWkPYmPW4eVuIiZ_4uJRLUexDhwY,27750
62
- euroeval/task_group_utils/sequence_classification.py,sha256=K_hFWY6D5WR8-uy6ZikCq3ighHNHSyzW7A62vwDkwDs,16512
63
- euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
64
- euroeval/task_group_utils/token_classification.py,sha256=6bN9soT1kLthutCpqUT-jDmZZw9Mt7H3tjI4zVvE4BY,16469
65
- euroeval-16.0.0.dist-info/METADATA,sha256=uvzi8Bkgab8rKhgKavqFnv8rpL0KntFIYMZ7f1Joa0U,13544
66
- euroeval-16.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
67
- euroeval-16.0.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
68
- euroeval-16.0.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
69
- euroeval-16.0.0.dist-info/RECORD,,