EuroEval 15.7.0-py3-none-any.whl → 15.7.2-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.


euroeval/generation_utils.py ADDED
@@ -0,0 +1,346 @@
+ """Utility functions related to generative models."""
+
+ import itertools as it
+ import json
+ import logging
+ import random
+ import typing as t
+
+ from .enums import TaskGroup
+ from .exceptions import InvalidBenchmark
+ from .utils import log_once
+
+ if t.TYPE_CHECKING:
+     from datasets import DatasetDict
+     from transformers.tokenization_utils import PreTrainedTokenizer
+
+     from .data_models import DatasetConfig, ModelConfig
+
+ logger = logging.getLogger("euroeval")
+
+
+ def extract_few_shot_examples(
+     dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
+ ) -> list[dict[str, t.Any]]:
+     """Extract few-shot examples from a dataset.
+
+     This will always extract the examples from the training split.
+
+     We ensure that the few-shot examples are unique by picking them one at a time.
+
+     Args:
+         dataset:
+             The dataset to extract the few-shot examples from.
+         dataset_config:
+             The dataset configuration.
+         itr_idx:
+             The index of the dataset in the iterator.
+
+     Returns:
+         The few-shot examples.
+     """
+     random_seed = 4242 + itr_idx
+     num_few_shots = dataset_config.num_few_shot_examples
+     few_shot_examples: list[dict[str, t.Any]] = list()
+     shuffled_train = dataset["train"].shuffle(seed=random_seed)
+
+     match dataset_config.task.task_group:
+         case (
+             TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+         ):
+             # Locate the maximum number of tokens that constitutes a short example
+             for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                 train_with_short_examples = dataset["train"].filter(
+                     lambda example: len(example["text"]) < max_num_tokens
+                 )
+                 num_short_examples = len(train_with_short_examples)
+                 if num_short_examples >= dataset_config.num_few_shot_examples:
+                     break
+             else:
+                 raise InvalidBenchmark(
+                     "Could not find enough short examples for few-shot learning."
+                 )
+
+             shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+             labels = it.cycle(dataset_config.labels)
+             while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                 label = next(labels)
+                 possible_examples = shuffled_train.filter(
+                     lambda x: x["label"].lower() == label.lower()
+                 )
+                 if len(possible_examples) == 0:
+                     continue
+                 example = possible_examples.select(range(1))[0]
+                 few_shot_examples.append(example)
+                 shuffled_train = shuffled_train.filter(
+                     lambda x: x["text"] != example["text"]
+                 )
+
+         case TaskGroup.TEXT_TO_TEXT:
+             while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                 example = shuffled_train.select(range(1))[0]
+                 few_shot_examples.append(example)
+                 shuffled_train = shuffled_train.filter(
+                     lambda x: x["text"] != example["text"]
+                 )
+
+         case TaskGroup.TOKEN_CLASSIFICATION:
+             labels = it.cycle(
+                 [
+                     label.lower()
+                     for label in dataset_config.labels
+                     if label.lower().startswith("b-")
+                 ]
+             )
+             while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                 label = next(labels)
+                 possible_examples = shuffled_train.filter(
+                     lambda x: label in [tag.lower() for tag in x["labels"]]
+                 )
+                 if len(possible_examples) == 0:
+                     continue
+                 example = possible_examples.select(range(1))[0]
+                 few_shot_examples.append(example)
+                 shuffled_train = shuffled_train.filter(
+                     lambda x: x["tokens"] != example["tokens"]
+                 )
+
+         case TaskGroup.QUESTION_ANSWERING:
+             # Locate the maximum number of tokens that constitutes a short example
+             for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                 train_with_short_examples = dataset["train"].filter(
+                     lambda example: len(example["context"]) < max_num_tokens
+                 )
+                 num_short_examples = len(train_with_short_examples)
+                 if num_short_examples >= dataset_config.num_few_shot_examples:
+                     break
+             else:
+                 raise InvalidBenchmark(
+                     "Could not find enough short examples for few-shot learning."
+                 )
+
+             shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+             while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                 example = shuffled_train.select(range(1))[0]
+                 few_shot_examples.append(example)
+                 shuffled_train = shuffled_train.filter(
+                     lambda x: x["context"] != example["context"]
+                 )
+
+         case _:
+             raise NotImplementedError(
+                 f"Unsupported task group: {dataset_config.task.task_group}."
+             )
+
+     random.seed(random_seed)
+     random.shuffle(few_shot_examples)
+     return few_shot_examples
+
+
+ def apply_prompt(
+     examples: dict[str, t.Any],
+     few_shot_examples: list[dict[str, t.Any]],
+     model_config: "ModelConfig",
+     dataset_config: "DatasetConfig",
+     instruction_model: bool,
+     always_populate_text_field: bool,
+     tokenizer: "PreTrainedTokenizer | None",
+ ) -> dict[str, t.Any]:
+     """Apply prompt template to an example, potentially with few-shot examples.
+
+     Args:
+         examples:
+             The examples to apply the few-shot examples to.
+         few_shot_examples:
+             The few-shot examples to apply.
+         model_config:
+             The model configuration.
+         dataset_config:
+             The dataset configuration.
+         instruction_model:
+             Whether the model is instruction-tuned.
+         always_populate_text_field:
+             Whether to always populate the 'text' field in the examples, as opposed to
+             the 'messages' field.
+         tokenizer:
+             The tokenizer to use for the model. If None, the tokenizer is not used.
+
+     Returns:
+         The example with the few-shot examples applied.
+     """
+     # Sanity check
+     if instruction_model and always_populate_text_field and tokenizer is None:
+         raise ValueError(
+             "The `tokenizer` argument must be provided when the model is instruction "
+             "tuned and when we are not just returning the raw messages."
+         )
+
+     def create_prompt(**kwargs: str) -> tuple[str, str]:
+         """Create a prompt from the given keyword arguments.
+
+         Args:
+             kwargs:
+                 The keyword arguments to use in the prompt.
+
+         Returns:
+             A pair (prompt, label), where "label" is an empty string if the model is
+             not instruction tuned (as in this case it is included in the prompt).
+         """
+         label_key = "label" if "label" in kwargs else "target_text"
+         label = kwargs.pop(label_key)
+         assert label is not None, (
+             f"Found a None label for the prompt: {kwargs}. This should not happen."
+         )
+         label_mapping = dataset_config.prompt_label_mapping
+         label = label_mapping.get(label, label)
+         if instruction_model:
+             prompt = dataset_config.instruction_prompt.format(**kwargs)
+             return prompt, label
+         else:
+             kwargs[label_key] = label
+             return dataset_config.prompt_template.format(**kwargs), ""
+
+     match dataset_config.task.task_group:
+         case (
+             TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+         ):
+             few_shot_sections = [
+                 create_prompt(
+                     text=example["text"].replace("\n", " ").strip(),
+                     label=example["label"].replace("\n", " ").strip(),
+                 )
+                 for example in few_shot_examples
+             ]
+             new_sections = [
+                 create_prompt(text=text.replace("\n", " ").strip(), label="")
+                 for text in examples["text"]
+             ]
+
+         case TaskGroup.TEXT_TO_TEXT:
+             few_shot_sections = [
+                 create_prompt(
+                     text=example["text"].replace("\n", " ").strip(),
+                     target_text=example["target_text"].replace("\n", " ").strip(),
+                 )
+                 for example in few_shot_examples
+             ]
+             new_sections = [
+                 create_prompt(text=text.replace("\n", " ").strip(), target_text="")
+                 for text in examples["text"]
+             ]
+
+         case TaskGroup.TOKEN_CLASSIFICATION:
+
+             def create_label(example: dict) -> str:
+                 prompt_labels = dataset_config.prompt_label_mapping.values()
+                 labels: dict[str, list[str]] = {
+                     prompt_label: list() for prompt_label in prompt_labels
+                 }
+                 for token, label in zip(example["tokens"], example["labels"]):
+                     label = label.lower()
+                     if label == "o":
+                         continue
+                     prompt_label = dataset_config.prompt_label_mapping[label]
+                     if label.startswith("b-"):
+                         labels[prompt_label].append(token)
+                     elif label.startswith("i-"):
+                         labels[prompt_label][-1] += " " + token
+                 return json.dumps(labels, ensure_ascii=False)
+
+             few_shot_sections = [
+                 create_prompt(
+                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
+                     label=create_label(example=example),
+                 )
+                 for example in few_shot_examples
+             ]
+             new_sections = [
+                 create_prompt(
+                     text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                 )
+                 for tokens in examples["tokens"]
+             ]
+
+         case TaskGroup.QUESTION_ANSWERING:
+             few_shot_sections = [
+                 create_prompt(
+                     text=example["context"].replace("\n", " ").strip(),
+                     question=example["question"].replace("\n", " ").strip(),
+                     label=example["answers"]["text"][0].replace("\n", " "),
+                 )
+                 for example in few_shot_examples
+             ]
+             new_sections = [
+                 create_prompt(
+                     text=context.replace("\n", " ").strip(),
+                     question=question.replace("\n", " ").strip(),
+                     label="",
+                 )
+                 for context, question in zip(examples["context"], examples["question"])
+             ]
+
+         case _:
+             raise NotImplementedError(
+                 f"Unsupported task group: {dataset_config.task.task_group}."
+             )
+
+     if instruction_model:
+         few_shot_messages = [
+             dict(role=role, content=content)
+             for prompt, label in few_shot_sections
+             for role, content in [("user", prompt), ("assistant", label)]
+         ]
+
+         messages_list = [
+             few_shot_messages + [dict(role="user", content=prompt)]
+             for prompt, _ in new_sections
+         ]
+
+         if not always_populate_text_field:
+             examples["messages"] = messages_list
+
+         else:
+             assert tokenizer is not None
+
+             # Pick the chat template that matches the language of the dataset, if such a
+             # template exists
+             chat_template: str | None = None
+             if isinstance(tokenizer.chat_template, dict):
+                 language_codes = [
+                     language.code for language in dataset_config.languages
+                 ]
+                 for name, candidate_template in tokenizer.chat_template.items():
+                     if name.lower() in language_codes:
+                         chat_template = candidate_template
+                         log_once(
+                             f"Using the {name!r} chat template for the tokenizer for "
+                             f"model {model_config.model_id!r}.",
+                             level=logging.DEBUG,
+                         )
+                         break
+
+             texts = [
+                 tokenizer.apply_chat_template(
+                     conversation=messages,
+                     tokenize=False,
+                     add_generation_prompt=True,
+                     chat_template=chat_template,
+                 )
+                 for messages in messages_list
+             ]
+
+             examples["text"] = texts
+
+     else:
+         prompt_prefix = ""
+         if dataset_config.prompt_prefix:
+             prompt_prefix = dataset_config.prompt_prefix + "\n\n"
+
+         few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
+         if few_shot_prompt:
+             few_shot_prompt += "\n\n"
+
+         examples["text"] = [
+             prompt_prefix + few_shot_prompt + new_prompt
+             for new_prompt, _ in new_sections
+         ]
+
+     return examples
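
For orientation, here is a minimal usage sketch (not part of the package) of how the two new helpers above could be wired together; the DatasetConfig/ModelConfig objects, the tokenizer, and the flag values are placeholder assumptions, not EuroEval's actual call site.

# Hypothetical wiring of extract_few_shot_examples and apply_prompt; in
# practice the config objects come from EuroEval's own factories.
from functools import partial

from euroeval.generation_utils import apply_prompt, extract_few_shot_examples

def build_prompted_split(dataset, dataset_config, model_config, tokenizer):
    # Unique few-shot examples from the training split; itr_idx seeds the
    # shuffling, so each benchmark iteration sees a different draw.
    few_shots = extract_few_shot_examples(
        dataset=dataset, dataset_config=dataset_config, itr_idx=0
    )
    # apply_prompt operates on batched examples (dicts of lists), so it can
    # be passed directly to datasets' map with batched=True.
    return dataset["test"].map(
        partial(
            apply_prompt,
            few_shot_examples=few_shots,
            model_config=model_config,
            dataset_config=dataset_config,
            instruction_model=True,
            always_populate_text_field=False,
            tokenizer=tokenizer,
        ),
        batched=True,
    )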
euroeval/languages.py CHANGED
@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
  DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
  NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
  EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+ FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
  FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
  FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
  DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
  ET = Language(code="et", name="Estonian")
  EE = Language(code="ee", name="Ewe")
  FJ = Language(code="fj", name="Fijian")
- FI = Language(code="fi", name="Finnish")
  FY = Language(code="fy", name="Western Frisian")
  FF = Language(code="ff", name="Fulah")
  GD = Language(code="gd", name="Gaelic")
euroeval/scores.py CHANGED
@@ -18,6 +18,7 @@ def log_scores(
      metric_configs: list["MetricConfig"],
      scores: list[dict[str, float]],
      model_id: str,
+     model_revision: str,
  ) -> "ScoreDict":
      """Log the scores.

@@ -30,13 +31,18 @@
              The scores that are to be logged. This is a list of dictionaries full of
              scores.
          model_id:
-             The full Hugging Face Hub path to the pretrained transformer model.
+             The model ID of the model that was evaluated.
+         model_revision:
+             The revision of the model.

      Returns:
          A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
          identical to `scores` and 'total' being a dictionary with the aggregated scores
          (means and standard errors).
      """
+     if model_revision and model_revision != "main":
+         model_id += f"@{model_revision}"
+
      logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")

      total_dict: dict[str, float] = dict()
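
The effect of the new model_revision argument is easiest to see in isolation; a toy illustration with hypothetical values:

# Non-default revisions are folded into the logged model ID; "main" and
# empty revisions leave the ID untouched.
model_id, model_revision = "my-org/my-model", "v1.2"
if model_revision and model_revision != "main":
    model_id += f"@{model_revision}"
assert model_id == "my-org/my-model@v1.2"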
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -132,6 +132,11 @@ def extract_labels_from_generation(
          The predicted labels.
      """
      if model_output.scores is not None:
+         if first_label_token_mapping is False:
+             raise InvalidBenchmark(
+                 "The model outputted logprobs, but the first label token mapping is "
+                 "not provided. This means that the model should not output logprobs."
+             )
          labels = get_closest_logprobs_labels(
              generation_logprobs=model_output.scores,
              dataset_config=dataset_config,
@@ -147,7 +152,7 @@
  def get_closest_logprobs_labels(
      generation_logprobs: list[list[list[tuple[str, float]]]],
      dataset_config: "DatasetConfig",
-     first_label_token_mapping: dict[str, str] | bool,
+     first_label_token_mapping: dict[str, str] | t.Literal[True],
  ) -> list[str] | None:
      """Get the labels with the highest predicted logprob value.

@@ -164,8 +169,7 @@
          The configuration of the dataset.
          first_label_token_mapping:
              A mapping from labels to the first token in each label, or alternatively a
-             Boolean value indicating whether the model should output scores (if the
-             mapping is outputted then the model will always output scores).
+             `True` value indicating that the model should output logprobs.

      Returns:
          The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@
          # label, as the output label
          output_label: str | None = None
          for generated_label in generated_labels:
-             # Get the candidate labels that start with the generated label
+             # Get the candidate labels. If we have a first label token mapping, we
+             # use it to get the candidate labels. Otherwise, we check if any of the
+             # labels start with the generated label.
              if isinstance(first_label_token_mapping, dict):
                  if any(
                      candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@
                  )
                  return None

-             # If no candidate label is found, we ignore the generated label, as it
-             # basically means that the model is just really bad at generating
-             # labels.
+             # If no candidate label is found, we first check if any of the labels
+             # start with the generated label. This could be the case if the labels
+             # in the first token mapping are inaccurate or incomplete, for instance
+             # if 'pos' is in the first label token mapping, but the model outputted
+             # 'posit'. If this is the case then we cannot trust the first label
+             # token mapping, and we fall back to using word edit distance.
+             # Otherwise, the generated label is just bad, and we skip to the next
+             # generated label.
              elif len(candidate_output_labels) == 0:
-                 logger.debug(
-                     f"No candidate label found for the generated label "
-                     f"{generated_label!r}. The generated label is thus ignored."
-                 )
+                 candidate_output_labels_starting_with_generated_label = [
+                     candidate_label
+                     for candidate_label in candidate_labels
+                     if candidate_label.startswith(generated_label)
+                 ]
+                 if candidate_output_labels_starting_with_generated_label:
+                     log_once(
+                         f"No candidate label found for the generated label "
+                         f"{generated_label!r}. This means that using logprobs to "
+                         "extract the labels is not reliable, and we will instead "
+                         "fall back to extracting the labels using word edit "
+                         "distance.",
+                         level=logging.DEBUG,
+                     )
+                     return None
+
+         # If we did not find any candidate label for any of the generated labels, we
+         # assume that something is wrong with the model output, and we fall back to
+         # using word edit distance to extract the labels
+         else:
+             log_once(
+                 f"No candidate label found for any of the generated labels "
+                 f"{generated_labels}. This means that using logprobs to extract "
+                 "the labels is not reliable, and we will instead fall back to "
+                 "extracting the labels using word edit distance.",
+                 level=logging.DEBUG,
+             )
+             return None

      if output_label is not None:
          output_labels.append(output_label)
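
The new fallback hinges on a plain prefix check; a toy illustration with made-up labels:

# The first-token mapping contained 'pos', but the model generated 'posit'.
# A candidate label still starts with the generated text, so the mapping
# cannot be trusted and the function returns None, triggering the
# word-edit-distance fallback.
candidate_labels = ["positive", "negative"]
generated_label = "posit"
matches = [c for c in candidate_labels if c.startswith(generated_label)]
assert matches == ["positive"]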
euroeval/tokenization_utils.py CHANGED
@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
          for label in dataset_config.labels
      ]

-     # Get the first token of each label, where we add a prefix space if needed
-     add_prefix_space = (
-         should_prefix_space_be_added_to_labels(
+     # Tokenize some text containing each label, which we will use to extract the
+     # first token of each label
+     all_tokens: list[list[str]]
+     if tokenizer.chat_template is None:
+         add_prefix_space = should_prefix_space_be_added_to_labels(
              labels_to_be_generated=local_labels, tokenizer=tokenizer
          )
-         and tokenizer.chat_template is None
-     )
-     first_tokens = [
-         tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
-         for label in local_labels
-     ]
-     first_tokens = [
-         re.sub(
-             pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
-         )
-         for token in first_tokens
+         all_tokens = [
+             tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+             for label in local_labels
+         ]
+     else:
+         all_tokens = [
+             tokenizer.convert_ids_to_tokens(
+                 ids=tokenizer.apply_chat_template(
+                     conversation=[
+                         dict(role="user", content=""),
+                         dict(role="assistant", content=label),
+                     ],
+                     add_generation_prompt=True,
+                     tokenize=True,
+                 )
+             )
+             for label in local_labels
+         ]
+
+     # Remove any non-alphabetic characters from the tokens
+     all_tokens = [
+         [
+             re.sub(
+                 pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+                 repl="",
+                 string=token.lower(),
+             )
+             for token in token_list
+         ]
+         for token_list in all_tokens
      ]

+     # Extract the first token of each label
+     first_tokens: list[str] = list()
+     for token_list, label in zip(all_tokens, local_labels):
+         matching_tokens = [
+             tok for tok in token_list if tok and label.startswith(tok)
+         ]
+         if not matching_tokens:
+             log_once(
+                 f"No matching token found in token_list for label '{label}', so "
+                 "we will not output scores.",
+                 level=logging.DEBUG,
+             )
+             return False
+         first_tokens.append(matching_tokens[0])
+
      # Build a mapping from labels to the first token in each label if the first
      # tokens are distinct
      if len(first_tokens) == len(set(first_tokens)):
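
The new chat-template branch can be exercised against any Hugging Face tokenizer; a minimal sketch, where "some-org/some-chat-model" stands in for a real instruction-tuned checkpoint:

# Sketch of the chat-template branch: tokenize an empty user turn followed by
# an assistant turn containing the label, then inspect the resulting tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("some-org/some-chat-model")
token_ids = tokenizer.apply_chat_template(
    conversation=[
        dict(role="user", content=""),
        dict(role="assistant", content="positive"),
    ],
    add_generation_prompt=True,
    tokenize=True,
)
tokens = tokenizer.convert_ids_to_tokens(token_ids)
# After stripping non-alphabetic characters, the first token that "positive"
# starts with becomes that label's entry in the first-label-token mapping.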
euroeval-15.7.0.dist-info/METADATA → euroeval-15.7.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.7.0
+ Version: 15.7.2
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
euroeval-15.7.0.dist-info/RECORD → euroeval-15.7.2.dist-info/RECORD CHANGED
@@ -1,38 +1,39 @@
  euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
- euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
- euroeval/benchmarker.py,sha256=gOLNpW11cBX_8AvotnlGNbejtOM4acmXS3aovNREqhA,48434
+ euroeval/benchmark_config_factory.py,sha256=RDYotoLcfNr3xU8Cw-G-Y8wLe6RSlJD1Ok9C97lWfOs,12553
+ euroeval/benchmarker.py,sha256=4tCrs0CvKvQcMpJRtaonxELEDXkmY95stCGwht6wTGE,48649
  euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
  euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
  euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
- euroeval/data_models.py,sha256=Nlb2s26u5OvQ2AITAt25NMpeI1IHM2_qqbpyU_bZhiY,22907
+ euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
  euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
  euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
+ euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
  euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
- euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
+ euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
  euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
  euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
- euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
+ euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
  euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
  euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
- euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487
+ euroeval/tokenization_utils.py,sha256=RYTYbzCM9cryZ_w-_CzyN9Sbt47DbaGU5ukm-H38sHI,13871
  euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
  euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
  euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
  euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
- euroeval/benchmark_modules/litellm.py,sha256=9Fhh7Zyn6F4JBlRoQkST1wIeb8z0YliRRrcmD5pONs4,52551
- euroeval/benchmark_modules/vllm.py,sha256=vwAE7SGRhePqkzAt1S-FKPelEqe8VMGwah9Nj2J1hLs,51295
- euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
+ euroeval/benchmark_modules/litellm.py,sha256=_32H-M1L_TfW-opyaMLJFPxx0iOG8A8Zfq7uVGFKZdA,43005
+ euroeval/benchmark_modules/vllm.py,sha256=DJyla0jr-DVMPPs4RBguxq1Xn5YguvyuAnIlgIOfFaw,39394
+ euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
- euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
+ euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
  euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
  euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
- euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730
+ euroeval/dataset_configs/finnish.py,sha256=lZA2bY_ul9qh3uGFrTNe7q15WyZ04EL9OYmrkcNjygY,1857
  euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
  euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
  euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
@@ -50,11 +51,11 @@ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5w
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
  euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
- euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
+ euroeval/task_group_utils/sequence_classification.py,sha256=MCdO5h3v_LWTkrvKAeefPq7rl1H5mFed50nAL4uZq0E,13837
  euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
  euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
- euroeval-15.7.0.dist-info/METADATA,sha256=8oMsbhHWeO7j4KQdn4lpt-O94Nw0erwRoD_Ogk6CX2U,13669
- euroeval-15.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.7.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.7.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
- euroeval-15.7.0.dist-info/RECORD,,
+ euroeval-15.7.2.dist-info/METADATA,sha256=nCF9GI8kOoKP3Up_KgPSxe4pnomawC1rQqRGlYoEsIA,13669
+ euroeval-15.7.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.7.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.7.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.7.2.dist-info/RECORD,,