EuroEval 15.6.1-py3-none-any.whl → 15.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 SENT_TEMPLATES = {
     DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiivinen", neutral="neutrali", negative="negatiivinen"
+        ),
+        default_prompt_prefix="Seuraavassa on arvosteluja ja niiden tunnesävy, joka "
+        "voi olla {labels_str}.",
+        default_prompt_template="Teksti: {text}\nTunnesävy: {label}",
+        default_instruction_prompt="Teksti: {text}\n\nLuokittele arvostelun tunnesävy. "
+        "Vastaa vain {labels_str}, ei muuta.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positivt", neutral="neutralt", negative="negativt"
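For context, the sketch below shows how the added Finnish sentiment fields could be assembled into a few-shot prompt. The `render_sentiment_prompt` helper and the way the pieces are joined are illustrative assumptions, not EuroEval's internal prompt builder.

```python
# Illustrative sketch only: fills the new Finnish sentiment template by hand.
prompt_prefix = (
    "Seuraavassa on arvosteluja ja niiden tunnesävy, joka voi olla {labels_str}."
)
prompt_template = "Teksti: {text}\nTunnesävy: {label}"
label_mapping = {
    "positive": "positiivinen",
    "neutral": "neutrali",
    "negative": "negatiivinen",
}


def render_sentiment_prompt(examples: list[tuple[str, str]], query: str) -> str:
    """Assemble a few-shot prompt from the Finnish template (illustrative only)."""
    labels_str = ", ".join(label_mapping.values())
    parts = [prompt_prefix.format(labels_str=labels_str)]
    for text, label in examples:
        parts.append(prompt_template.format(text=text, label=label_mapping[label]))
    # The final block leaves the label empty for the model to complete.
    parts.append(prompt_template.format(text=query, label="").rstrip())
    return "\n\n".join(parts)


print(render_sentiment_prompt([("Elokuva oli loistava!", "positive")], "Tuote hajosi heti."))
```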
euroeval/prompt_templates/summarization.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES = {
@@ -36,6 +36,14 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
+        "tiivistelmiä.",
+        default_prompt_template="Uutisartikkeli: {text}\nTiivistelmä: {target_text}",
+        default_instruction_prompt="Uutisartikkeli: {text}\n\nKirjoita tiivistelmä "
+        "yllä olevasta artikkelista.",
+        default_prompt_label_mapping=dict(),
+    ),
     FR: PromptConfig(
         default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
         default_prompt_template="Document: {text}\nRésumé: {target_text}",
euroeval/scores.py CHANGED
@@ -18,6 +18,7 @@ def log_scores(
     metric_configs: list["MetricConfig"],
     scores: list[dict[str, float]],
     model_id: str,
+    model_revision: str,
 ) -> "ScoreDict":
     """Log the scores.
 
@@ -30,13 +31,18 @@ def log_scores(
             The scores that are to be logged. This is a list of dictionaries full of
             scores.
         model_id:
-            The full Hugging Face Hub path to the pretrained transformer model.
+            The model ID of the model that was evaluated.
+        model_revision:
+            The revision of the model.
 
     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
         identical to `scores` and 'total' being a dictionary with the aggregated scores
         (means and standard errors).
     """
+    if model_revision and model_revision != "main":
+        model_id += f"@{model_revision}"
+
    logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
 
    total_dict: dict[str, float] = dict()
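The new `model_revision` argument only changes how the model is reported: a non-default revision is appended to the model ID with an `@` separator before logging. A standalone sketch of that naming rule (the `displayed_model_id` helper and the model IDs are ours, not part of EuroEval):

```python
def displayed_model_id(model_id: str, model_revision: str) -> str:
    """Mirror the naming rule added to log_scores: append non-default revisions."""
    if model_revision and model_revision != "main":
        return f"{model_id}@{model_revision}"
    return model_id


# Revisions other than "main" become part of the reported model name.
assert displayed_model_id("some-org/some-model", "main") == "some-org/some-model"
assert displayed_model_id("some-org/some-model", "refs/pr/4") == "some-org/some-model@refs/pr/4"
```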
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -132,22 +132,23 @@
         The predicted labels.
     """
     if model_output.scores is not None:
-        return get_closest_logprobs_labels(
+        labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
             dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
         )
-    else:
-        return get_closest_word_edit_labels(
-            generated_sequences=model_output.sequences, dataset_config=dataset_config
-        )
+        if labels is not None:
+            return labels
+    return get_closest_word_edit_labels(
+        generated_sequences=model_output.sequences, dataset_config=dataset_config
+    )
 
 
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) -> list[str]:
+) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.
 
     In case a candidate label is split into multiple tokens, we only use the first
@@ -167,7 +168,7 @@ def get_closest_logprobs_labels(
         mapping is outputted then the model will always output scores).
 
     Returns:
-        The predicted labels.
+        The predicted labels, or None if labels could not be extracted.
 
     Raises:
         InvalidBenchmark:
@@ -193,10 +194,7 @@
         # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
-        previously_generated_labels: list[str] = list()
-        for label_idx, generated_label in enumerate(generated_labels):
-            generated_label = "".join(previously_generated_labels) + generated_label
-
+        for generated_label in generated_labels:
             # Get the candidate labels that starts with the generated label
             if isinstance(first_label_token_mapping, dict):
                 if any(
@@ -222,31 +220,28 @@
                     if candidate_label.startswith(generated_label)
                 }
 
-            # If we can uniquely determine the output label, we break the loop. If
-            # there are multiple possible labels then we store the current one, and
-            # concatenate it with the next generated label. We can only do this if
-            # the current one is the first one, however, since we're using greedy
-            # sampling. In case this happens for a label that is not the first one,
-            # we warn the user.
+            # If we can uniquely determine the output label, we break the loop.
             if len(candidate_output_labels) == 1:
                 output_label = candidate_output_labels.pop()
                 break
+
+            # If we have multiple candidate labels, we cannot uniquely determine the
+            # output label, so we abandon extracting the labels using logprobs and
+            # fall back to using word edit distance.
             elif len(candidate_output_labels) > 1:
-                if label_idx == 0:
-                    previously_generated_labels.append(generated_label)
-                else:
-                    output_label = candidate_output_labels.pop()
-                    candidate_output_labels.add(output_label)
-                    raise InvalidBenchmark(
-                        "Multiple candidate labels found for the generated label "
-                        f"{generated_label!r}: {candidate_output_labels}. Since "
-                        "this is not the first generated label, we cannot "
-                        "concatenate it with the next generated label. We are thus "
-                        f"forced to use the arbitrary {output_label!r} as the "
-                        "output label, potentially resulting in worse performance. "
-                        "Please report this issue to the EuroEval team at "
-                        "github.com/EuroEval/EuroEval/issues."
-                    )
+                log_once(
+                    "Multiple candidate labels found for the generated label "
+                    f"{generated_label!r}: {candidate_output_labels}. This means "
+                    "that using logprobs to extract the labels is not reliable, "
+                    "and we will instead fall back to extracting the labels "
+                    "using word edit distance.",
+                    level=logging.DEBUG,
+                )
+                return None
+
+            # If no candidate label is found, we ignore the generated label, as it
+            # basically means that the model is just really bad at generating
+            # labels.
             elif len(candidate_output_labels) == 0:
                 logger.debug(
                     f"No candidate label found for the generated label "
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -10,11 +10,7 @@ from evaluate import EvaluationModule
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..utils import (
-    HiddenPrints,
-    clear_memory,
-    raise_if_model_output_contains_nan_values,
-)
+from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
@@ -89,20 +85,8 @@ compute_metrics(
                 score_dict: dict[str, float] | None = metric.compute(
                     predictions=predictions, references=labels, **cfg.compute_kwargs
                 )
-
-                # Clear the cache of the BERTScorer to avoid memory leaks
-                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
-                    if hasattr(metric, attribute):
-                        delattr(metric, attribute)
-
-                clear_memory()
                 break
             except Exception as e:
-                # Clear the cache of the BERTScorer to avoid memory leaks
-                if hasattr(metric, "cached_bertscorer"):
-                    del metric.cached_bertscorer
-                clear_memory()
-
                 oom_error = [
                     "CUDA out of memory",
                     "CUDA error",
@@ -111,16 +95,7 @@ compute_metrics(
                 if not any(error in str(e) for error in oom_error):
                     raise InvalidBenchmark(str(e))
 
-                if cfg.compute_kwargs.get("batch_size", 1) > 1:
-                    batch_size = cfg.compute_kwargs["batch_size"]
-                    cfg.compute_kwargs["batch_size"] = batch_size // 2
-                    logger.debug(
-                        "Out of memory error occurred during the computation of "
-                        f"the metric {cfg.pretty_name}. Reducing the batch size to "
-                        f"{cfg.compute_kwargs['batch_size']}."
-                    )
-                elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
-                    cfg.compute_kwargs["batch_size"] = 32
+                if cfg.compute_kwargs.get("device", "cpu") != "cpu":
                     cfg.compute_kwargs["device"] = "cpu"
                     logger.debug(
                         "Out of memory error occurred during the computation of "
@@ -129,6 +104,14 @@
                     )
                 else:
                     raise InvalidBenchmark(str(e))
+            finally:
+                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
+                    if hasattr(metric, attribute):
+                        logger.debug(
+                            f"Deleting the {attribute!r} attribute of the metric "
+                            f"{cfg.pretty_name} to free up memory."
+                        )
+                        delattr(metric, attribute)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
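The duplicated cleanup in the success and error paths is replaced by a single `finally` block, so memory-heavy metric attributes are dropped however `metric.compute` exits. A minimal sketch of the same pattern with stand-in names (nothing below is EuroEval code):

```python
# Minimal sketch of the finally-based cleanup pattern adopted above.
import logging

logger = logging.getLogger("example")
MEMORY_HEAVY_ATTRIBUTES = ("cached_bertscorer",)  # stand-in attribute list


class FakeMetric:
    def compute(self) -> dict[str, float]:
        self.cached_bertscorer = object()  # stands in for a large cached model
        return {"f1": 0.5}


def compute_with_cleanup(metric: FakeMetric) -> dict[str, float]:
    try:
        return metric.compute()
    finally:
        # Runs on success, on exception, and on early return alike.
        for attribute in MEMORY_HEAVY_ATTRIBUTES:
            if hasattr(metric, attribute):
                logger.debug("Deleting %r to free up memory.", attribute)
                delattr(metric, attribute)


metric = FakeMetric()
scores = compute_with_cleanup(metric)
assert not hasattr(metric, "cached_bertscorer")
```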
euroeval/tasks.py CHANGED
@@ -142,7 +142,7 @@ SUMM = Task(
         huggingface_id="bertscore",
         results_key="f1",
         compute_kwargs=dict(
-            model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=32
+            model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
         ),
     ),
     MetricConfig(
euroeval/tokenization_utils.py CHANGED
@@ -7,6 +7,7 @@ import typing as t
 import torch
 
 from .constants import TASK_GROUPS_USING_LOGPROBS
+from .enums import GenerativeType
 from .exceptions import InvalidModel
 from .utils import log_once
 
@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
-    from .data_models import DatasetConfig
+    from .data_models import DatasetConfig, ModelConfig
 
 
 logger = logging.getLogger("euroeval")
@@ -254,35 +255,50 @@ def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | N
 
 
 def get_first_label_token_mapping(
-    dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+    dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
+    tokenizer: "PreTrainedTokenizer | None",
+    generative_type: "GenerativeType | None",
 ) -> dict[str, str] | bool:
     """Check if the model should output scores.
 
     Args:
         dataset_config:
             The dataset configuration.
+        model_config:
+            The model configuration.
         tokenizer:
             The tokenizer, or None if not available.
+        generative_type:
+            The generative type, or None if not available.
 
     Returns:
         A mapping from labels to the first token in each label, or alternatively a
         Boolean value indicating whether the model should output scores (if the mapping
         is outputted then the model will always output scores).
     """
+    if generative_type == GenerativeType.REASONING:
+        log_once(
+            f"The model {model_config.model_id!r} is a reasoning model and "
+            "thus does not support logprobs, so we do not enable it.",
+            level=logging.DEBUG,
+        )
+        return False
+
     # If we do not have any tokenizer, then we cannot check if the model should output
     # scores and we just assume it should if the dataset supports it
     output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
     if tokenizer is None:
         if output_scores:
             log_once(
-                "The model will output scores, since the dataset supports it and no "
-                "tokenizer is available.",
+                f"The model {model_config.model_id!r} will output scores, since the "
+                "dataset supports it and no tokenizer is available.",
                 level=logging.DEBUG,
             )
         else:
             log_once(
-                "The model will not output scores, since the dataset does not support "
-                "it and no tokenizer is available.",
+                f"The model {model_config.model_id!r} will not output scores, since "
+                "the dataset does not support it and no tokenizer is available.",
                 level=logging.DEBUG,
            )
         return output_scores
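The new `generative_type` parameter adds an early exit: reasoning models never use logprob-based scoring. A simplified sketch of that gate with stand-in enum members and helper name (not EuroEval's own classes):

```python
# Simplified sketch of the reasoning-model gate added above.
from enum import Enum, auto


class GenerativeType(Enum):  # stand-in for EuroEval's enum of the same name
    BASE = auto()
    INSTRUCTION_TUNED = auto()
    REASONING = auto()


def should_output_scores(
    generative_type: GenerativeType | None, dataset_supports_logprobs: bool
) -> bool:
    """Mirror the early exit: reasoning models do not support logprobs."""
    if generative_type == GenerativeType.REASONING:
        return False
    return dataset_supports_logprobs


assert should_output_scores(GenerativeType.REASONING, True) is False
assert should_output_scores(GenerativeType.BASE, True) is True
assert should_output_scores(None, False) is False
```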
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.6.1
+Version: 15.7.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -32,7 +32,7 @@ Requires-Python: <4.0,>=3.10
 Requires-Dist: accelerate>=0.34.2
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
-Requires-Dist: datasets>=2.15.0
+Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
@@ -239,6 +239,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 
+
+### Contribute to EuroEval
+
+We welcome contributions to EuroEval! Whether you're fixing bugs, adding features, or
+contributing new datasets, your help makes this project better for everyone.
+
+- **General contributions**: Check out our [contribution guidelines](CONTRIBUTING.md)
+  for information on how to get started.
+- **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
+  a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
+
+
 ### Special Thanks
 - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
   [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
@@ -1,37 +1,39 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
 euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
-euroeval/benchmarker.py,sha256=7LVFr7zL7OeJPs7WVYwekNnEmiIKPXHydcbAkW99MUk,48080
+euroeval/benchmarker.py,sha256=OnjGVblWW20wSmA7Tr2c-qE3g8FIjxW6wTJySAcGxVk,48492
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=t2mAT8tE3Dn2lXWHTnaFoaOIaUcdiBjJTASCt7nSdkg,1984
-euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
-euroeval/data_models.py,sha256=oZLrGg1dhIIwbgtEzq4U_fu_ZbBsz35mrqsyizuZNPw,23138
+euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
+euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
+euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
 euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
+euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
 euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
-euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
+euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
-euroeval/tasks.py,sha256=VVXFDcEM250KTGXd1pxQb8vwdia4ZJxgTUY5Kdsa-ik,7070
-euroeval/tokenization_utils.py,sha256=PNuS-FTdVrL9TWNDGlq42MvUggKwmyYM0BnC5I37IO0,11876
+euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
+euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
 euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=wohdi1WoeJ-JEdQLgg2q3JbZJA77XO7yGZaTRvbRU4o,47575
-euroeval/benchmark_modules/vllm.py,sha256=FTpwal5WdrVsOpkjm_RXwf6-2PrNrrP1LO6BVGYb6GE,48086
+euroeval/benchmark_modules/litellm.py,sha256=v_rbCm2FiTMqcUui_09k3E1-s5uOmbfAvSy2c7Mm0_E,42636
+euroeval/benchmark_modules/vllm.py,sha256=Q-3vtZz5XxQQImJxOiF0XDrQ4T_p0bkgdPw1Jobgu3s,39380
 euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
-euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
+euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
 euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
+euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730
 euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
 euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
@@ -40,20 +42,20 @@ euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada
 euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
 euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=sx_WqLm7N6Thll6COUCCA0lXe9RMZ7WhoH6X498pixM,6232
-euroeval/prompt_templates/multiple_choice.py,sha256=H0CDQPs_WzgSJ7oI_FBzHs0TOF0Na2qZYJLhDC7S8tk,4710
-euroeval/prompt_templates/named_entity_recognition.py,sha256=T65oFEtVT8JRF9c7bq2nPm233rftPdEAGic0DU-toko,11835
-euroeval/prompt_templates/reading_comprehension.py,sha256=WbQoal_tjoTt7qsmSZXEWwlI77vgiANcZoZC1l1AZjc,6090
-euroeval/prompt_templates/sentiment_classification.py,sha256=LcFD89e5nMOv4u-Unj8_jHpNjKMmgKPEfz0-e39VbsM,6639
-euroeval/prompt_templates/summarization.py,sha256=eX0uUTf_5Xorm6f_TlBBNwLC9zKvR7YJkP0RSaLWgIw,4585
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
+euroeval/prompt_templates/multiple_choice.py,sha256=6iEqiPpT-3WJN_gsyhyapnwsrcsYGdVkSkzwn-VKKxw,5101
+euroeval/prompt_templates/named_entity_recognition.py,sha256=Xd6gBJD2e1l8-We2Ujor7crRUBcbgnNeeVknBIrTMJo,12737
+euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLOS7ZrDUeasuLFt-dtqCnYk,6585
+euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
+euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
 euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
-euroeval/task_group_utils/sequence_classification.py,sha256=gqd0-l5o7vAY5QIpGSkSqwJwez3Y0r5SqOiywfPNW8A,12239
-euroeval/task_group_utils/text_to_text.py,sha256=QECnGdZ0YLjsbMc6LwXqVi4KMuITdiOjmJUNQtAAOW0,5712
+euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
+euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.6.1.dist-info/METADATA,sha256=4i98IBxn6yWh4ugBW-SnljmDfKEXBSfRGjZyf_dlOUs,13183
-euroeval-15.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.6.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.6.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.6.1.dist-info/RECORD,,
+euroeval-15.7.1.dist-info/METADATA,sha256=Fj6QejwQCK0zGuP_DHSQ7sul195ivUqOUCT5AVxgLSI,13669
+euroeval-15.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.7.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.7.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.7.1.dist-info/RECORD,,