EuroEval 15.11.0__py3-none-any.whl → 15.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,6 +42,7 @@ def build_benchmark_config(
42
42
  num_iterations: int,
43
43
  api_base: str | None,
44
44
  api_version: str | None,
45
+ gpu_memory_utilization: float,
45
46
  debug: bool,
46
47
  run_with_cli: bool,
47
48
  only_allow_safetensors: bool,
@@ -102,6 +103,11 @@ def build_benchmark_config(
102
103
  model on an inference API.
103
104
  api_version:
104
105
  The version of the API to use for a given inference API.
106
+ gpu_memory_utilization:
107
+ The GPU memory utilization to use for vLLM. A larger value will result in
108
+ faster evaluation, but at the risk of running out of GPU memory. Only reduce
109
+ this if you are running out of GPU memory. Only relevant if the model is
110
+ generative.
105
111
  debug:
106
112
  Whether to run the benchmark in debug mode.
107
113
  run_with_cli:
@@ -154,6 +160,7 @@ def build_benchmark_config(
154
160
  num_iterations=num_iterations,
155
161
  api_base=api_base,
156
162
  api_version=api_version,
163
+ gpu_memory_utilization=gpu_memory_utilization,
157
164
  debug=debug,
158
165
  run_with_cli=run_with_cli,
159
166
  only_allow_safetensors=only_allow_safetensors,
@@ -757,7 +757,7 @@ def load_model_and_tokenizer(
757
757
  model = LLM(
758
758
  model=model_id,
759
759
  tokenizer=model_id,
760
- gpu_memory_utilization=0.9,
760
+ gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
761
761
  max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
762
762
  download_dir=download_dir,
763
763
  trust_remote_code=benchmark_config.trust_remote_code,
euroeval/benchmarker.py CHANGED
@@ -78,6 +78,7 @@ class Benchmarker:
78
78
  num_iterations: int = 10,
79
79
  api_base: str | None = None,
80
80
  api_version: str | None = None,
81
+ gpu_memory_utilization: float = 0.9,
81
82
  debug: bool = False,
82
83
  run_with_cli: bool = False,
83
84
  only_allow_safetensors: bool = False,
@@ -145,6 +146,11 @@ class Benchmarker:
145
146
  to a model on an inference API. Defaults to None.
146
147
  api_version:
147
148
  The version of the API to use. Defaults to None.
149
+ gpu_memory_utilization:
150
+ The GPU memory utilization to use for vLLM. Only relevant if the model
151
+ is generative. A larger value will result in faster evaluation, but at
152
+ the risk of running out of GPU memory. Only reduce this if you are
153
+ running out of GPU memory. Defaults to 0.9.
148
154
  debug:
149
155
  Whether to output debug information. Defaults to False.
150
156
  run_with_cli:
@@ -192,6 +198,7 @@ class Benchmarker:
192
198
  num_iterations=num_iterations,
193
199
  api_base=api_base,
194
200
  api_version=api_version,
201
+ gpu_memory_utilization=gpu_memory_utilization,
195
202
  debug=debug,
196
203
  run_with_cli=run_with_cli,
197
204
  only_allow_safetensors=only_allow_safetensors,
euroeval/cli.py CHANGED
@@ -186,6 +186,14 @@ from .tasks import get_all_tasks
186
186
  help="The version of the API to use. Only relevant if `model` refers to a model on "
187
187
  "an inference API.",
188
188
  )
189
+ @click.option(
190
+ "--gpu-memory-utilization",
191
+ default=0.9,
192
+ show_default=True,
193
+ help="The GPU memory utilization to use for vLLM. A larger value will result in "
194
+ "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
195
+ "if you are running out of GPU memory. Only relevant if the model is generative.",
196
+ )
189
197
  @click.option(
190
198
  "--debug/--no-debug",
191
199
  default=False,
@@ -223,6 +231,7 @@ def benchmark(
223
231
  num_iterations: int,
224
232
  api_base: str | None,
225
233
  api_version: str | None,
234
+ gpu_memory_utilization: float,
226
235
  debug: bool,
227
236
  only_allow_safetensors: bool,
228
237
  ) -> None:
@@ -258,6 +267,7 @@ def benchmark(
258
267
  num_iterations=num_iterations,
259
268
  api_base=api_base,
260
269
  api_version=api_version,
270
+ gpu_memory_utilization=gpu_memory_utilization,
261
271
  debug=debug,
262
272
  run_with_cli=True,
263
273
  only_allow_safetensors=only_allow_safetensors,
euroeval/data_models.py CHANGED
@@ -168,6 +168,11 @@ class BenchmarkConfig:
168
168
  api_version:
169
169
  The version of the API to use. Only relevant if `model` refers to a model on
170
170
  an inference API.
171
+ gpu_memory_utilization:
172
+ The GPU memory utilization to use for vLLM. A larger value will result in
173
+ faster evaluation, but at the risk of running out of GPU memory. Only reduce
174
+ this if you are running out of GPU memory. Only relevant if the model is
175
+ generative.
171
176
  debug:
172
177
  Whether to run the benchmark in debug mode.
173
178
  run_with_cli:
@@ -196,6 +201,7 @@ class BenchmarkConfig:
196
201
  num_iterations: int
197
202
  api_base: str | None
198
203
  api_version: str | None
204
+ gpu_memory_utilization: float
199
205
  debug: bool
200
206
  run_with_cli: bool
201
207
  only_allow_safetensors: bool
@@ -227,6 +233,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
227
233
  num_iterations: int
228
234
  api_base: str | None
229
235
  api_version: str | None
236
+ gpu_memory_utilization: float
230
237
  debug: bool
231
238
  run_with_cli: bool
232
239
  only_allow_safetensors: bool
@@ -13,6 +13,7 @@ from .german import * # noqa: F403
13
13
  from .icelandic import * # noqa: F403
14
14
  from .italian import * # noqa: F403
15
15
  from .norwegian import * # noqa: F403
16
+ from .portuguese import * # noqa: F403
16
17
  from .spanish import * # noqa: F403
17
18
  from .swedish import * # noqa: F403
18
19
 
@@ -0,0 +1,74 @@
1
+ """All Portuguese dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import PT
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ SST2_PT_CONFIG = DatasetConfig(
10
+ name="sst2-pt",
11
+ pretty_name="the truncated version of the Portuguese sentiment classification "
12
+ "dataset SST2-pt, translated from the English SST2 dataset",
13
+ huggingface_id="EuroEval/sst2-pt-mini",
14
+ task=SENT,
15
+ languages=[PT],
16
+ _labels=["positive", "negative"],
17
+ )
18
+
19
+
20
+ MMLU_PT_CONFIG = DatasetConfig(
21
+ name="mmlu-pt",
22
+ pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
23
+ "translated from the English MMLU dataset",
24
+ huggingface_id="EuroEval/mmlu-pt-mini",
25
+ task=KNOW,
26
+ languages=[PT],
27
+ )
28
+
29
+
30
+ GOLDENSWAG_PT_CONFIG = DatasetConfig(
31
+ name="goldenswag-pt",
32
+ pretty_name="the truncated version of the Portuguese common-sense reasoning "
33
+ "dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
34
+ huggingface_id="EuroEval/goldenswag-pt-mini",
35
+ task=COMMON_SENSE,
36
+ languages=[PT],
37
+ )
38
+
39
+
40
+ SCALA_PT = DatasetConfig(
41
+ name="scala-pt",
42
+ pretty_name="the Portuguese part of the linguistic acceptability dataset ScaLA",
43
+ huggingface_id="EuroEval/scala-pt",
44
+ task=LA,
45
+ languages=[PT],
46
+ )
47
+
48
+ HAREM_CONFIG = DatasetConfig(
49
+ name="harem",
50
+ pretty_name="the Portuguese named entity recognition dataset HAREM",
51
+ huggingface_id="EuroEval/harem",
52
+ task=NER,
53
+ languages=[PT],
54
+ )
55
+
56
+ PUBLICO_CONFIG = DatasetConfig(
57
+ name="publico",
58
+ pretty_name="the truncated version of the Portuguese summarisation dataset Público",
59
+ huggingface_id="EuroEval/publico-mini",
60
+ task=SUMM,
61
+ languages=[PT],
62
+ )
63
+
64
+
65
+ ### Unofficial datasets ###
66
+
67
+ BOOLQ_PT_CONFIG = DatasetConfig(
68
+ name="boolq-pt",
69
+ pretty_name="the Portuguese multiple choice reading comprehension dataset "
70
+ "BoolQ-pt, translated from the English BoolQ dataset",
71
+ huggingface_id="EuroEval/boolq-pt",
72
+ task=MCRC,
73
+ languages=[PT],
74
+ )
@@ -8,7 +8,8 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
8
8
 
9
9
  SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
10
10
  name="sentiment-headlines-es",
11
- pretty_name="the truncated version of the Spanish sentiment headlines dataset",
11
+ pretty_name="the truncated version of the Spanish sentiment classification dataset "
12
+ "SentimentHeadlines",
12
13
  huggingface_id="EuroEval/sentiment-headlines-es",
13
14
  task=SENT,
14
15
  languages=[ES],
@@ -33,7 +34,7 @@ CONLL_ES_CONFIG = DatasetConfig(
33
34
 
34
35
  MLQA_ES_CONFIG = DatasetConfig(
35
36
  name="mlqa-es",
36
- pretty_name="the Spanish version of the MLQA reading comprehension dataset",
37
+ pretty_name="the Spanish version of the reading comprehension dataset MLQA",
37
38
  huggingface_id="EuroEval/mlqa-es",
38
39
  task=RC,
39
40
  languages=[ES],
@@ -70,7 +71,7 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
70
71
 
71
72
  XQUAD_ES_CONFIG = DatasetConfig(
72
73
  name="xquad-es",
73
- pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
74
+ pretty_name="the Spanish version of the reading comprehension dataset XQuAD",
74
75
  huggingface_id="EuroEval/xquad-es",
75
76
  task=RC,
76
77
  languages=[ES],
euroeval/generation.py CHANGED
@@ -200,17 +200,35 @@ def generate_single_iteration(
200
200
  all_preds.extend(extracted_labels)
201
201
 
202
202
  if "label" in non_cached_dataset.column_names:
203
+ non_cached_labels = non_cached_dataset["label"]
204
+ if not isinstance(non_cached_labels, list):
205
+ non_cached_labels = list(non_cached_labels)
206
+ cached_labels = cached_dataset["label"]
207
+ if not isinstance(cached_labels, list):
208
+ cached_labels = list(cached_labels)
203
209
  ground_truth = [
204
210
  label.lower() if isinstance(label, str) else label
205
- for label in non_cached_dataset["label"] + cached_dataset["label"]
211
+ for label in non_cached_labels + cached_labels
206
212
  ]
207
213
  elif "labels" in non_cached_dataset.column_names:
214
+ non_cached_labels = non_cached_dataset["labels"]
215
+ if not isinstance(non_cached_labels, list):
216
+ non_cached_labels = list(non_cached_labels)
217
+ cached_labels = cached_dataset["labels"]
218
+ if not isinstance(cached_labels, list):
219
+ cached_labels = list(cached_labels)
208
220
  ground_truth = [
209
221
  [label.lower() if isinstance(label, str) else label for label in label_list]
210
- for label_list in non_cached_dataset["labels"] + cached_dataset["labels"]
222
+ for label_list in non_cached_labels + cached_labels
211
223
  ]
212
224
  elif "target_text" in non_cached_dataset.column_names:
213
- ground_truth = non_cached_dataset["target_text"] + cached_dataset["target_text"]
225
+ non_cached_labels = non_cached_dataset["target_text"]
226
+ if not isinstance(non_cached_labels, list):
227
+ non_cached_labels = list(non_cached_labels)
228
+ cached_labels = cached_dataset["target_text"]
229
+ if not isinstance(cached_labels, list):
230
+ cached_labels = list(cached_labels)
231
+ ground_truth = non_cached_labels + cached_labels
214
232
  else:
215
233
  raise ValueError(
216
234
  "The dataset must have either a 'label', 'labels', or 'target_text' column"
@@ -306,7 +324,7 @@ def debug_log(
306
324
  ):
307
325
  logger.info(
308
326
  f"Input: '{input_text}'\n"
309
- f"Raw outout: '{raw_output}'\n"
327
+ f"Raw output: '{raw_output}'\n"
310
328
  f"Prediction: '{prediction}'\n"
311
329
  f"Label: '{label}'"
312
330
  )
@@ -323,7 +323,6 @@ def apply_prompt(
323
323
  tokenize=False,
324
324
  add_generation_prompt=True,
325
325
  chat_template=chat_template,
326
- enable_thinking=True,
327
326
  )
328
327
  for messages in messages_list
329
328
  ]
@@ -272,6 +272,7 @@ class HumanEvaluator:
272
272
  num_iterations=iteration + 1,
273
273
  api_base=None,
274
274
  api_version=None,
275
+ gpu_memory_utilization=0.9,
275
276
  debug=False,
276
277
  run_with_cli=True,
277
278
  only_allow_safetensors=False,
euroeval/languages.py CHANGED
@@ -36,7 +36,7 @@ NN = Language(
36
36
  )
37
37
  ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
38
38
  SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
39
-
39
+ PT = Language(code="pt", name="Portuguese", _and_separator="e", _or_separator="ou")
40
40
 
41
41
  AB = Language(code="ab", name="Abkhazian")
42
42
  AA = Language(code="aa", name="Afar")
@@ -152,7 +152,6 @@ PI = Language(code="pi", name="Pali")
152
152
  PS = Language(code="ps", name="Pashto")
153
153
  FA = Language(code="fa", name="Persian")
154
154
  PL = Language(code="pl", name="Polish")
155
- PT = Language(code="pt", name="Portuguese")
156
155
  PA = Language(code="pa", name="Punjabi")
157
156
  QU = Language(code="qu", name="Quechua")
158
157
  RO = Language(code="ro", name="Romanian")
@@ -1,7 +1,7 @@
1
1
  """Templates for the Linguistic Acceptability task."""
2
2
 
3
3
  from ..data_models import PromptConfig
4
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
4
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
5
5
 
6
6
  LA_TEMPLATES = {
7
7
  DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
36
36
  default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
37
37
  "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
38
38
  ),
39
+ PT: PromptConfig(
40
+ default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
41
+ default_prompt_prefix="Seguem-se abaixo textos e se são "
42
+ "gramaticalmente correctos",
43
+ default_prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
44
+ default_instruction_prompt="Texto: {text}\n\nDetermina se o texto é "
45
+ "gramaticalmente correcto ou não. Responde com {labels_str}, e nada mais.",
46
+ ),
39
47
  FI: PromptConfig(
40
48
  default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
41
49
  default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
@@ -1,7 +1,7 @@
1
1
  """Templates for all multiple choice tasks."""
2
2
 
3
3
  from ..data_models import PromptConfig
4
- from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
4
+ from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
5
5
 
6
6
  # TODO: Missing Faroese
7
7
  MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
36
36
  "usando solo {labels_str}, y nada más.",
37
37
  default_prompt_label_mapping="auto",
38
38
  ),
39
+ PT: PromptConfig(
40
+ default_prompt_prefix="As seguintes são perguntas de escolha múltipla "
41
+ "(com respostas).",
42
+ default_prompt_template="Pergunta: {text}\nResposta: {label}",
43
+ default_instruction_prompt="Pergunta: {text}\n\nResponde à pergunta "
44
+ "acima usando só {labels_str}, e nada mais.",
45
+ default_prompt_label_mapping="auto",
46
+ ),
39
47
  FI: PromptConfig(
40
48
  default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
41
49
  default_prompt_template="Kysymys: {text}\nVastaus: {label}",
@@ -1,7 +1,7 @@
1
1
  """Templates for the Named Entity Recognition task."""
2
2
 
3
3
  from ..data_models import PromptConfig
4
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
4
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
5
5
 
6
6
  NER_TEMPLATES = {
7
7
  DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
80
80
  "claves {labels_str}. Los valores deben ser listas de las "
81
81
  "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
82
82
  ),
83
+ PT: PromptConfig(
84
+ default_prompt_label_mapping={
85
+ "b-per": "pessoa",
86
+ "i-per": "pessoa",
87
+ "b-loc": "local",
88
+ "i-loc": "local",
89
+ "b-org": "organização",
90
+ "i-org": "organização",
91
+ "b-misc": "diverso",
92
+ "i-misc": "diverso",
93
+ },
94
+ default_prompt_prefix="Seguem-se frases e dicionários JSON com as entidades "
95
+ "mencionadas presentes na frase indicada.",
96
+ default_prompt_template="Frase: {text}\nEntidades mencionadas: {label}",
97
+ default_instruction_prompt="Frase: {text}\n\nIdentifica as entidades "
98
+ "mencionadas na frase. Deves devolver um dicionário JSON com as chaves "
99
+ "{labels_str}. Os valores devem ser listas contendo as entidades "
100
+ "mencionadas desse tipo, tal como ocorrem na frase.",
101
+ ),
83
102
  FI: PromptConfig(
84
103
  default_prompt_label_mapping={
85
104
  "b-per": "henkilö",
@@ -1,7 +1,7 @@
1
1
  """Templates for the Sentiment Analysis task."""
2
2
 
3
3
  from ..data_models import PromptConfig
4
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
4
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
5
5
 
6
6
  SENT_TEMPLATES = {
7
7
  DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
44
44
  default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
45
45
  "documento. Responde con {labels_str}, y nada más.",
46
46
  ),
47
+ PT: PromptConfig(
48
+ default_prompt_label_mapping=dict(
49
+ positive="positivo", neutral="neutro", negative="negativo"
50
+ ),
51
+ default_prompt_prefix="Abaixo encontras documentos e os seus "
52
+ "sentimentos correspondentes, que podem ser {labels_str}.",
53
+ default_prompt_template="Documento: {text}\nSentimento: {label}",
54
+ default_instruction_prompt="Documento: {text}\n\nClassifica o "
55
+ "sentimento do documento. Responde apenas com {labels_str}.",
56
+ ),
47
57
  FI: PromptConfig(
48
58
  default_prompt_label_mapping=dict(
49
59
  positive="positiivinen", neutral="neutrali", negative="negatiivinen"
@@ -1,7 +1,7 @@
1
1
  """Templates for the Summarization task."""
2
2
 
3
3
  from ..data_models import PromptConfig
4
- from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
4
+ from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
5
5
 
6
6
  # TODO: Missing Faroese
7
7
  SUMM_TEMPLATES = {
@@ -36,6 +36,13 @@ SUMM_TEMPLATES = {
36
36
  "documento anterior.",
37
37
  default_prompt_label_mapping=dict(),
38
38
  ),
39
+ PT: PromptConfig(
40
+ default_prompt_prefix="Abaixo encontras documentos com resumos associados.",
41
+ default_prompt_template="Documento: {text}\nResumo: {target_text}",
42
+ default_instruction_prompt="Documento: {text}\n\nEscreve um resumo do "
43
+ "documento anterior.",
44
+ default_prompt_label_mapping=dict(),
45
+ ),
39
46
  FI: PromptConfig(
40
47
  default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
41
48
  "tiivistelmiä.",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.11.0
3
+ Version: 15.12.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -1,18 +1,18 @@
1
1
  euroeval/__init__.py,sha256=fZyR9R3C3vwGJS3CrCJ6ySr_FDnMu_Aqnz0FdadWEEs,3399
2
- euroeval/benchmark_config_factory.py,sha256=icTeT5C-bNCJmvSWFlxKdEpRboZN8OjwaHGu7JM-2xI,11158
3
- euroeval/benchmarker.py,sha256=RlD8z2TYT4dqKvFtfmbU2pS7ZZ8l_3ErYttIcSxjPMg,48040
2
+ euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
3
+ euroeval/benchmarker.py,sha256=SDBzdCa4I8u1XDeN_1mKTFzfaaQbbY_oWcHt3niADxk,48497
4
4
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
5
- euroeval/cli.py,sha256=d8JztMi_RbpUlEBXidd6DQ-xeC-xhozf_qU6Vkzye20,8161
5
+ euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
6
6
  euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
7
7
  euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
8
- euroeval/data_models.py,sha256=lrF8XAVVZFqof3O0Bq2nMSTuqhkDaoMixIoUMqgsAo8,21647
8
+ euroeval/data_models.py,sha256=gPHyIoN2A5_O-cJgyb6jhn6enH8zsiIBI09W_wdHMQs,22031
9
9
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
10
10
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
11
  euroeval/finetuning.py,sha256=BrPZ-6qFY8K-dwfaRwNetVYfYburoQwLQty6pn6iP_s,11340
12
- euroeval/generation.py,sha256=pXs2VwfLvUpwXRN8LcHvzE_HTXMkGSYc4wGv9vsz1BA,10758
13
- euroeval/generation_utils.py,sha256=8HOFE2xdnCPRMe3TiHh--n7Oy3rMV7MAnERpW9vplUA,13352
14
- euroeval/human_evaluation.py,sha256=9CMXrkzM7Q-vltFL1fD9hYwahQtWT12aHMU8PgGO5_c,27497
15
- euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
12
+ euroeval/generation.py,sha256=1fqFEWwM2RzI3uPZem95VFWbN8EfrKZQTrHEP34ihHs,11622
13
+ euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
14
+ euroeval/human_evaluation.py,sha256=Jtz3K5Lqne48wPZWf4EAd3d-n_wX27nGJHigjhV1D7s,27537
15
+ euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
16
16
  euroeval/metrics.py,sha256=nxosyoRjlk7TcoAOkjU7zx2TB43b9tA8M1m4V1s5eKU,15516
17
17
  euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
18
18
  euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
@@ -28,8 +28,8 @@ euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1Pn
28
28
  euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
29
29
  euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
30
30
  euroeval/benchmark_modules/litellm.py,sha256=_gKBbJsXzo_cHJVaeuQpHRBENEZUGS_vcC-uGIhhmHA,52111
31
- euroeval/benchmark_modules/vllm.py,sha256=LXWkCUaIpP3cboj1bAGM6N8pR02mX6-XZFJheZDbfAQ,38798
32
- euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
31
+ euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
32
+ euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
33
33
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
34
34
  euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
35
35
  euroeval/dataset_configs/english.py,sha256=1q8XJqIVWBBNkldL7t-cVnU2O9EUb9_xoVRSN8arN90,2561
@@ -40,23 +40,24 @@ euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbK
40
40
  euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
41
41
  euroeval/dataset_configs/italian.py,sha256=KNjCvTzsEqH_EEk3At8slKqNwWWiIdbv_t5ke7n9nZI,2660
42
42
  euroeval/dataset_configs/norwegian.py,sha256=30YGdDPtDszG10BNDVHb-XXTGgGIIgDUNGoeM9q0K_E,5385
43
- euroeval/dataset_configs/spanish.py,sha256=NviL-FzJ5jq1bLTRvbtZBiGrAmZjxyijZNpKZFrnT-M,2527
43
+ euroeval/dataset_configs/portuguese.py,sha256=-HSDsujWfK__nV2SCu-z0ne0AXLDszOT05oYphQUDTw,2063
44
+ euroeval/dataset_configs/spanish.py,sha256=Yzm1kiilEKoHyd3xD2wrw596Ac9UcaWhlE93GlOFjlc,2558
44
45
  euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
45
46
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
46
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
47
- euroeval/prompt_templates/multiple_choice.py,sha256=6iEqiPpT-3WJN_gsyhyapnwsrcsYGdVkSkzwn-VKKxw,5101
48
- euroeval/prompt_templates/named_entity_recognition.py,sha256=Xd6gBJD2e1l8-We2Ujor7crRUBcbgnNeeVknBIrTMJo,12737
47
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
48
+ euroeval/prompt_templates/multiple_choice.py,sha256=F9ItGQtnaaez15A8MQ1UCpKRDsLM-AZyRdYetGAofa0,5494
49
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
49
50
  euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLOS7ZrDUeasuLFt-dtqCnYk,6585
50
- euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
51
- euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
51
+ euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
52
+ euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
52
53
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
53
54
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
54
55
  euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6zmoaPgYVyzMmOkNjr58,27284
55
56
  euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
56
57
  euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
57
58
  euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
58
- euroeval-15.11.0.dist-info/METADATA,sha256=NiRBsSAD6L_q4-y0AVkfoUoZA-9oD27uSK80cWpO_co,13479
59
- euroeval-15.11.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
60
- euroeval-15.11.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
61
- euroeval-15.11.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
62
- euroeval-15.11.0.dist-info/RECORD,,
59
+ euroeval-15.12.0.dist-info/METADATA,sha256=8cY6HWgAZgrCkIA20lVKuf42y-e7U1MZQZSTdF3e7ig,13479
60
+ euroeval-15.12.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
61
+ euroeval-15.12.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
62
+ euroeval-15.12.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
63
+ euroeval-15.12.0.dist-info/RECORD,,