EuroEval 15.11.0-py3-none-any.whl → 15.12.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/vllm.py +1 -1
- euroeval/benchmarker.py +7 -0
- euroeval/cli.py +10 -0
- euroeval/data_models.py +7 -0
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/generation.py +22 -4
- euroeval/generation_utils.py +0 -1
- euroeval/human_evaluation.py +1 -0
- euroeval/languages.py +1 -2
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- {euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/METADATA +1 -1
- {euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/RECORD +22 -21
- {euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_config_factory.py
CHANGED

@@ -42,6 +42,7 @@ def build_benchmark_config(
     num_iterations: int,
     api_base: str | None,
     api_version: str | None,
+    gpu_memory_utilization: float,
     debug: bool,
     run_with_cli: bool,
     only_allow_safetensors: bool,

@@ -102,6 +103,11 @@ def build_benchmark_config(
            model on an inference API.
        api_version:
            The version of the API to use for a given inference API.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
        debug:
            Whether to run the benchmark in debug mode.
        run_with_cli:

@@ -154,6 +160,7 @@ def build_benchmark_config(
         num_iterations=num_iterations,
         api_base=api_base,
         api_version=api_version,
+        gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=run_with_cli,
         only_allow_safetensors=only_allow_safetensors,
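Taken together with the edits below, the new option is threaded end to end: the CLI flag feeds `Benchmarker`, which passes it through `build_benchmark_config` into `BenchmarkConfig`, from where the vLLM module reads it.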
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -757,7 +757,7 @@ def load_model_and_tokenizer(
     model = LLM(
         model=model_id,
         tokenizer=model_id,
-        gpu_memory_utilization=
+        gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
         max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
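For reference, `gpu_memory_utilization` is a standard argument of vLLM's `LLM` constructor: it sets the fraction of each GPU's memory that vLLM pre-allocates for model weights and KV cache (vLLM's own default is 0.9). A minimal standalone sketch, with an illustrative model id:

    from vllm import LLM

    # Reserve 80% of GPU memory instead of 90%; a smaller reservation
    # leaves headroom for other processes but shrinks the KV cache,
    # slowing batched generation.
    llm = LLM(model="some-org/some-model", gpu_memory_utilization=0.8)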
euroeval/benchmarker.py
CHANGED

@@ -78,6 +78,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
+        gpu_memory_utilization: float = 0.9,
         debug: bool = False,
         run_with_cli: bool = False,
         only_allow_safetensors: bool = False,

@@ -145,6 +146,11 @@ class Benchmarker:
                to a model on an inference API. Defaults to None.
            api_version:
                The version of the API to use. Defaults to None.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to 0.9.
            debug:
                Whether to output debug information. Defaults to False.
            run_with_cli:

@@ -192,6 +198,7 @@ class Benchmarker:
             num_iterations=num_iterations,
             api_base=api_base,
             api_version=api_version,
+            gpu_memory_utilization=gpu_memory_utilization,
             debug=debug,
             run_with_cli=run_with_cli,
             only_allow_safetensors=only_allow_safetensors,
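From the Python API the new argument is passed at construction time. A minimal sketch, assuming `Benchmarker` is importable from the package root and that its `benchmark` method is the usual entry point (model id illustrative):

    from euroeval import Benchmarker

    # Lower the value from the 0.9 default only if vLLM runs out of GPU memory.
    benchmarker = Benchmarker(gpu_memory_utilization=0.8)
    benchmarker.benchmark(model="some-org/some-model")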
euroeval/cli.py
CHANGED

@@ -186,6 +186,14 @@ from .tasks import get_all_tasks
    help="The version of the API to use. Only relevant if `model` refers to a model on "
    "an inference API.",
 )
+@click.option(
+    "--gpu-memory-utilization",
+    default=0.9,
+    show_default=True,
+    help="The GPU memory utilization to use for vLLM. A larger value will result in "
+    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
+    "if you are running out of GPU memory. Only relevant if the model is generative.",
+)
 @click.option(
     "--debug/--no-debug",
     default=False,

@@ -223,6 +231,7 @@ def benchmark(
     num_iterations: int,
     api_base: str | None,
     api_version: str | None,
+    gpu_memory_utilization: float,
     debug: bool,
     only_allow_safetensors: bool,
 ) -> None:

@@ -258,6 +267,7 @@ def benchmark(
         num_iterations=num_iterations,
         api_base=api_base,
         api_version=api_version,
+        gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=True,
         only_allow_safetensors=only_allow_safetensors,
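On the command line the same knob is exposed as `--gpu-memory-utilization` (default 0.9), e.g. `euroeval --model <model-id> --gpu-memory-utilization 0.8` via the `euroeval` entry point.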
euroeval/data_models.py
CHANGED

@@ -168,6 +168,11 @@ class BenchmarkConfig:
        api_version:
            The version of the API to use. Only relevant if `model` refers to a model on
            an inference API.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
        debug:
            Whether to run the benchmark in debug mode.
        run_with_cli:

@@ -196,6 +201,7 @@ class BenchmarkConfig:
     num_iterations: int
     api_base: str | None
     api_version: str | None
+    gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
     only_allow_safetensors: bool

@@ -227,6 +233,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     num_iterations: int
     api_base: str | None
     api_version: str | None
+    gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
     only_allow_safetensors: bool
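Note that the field is required (it has no default) on both `BenchmarkConfig` and the pydantic `BenchmarkConfigParams`; the 0.9 default lives in `Benchmarker` and the CLI. A toy sketch of the type validation pydantic adds for the new field (class abridged and hypothetical):

    import pydantic

    class Params(pydantic.BaseModel):  # stand-in for BenchmarkConfigParams
        gpu_memory_utilization: float

    Params(gpu_memory_utilization=0.8)     # ok
    Params(gpu_memory_utilization="oops")  # raises pydantic.ValidationError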
euroeval/dataset_configs/__init__.py
CHANGED

@@ -13,6 +13,7 @@ from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
 
euroeval/dataset_configs/portuguese.py
ADDED

@@ -0,0 +1,74 @@
+"""All Portuguese dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import PT
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
+
+### Official datasets ###
+
+SST2_PT_CONFIG = DatasetConfig(
+    name="sst2-pt",
+    pretty_name="the truncated version of the Portuguese sentiment classification "
+    "dataset SST2-pt, translated from the English SST2 dataset",
+    huggingface_id="EuroEval/sst2-pt-mini",
+    task=SENT,
+    languages=[PT],
+    _labels=["positive", "negative"],
+)
+
+
+MMLU_PT_CONFIG = DatasetConfig(
+    name="mmlu-pt",
+    pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-pt-mini",
+    task=KNOW,
+    languages=[PT],
+)
+
+
+GOLDENSWAG_PT_CONFIG = DatasetConfig(
+    name="goldenswag-pt",
+    pretty_name="the truncated version of the Portuguese common-sense reasoning "
+    "dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pt-mini",
+    task=COMMON_SENSE,
+    languages=[PT],
+)
+
+
+SCALA_PT = DatasetConfig(
+    name="scala-pt",
+    pretty_name="the Portuguese part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pt",
+    task=LA,
+    languages=[PT],
+)
+
+HAREM_CONFIG = DatasetConfig(
+    name="harem",
+    pretty_name="the Portuguese named entity recognition dataset HAREM",
+    huggingface_id="EuroEval/harem",
+    task=NER,
+    languages=[PT],
+)
+
+PUBLICO_CONFIG = DatasetConfig(
+    name="publico",
+    pretty_name="the truncated version of the Portuguese summarisation dataset Público",
+    huggingface_id="EuroEval/publico-mini",
+    task=SUMM,
+    languages=[PT],
+)
+
+
+### Unofficial datasets ###
+
+BOOLQ_PT_CONFIG = DatasetConfig(
+    name="boolq-pt",
+    pretty_name="the Portuguese multiple choice reading comprehension dataset "
+    "BoolQ-pt, translated from the English BoolQ dataset",
+    huggingface_id="EuroEval/boolq-pt",
+    task=MCRC,
+    languages=[PT],
+)
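Once registered via the star import above, the new datasets can be selected by name, e.g. `euroeval --model <model-id> --dataset sst2-pt` (assuming the existing `--dataset` flag).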
euroeval/dataset_configs/spanish.py
CHANGED

@@ -8,7 +8,8 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
     name="sentiment-headlines-es",
-    pretty_name="the truncated version of the Spanish sentiment
+    pretty_name="the truncated version of the Spanish sentiment classification dataset "
+    "SentimentHeadlines",
     huggingface_id="EuroEval/sentiment-headlines-es",
     task=SENT,
     languages=[ES],

@@ -33,7 +34,7 @@ CONLL_ES_CONFIG = DatasetConfig(
 
 MLQA_ES_CONFIG = DatasetConfig(
     name="mlqa-es",
-    pretty_name="the Spanish version of the
+    pretty_name="the Spanish version of the reading comprehension dataset MLQA",
     huggingface_id="EuroEval/mlqa-es",
     task=RC,
     languages=[ES],

@@ -70,7 +71,7 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
 
 XQUAD_ES_CONFIG = DatasetConfig(
     name="xquad-es",
-    pretty_name="the Spanish version of the
+    pretty_name="the Spanish version of the reading comprehension dataset XQuAD",
     huggingface_id="EuroEval/xquad-es",
     task=RC,
     languages=[ES],
euroeval/generation.py
CHANGED

@@ -200,17 +200,35 @@ def generate_single_iteration(
     all_preds.extend(extracted_labels)
 
     if "label" in non_cached_dataset.column_names:
+        non_cached_labels = non_cached_dataset["label"]
+        if not isinstance(non_cached_labels, list):
+            non_cached_labels = list(non_cached_labels)
+        cached_labels = cached_dataset["label"]
+        if not isinstance(cached_labels, list):
+            cached_labels = list(cached_labels)
         ground_truth = [
             label.lower() if isinstance(label, str) else label
-            for label in
+            for label in non_cached_labels + cached_labels
         ]
     elif "labels" in non_cached_dataset.column_names:
+        non_cached_labels = non_cached_dataset["labels"]
+        if not isinstance(non_cached_labels, list):
+            non_cached_labels = list(non_cached_labels)
+        cached_labels = cached_dataset["labels"]
+        if not isinstance(cached_labels, list):
+            cached_labels = list(cached_labels)
         ground_truth = [
             [label.lower() if isinstance(label, str) else label for label in label_list]
-            for label_list in
+            for label_list in non_cached_labels + cached_labels
         ]
     elif "target_text" in non_cached_dataset.column_names:
-
+        non_cached_labels = non_cached_dataset["target_text"]
+        if not isinstance(non_cached_labels, list):
+            non_cached_labels = list(non_cached_labels)
+        cached_labels = cached_dataset["target_text"]
+        if not isinstance(cached_labels, list):
+            cached_labels = list(cached_labels)
+        ground_truth = non_cached_labels + cached_labels
     else:
         raise ValueError(
             "The dataset must have either a 'label', 'labels', or 'target_text' column"

@@ -306,7 +324,7 @@ def debug_log(
     ):
         logger.info(
             f"Input: '{input_text}'\n"
-            f"Raw
+            f"Raw output: '{raw_output}'\n"
             f"Prediction: '{prediction}'\n"
             f"Label: '{label}'"
         )
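All three branches apply the same pattern: a Hugging Face `Dataset` column is only guaranteed to be list-like, so both the non-cached and cached columns are coerced to plain lists before being concatenated with `+`. A condensed sketch of the idea (the `as_list` helper is hypothetical, not part of the patch):

    def as_list(column) -> list:
        # Dataset columns can come back as list-like sequences rather than
        # plain lists, and `+` concatenation needs real lists on both sides.
        return column if isinstance(column, list) else list(column)

    ground_truth = as_list(non_cached_dataset["label"]) + as_list(cached_dataset["label"])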
euroeval/generation_utils.py
CHANGED
euroeval/human_evaluation.py
CHANGED
euroeval/languages.py
CHANGED

@@ -36,7 +36,7 @@ NN = Language(
 )
 ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
 SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
-
+PT = Language(code="pt", name="Portuguese", _and_separator="e", _or_separator="ou")
 
 AB = Language(code="ab", name="Abkhazian")
 AA = Language(code="aa", name="Afar")

@@ -152,7 +152,6 @@ PI = Language(code="pi", name="Pali")
 PS = Language(code="ps", name="Pashto")
 FA = Language(code="fa", name="Persian")
 PL = Language(code="pl", name="Polish")
-PT = Language(code="pt", name="Portuguese")
 PA = Language(code="pa", name="Punjabi")
 QU = Language(code="qu", name="Quechua")
 RO = Language(code="ro", name="Romanian")
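The `_and_separator` and `_or_separator` fields supply language-specific conjunctions, presumably used when prompts enumerate the allowed labels via `{labels_str}` in the templates below. An illustrative sketch of that kind of joining (helper hypothetical, not EuroEval's actual implementation):

    def join_labels(labels: list[str], or_sep: str) -> str:
        # e.g. Portuguese (or_sep="ou"): 'positivo, neutro ou negativo'
        if len(labels) < 2:
            return "".join(labels)
        return f"{', '.join(labels[:-1])} {or_sep} {labels[-1]}"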
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Linguistic Acceptability task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 LA_TEMPLATES = {
     DA: PromptConfig(

@@ -36,6 +36,14 @@ LA_TEMPLATES = {
         default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
         "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
+        default_prompt_prefix="Seguem-se abaixo textos e se são "
+        "gramaticalmente correctos",
+        default_prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
+        default_instruction_prompt="Texto: {text}\n\nDetermina se o texto é "
+        "gramaticalmente correcto ou não. Responde com {labels_str}, e nada mais.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
         default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {

@@ -36,6 +36,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="As seguintes são perguntas de escolha múltipla "
+        "(com respostas).",
+        default_prompt_template="Pergunta: {text}\nResposta: {label}",
+        default_instruction_prompt="Pergunta: {text}\n\nResponde à pergunta "
+        "acima usando só {labels_str}, e nada mais.",
+        default_prompt_label_mapping="auto",
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
         default_prompt_template="Kysymys: {text}\nVastaus: {label}",
euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 NER_TEMPLATES = {
     DA: PromptConfig(

@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "pessoa",
+            "i-per": "pessoa",
+            "b-loc": "local",
+            "i-loc": "local",
+            "b-org": "organização",
+            "i-org": "organização",
+            "b-misc": "diverso",
+            "i-misc": "diverso",
+        },
+        default_prompt_prefix="Seguem-se frases e dicionários JSON com as entidades "
+        "mencionadas presentes na frase indicada.",
+        default_prompt_template="Frase: {text}\nEntidades mencionadas: {label}",
+        default_instruction_prompt="Frase: {text}\n\nIdentifica as entidades "
+        "mencionadas na frase. Deves devolver um dicionário JSON com as chaves "
+        "{labels_str}. Os valores devem ser listas contendo as entidades "
+        "mencionadas desse tipo, tal como ocorrem na frase.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "henkilö",
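As the new Portuguese template spells out, models must answer with a JSON dictionary keyed by the mapped entity types. An invented example answer for a sentence mentioning a person and a place:

    # Hypothetical model answer for "O José visitou Lisboa.":
    {"pessoa": ["José"], "local": ["Lisboa"], "organização": [], "diverso": []}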
euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 SENT_TEMPLATES = {
     DA: PromptConfig(

@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Abaixo encontras documentos e os seus "
+        "sentimentos correspondentes, que podem ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassifica o "
+        "sentimento do documento. Responde apenas com {labels_str}.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positiivinen", neutral="neutrali", negative="negatiivinen"
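For illustration, the new few-shot template renders like this (document text invented):

    template = "Documento: {text}\nSentimento: {label}"
    template.format(text="Adorei o filme.", label="positivo")
    # -> 'Documento: Adorei o filme.\nSentimento: positivo'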
euroeval/prompt_templates/summarization.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES = {

@@ -36,6 +36,13 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="Abaixo encontras documentos com resumos associados.",
+        default_prompt_template="Documento: {text}\nResumo: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscreve um resumo do "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
         "tiivistelmiä.",
{euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/RECORD
CHANGED

@@ -1,18 +1,18 @@
 euroeval/__init__.py,sha256=fZyR9R3C3vwGJS3CrCJ6ySr_FDnMu_Aqnz0FdadWEEs,3399
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
+euroeval/benchmarker.py,sha256=SDBzdCa4I8u1XDeN_1mKTFzfaaQbbY_oWcHt3niADxk,48497
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
-euroeval/cli.py,sha256=
+euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
 euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
 euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=gPHyIoN2A5_O-cJgyb6jhn6enH8zsiIBI09W_wdHMQs,22031
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=BrPZ-6qFY8K-dwfaRwNetVYfYburoQwLQty6pn6iP_s,11340
-euroeval/generation.py,sha256=
-euroeval/generation_utils.py,sha256=
-euroeval/human_evaluation.py,sha256=
-euroeval/languages.py,sha256=
+euroeval/generation.py,sha256=1fqFEWwM2RzI3uPZem95VFWbN8EfrKZQTrHEP34ihHs,11622
+euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
+euroeval/human_evaluation.py,sha256=Jtz3K5Lqne48wPZWf4EAd3d-n_wX27nGJHigjhV1D7s,27537
+euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
 euroeval/metrics.py,sha256=nxosyoRjlk7TcoAOkjU7zx2TB43b9tA8M1m4V1s5eKU,15516
 euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748

@@ -28,8 +28,8 @@ euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1Pn
 euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
 euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
 euroeval/benchmark_modules/litellm.py,sha256=_gKBbJsXzo_cHJVaeuQpHRBENEZUGS_vcC-uGIhhmHA,52111
-euroeval/benchmark_modules/vllm.py,sha256=
-euroeval/dataset_configs/__init__.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
+euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
 euroeval/dataset_configs/english.py,sha256=1q8XJqIVWBBNkldL7t-cVnU2O9EUb9_xoVRSN8arN90,2561

@@ -40,23 +40,24 @@ euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbK
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
 euroeval/dataset_configs/italian.py,sha256=KNjCvTzsEqH_EEk3At8slKqNwWWiIdbv_t5ke7n9nZI,2660
 euroeval/dataset_configs/norwegian.py,sha256=30YGdDPtDszG10BNDVHb-XXTGgGIIgDUNGoeM9q0K_E,5385
-euroeval/dataset_configs/
+euroeval/dataset_configs/portuguese.py,sha256=-HSDsujWfK__nV2SCu-z0ne0AXLDszOT05oYphQUDTw,2063
+euroeval/dataset_configs/spanish.py,sha256=Yzm1kiilEKoHyd3xD2wrw596Ac9UcaWhlE93GlOFjlc,2558
 euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=
-euroeval/prompt_templates/multiple_choice.py,sha256=
-euroeval/prompt_templates/named_entity_recognition.py,sha256=
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
+euroeval/prompt_templates/multiple_choice.py,sha256=F9ItGQtnaaez15A8MQ1UCpKRDsLM-AZyRdYetGAofa0,5494
+euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
 euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLOS7ZrDUeasuLFt-dtqCnYk,6585
-euroeval/prompt_templates/sentiment_classification.py,sha256=
-euroeval/prompt_templates/summarization.py,sha256=
+euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
+euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
 euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6zmoaPgYVyzMmOkNjr58,27284
 euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
 euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
 euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.12.0.dist-info/METADATA,sha256=8cY6HWgAZgrCkIA20lVKuf42y-e7U1MZQZSTdF3e7ig,13479
+euroeval-15.12.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.12.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.12.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-15.12.0.dist-info/RECORD,,
{euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/WHEEL
File without changes

{euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt
File without changes

{euroeval-15.11.0.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE
File without changes