EuroEval 15.2.0__py3-none-any.whl → 15.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/vllm.py +6 -2
- euroeval/benchmarker.py +10 -12
- euroeval/data_loading.py +9 -3
- euroeval/dataset_configs.py +242 -6
- euroeval/task_utils/question_answering.py +10 -7
- euroeval/task_utils/sequence_classification.py +11 -2
- euroeval/task_utils/text_to_text.py +10 -1
- euroeval/task_utils/token_classification.py +9 -3
- euroeval/utils.py +2 -2
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/METADATA +4 -1
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/RECORD +15 -15
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/WHEEL +0 -0
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/fresh.py CHANGED
@@ -221,7 +221,9 @@ def load_model_and_tokenizer(

    match dataset_config.task.task_group:
        case (
-           TaskGroup.SEQUENCE_CLASSIFICATION
+           TaskGroup.SEQUENCE_CLASSIFICATION
+           | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+           | TaskGroup.SPEED
        ):
            model_cls_mapping = dict(
                fresh_xlm_roberta_base=XLMRobertaForSequenceClassification,
euroeval/benchmark_modules/vllm.py CHANGED
@@ -1151,7 +1151,7 @@ get_end_of_reasoning_token_id(
    ):
        log_once(
            message=(
-               f"Detected reasoning token {reasoning_token!r} and end
+               f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
                f"token {end_of_reasoning_token!r}, but one of them is not registered "
                "as a special token, so assuming it is not a real reasoning token."
            ),
@@ -1160,7 +1160,11 @@ get_end_of_reasoning_token_id(
        return None

    log_once(
-       message=
+       message=(
+           f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+           f"token {end_of_reasoning_token!r}."
+       ),
+       level=logging.DEBUG,
    )

    # Encode the end of reasoning token and return its ID
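The fresh.py change above widens a single `case` arm rather than duplicating the branch: with structural pattern matching, `|` combines several enum members into one or-pattern. A minimal sketch of the construct (the `TaskGroup` members here are stand-ins mirroring the names in the diff):

```python
from enum import Enum, auto


class TaskGroup(Enum):
    """Stand-in for euroeval.enums.TaskGroup; members assumed from the diff."""

    SEQUENCE_CLASSIFICATION = auto()
    MULTIPLE_CHOICE_CLASSIFICATION = auto()
    SPEED = auto()
    TOKEN_CLASSIFICATION = auto()


def pick_model_family(task_group: TaskGroup) -> str:
    # A single `case` arm with `|` (or-patterns, Python 3.10+) matches any of
    # the listed members, so all three task groups share one branch.
    match task_group:
        case (
            TaskGroup.SEQUENCE_CLASSIFICATION
            | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
            | TaskGroup.SPEED
        ):
            return "sequence-classification"
        case _:
            return "other"


assert pick_model_family(TaskGroup.SPEED) == "sequence-classification"
```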
euroeval/benchmarker.py CHANGED
@@ -18,7 +18,7 @@ from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device, ModelType
-from .exceptions import InvalidBenchmark, InvalidModel
+from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
 from .model_config import get_model_config
@@ -769,23 +769,21 @@ class Benchmarker:
                logger.debug(f"Results:\n{results}")
                return record

+           except HuggingFaceHubDown:
+               wait_time = 30
+               logger.debug(
+                   f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
+                   "seconds."
+               )
+               sleep(wait_time)
+               continue
+
            except (InvalidBenchmark, InvalidModel) as e:
                # If the model ID is not valid then raise an error
                model_err_msg = "does not exist on the Hugging Face Hub"
                if benchmark_config.raise_errors and model_err_msg in str(e):
                    raise e

-               # Otherwise, if the error is due to Hugging Face Hub being down, then
-               # wait a bit and try again
-               elif "The Hugging Face Hub seems to be down." in str(e):
-                   wait_time = 30
-                   logger.debug(
-                       "The Hugging Face Hub seems to be down. Retrying in "
-                       f"{wait_time} seconds."
-                   )
-                   sleep(wait_time)
-                   continue
-
                # Otherwise, if the error is due to the MPS fallback not being enabled,
                # then raise an error asking the user to enable it
                elif "PYTORCH_ENABLE_MPS_FALLBACK" in str(e):
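The benchmarker now catches a dedicated `HuggingFaceHubDown` exception before the generic `InvalidBenchmark`/`InvalidModel` handler, instead of string-matching on the exception message. A minimal sketch of the resulting retry loop, with a hypothetical `benchmark_once` callable standing in for the real benchmarking step:

```python
import logging
from time import sleep

logger = logging.getLogger("euroeval")


class HuggingFaceHubDown(Exception):
    """Raised when the Hugging Face Hub appears to be unreachable."""


def benchmark_with_retry(benchmark_once):
    # Keep retrying while the Hub is down; a typed exception is robust
    # against wording changes in error messages, unlike `str(e)` matching.
    while True:
        try:
            return benchmark_once()
        except HuggingFaceHubDown:
            wait_time = 30
            logger.debug(
                f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
                "seconds."
            )
            sleep(wait_time)
            continue
```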
euroeval/data_loading.py CHANGED
@@ -10,7 +10,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator

 from .data_models import BenchmarkConfig, DatasetConfig
-from .exceptions import InvalidBenchmark
+from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble

 logger = logging.getLogger("euroeval")
@@ -31,6 +31,12 @@ def load_data(

    Returns:
        A list of bootstrapped datasets, one for each iteration.
+
+   Raises:
+       InvalidBenchmark:
+           If the dataset cannot be loaded.
+       HuggingFaceHubDown:
+           If the Hugging Face Hub is down.
    """
    num_attempts = 5
    for _ in range(num_attempts):
@@ -41,14 +47,14 @@ def load_data(
                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
            )
            break
-       except (FileNotFoundError, DatasetsError):
+       except (FileNotFoundError, DatasetsError, ConnectionError):
            logger.warning(
                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
            )
            time.sleep(1)
            continue
        except HfHubHTTPError:
-           raise
+           raise HuggingFaceHubDown()
    else:
        raise InvalidBenchmark(
            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
euroeval/dataset_configs.py CHANGED
@@ -1,7 +1,7 @@
 """All dataset configurations used in EuroEval."""

 from .data_models import DatasetConfig
-from .languages import DA, DE, EN, FO, FR, IS, NB, NL, NN, NO, SV, get_all_languages
+from .languages import DA, DE, EN, FO, FR, IS, IT, NB, NL, NN, NO, SV, get_all_languages
 from .tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SPEED, SUMM


@@ -244,6 +244,26 @@ ALLOCINE_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+SENTIPOLC_CONFIG = DatasetConfig(
+   name="sentipolc16",
+   pretty_name="the truncated version of the Italian sentiment classification "
+   "dataset Sentipolc-16",
+   huggingface_id="EuroEval/sentipolc16-mini",
+   task=SENT,
+   languages=[IT],
+   labels=["negative", "neutral", "positive"],
+   prompt_prefix="Di seguito sono riportati i testi e il loro sentimento, che può "
+   "essere 'positivo', 'neutro' o 'negativo'.",
+   prompt_template="Tweet: {text}\nSentimento: {label}",
+   prompt_label_mapping=dict(
+       positive="positivo", neutral="neutro", negative="negativo"
+   ),
+   instruction_prompt="Tweet: {text}\n\nClassificare il sentimento nel Tweet. "
+   "Rispondete con 'positivo', 'neutro' o 'negativo', e nient'altro.",
+   num_few_shot_examples=12,
+   max_generated_tokens=5,
+)
+

 ### NAMED ENTITY RECOGNITION DATASETS ###

@@ -718,6 +738,85 @@ WIKIANN_FO_CONFIG = DatasetConfig(
    unofficial=True,
 )

+WIKINEURAL_IT_CONFIG = DatasetConfig(
+   name="wikineural-it",
+   pretty_name="the truncated version of the Italian named "
+   "entity recognition dataset WikiNEuRal IT",
+   huggingface_id="EuroEval/wikineural-mini-it",
+   task=NER,
+   languages=[IT],
+   labels=[
+       "o",
+       "b-loc",
+       "i-loc",
+       "b-org",
+       "i-org",
+       "b-per",
+       "i-per",
+       "b-misc",
+       "i-misc",
+   ],
+   prompt_prefix="Di seguito sono riportate le frasi e i dizionari JSON con le entità "
+   "denominate presenti nella frase data.",
+   prompt_template="Frase: {text}\nEntità denominate: {label}",
+   prompt_label_mapping={
+       "b-per": "persona",
+       "i-per": "persona",
+       "b-loc": "posizione",
+       "i-loc": "posizione",
+       "b-org": "organizzazione",
+       "i-org": "organizzazione",
+       "b-misc": "varie",
+       "i-misc": "varie",
+   },
+   instruction_prompt="Frase: {text}\n\nIdentificare le entità nominate nella frase. "
+   "Il risultato dovrebbe essere un dizionario JSON con le chiavi 'persona', "
+   "'posizione', 'organizzazione' e 'varie'. I valori devono essere elenchi di entità "
+   "nominate di quel tipo, esattamente come appaiono nella frase.",
+   num_few_shot_examples=8,
+   max_generated_tokens=128,
+   unofficial=True,
+)
+
+MULTINERD_IT_CONFIG = DatasetConfig(
+   name="multinerd-it",
+   pretty_name="the truncated version of the Italian part of the named "
+   "entity recognition dataset MultiNERD",
+   huggingface_id="EuroEval/multinerd-mini-it",
+   task=NER,
+   languages=[IT],
+   labels=[
+       "o",
+       "b-loc",
+       "i-loc",
+       "b-org",
+       "i-org",
+       "b-per",
+       "i-per",
+       "b-misc",
+       "i-misc",
+   ],
+   prompt_prefix="Di seguito sono riportate le frasi e i dizionari JSON con le entità "
+   "denominate presenti nella frase data.",
+   prompt_template="Frase: {text}\nEntità denominate: {label}",
+   prompt_label_mapping={
+       "b-per": "persona",
+       "i-per": "persona",
+       "b-loc": "posizione",
+       "i-loc": "posizione",
+       "b-org": "organizzazione",
+       "i-org": "organizzazione",
+       "b-misc": "varie",
+       "i-misc": "varie",
+   },
+   instruction_prompt="Frase: {text}\n\nIdentificare le entità nominate nella frase. "
+   "Il risultato dovrebbe essere un dizionario JSON con le chiavi 'persona', "
+   "'posizione', 'organizzazione' e 'varie'. I valori devono essere elenchi di entità "
+   "nominate di quel tipo, esattamente come appaiono nella frase.",
+   num_few_shot_examples=8,
+   max_generated_tokens=128,
+)
+

 ### LINGUISTIC ACCEPTABILITY DATASETS ###

@@ -789,6 +888,25 @@ SCALA_NN_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+NO_COLA_CONFIG = DatasetConfig(
+   name="no-cola",
+   pretty_name="the truncated version of the Norwegian linguistic acceptability "
+   "dataset NoCoLA",
+   huggingface_id="EuroEval/no-cola-mini",
+   task=LA,
+   languages=[NB, NO],
+   labels=["incorrect", "correct"],
+   prompt_prefix="Følgende er setninger og hvorvidt de er grammatisk korrekte.",
+   prompt_template="Setning: {text}\nGrammatisk korrekt: {label}",
+   instruction_prompt="Setning: {text}\n\nBestem om setningen er grammatisk korrekt "
+   "eller ikke. Svar med 'ja' hvis setningen er korrekt og 'nei' hvis den ikke er, "
+   "og ikke noe annet.",
+   prompt_label_mapping=dict(correct="ja", incorrect="nei"),
+   num_few_shot_examples=12,
+   max_generated_tokens=5,
+   unofficial=True,
+)
+
 SCALA_IS_CONFIG = DatasetConfig(
    name="scala-is",
    pretty_name="the Icelandic part of the linguistic acceptability dataset ScaLA",
@@ -893,6 +1011,24 @@ SCALA_FR_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+SCALA_IT_CONFIG = DatasetConfig(
+   name="scala-it",
+   pretty_name="the Italian part of the linguistic acceptability dataset ScaLA",
+   huggingface_id="EuroEval/scala-it",
+   task=LA,
+   languages=[IT],
+   labels=["incorrect", "correct"],
+   prompt_prefix="Di seguito sono riportate le frasi e la loro correttezza "
+   "grammaticale.",
+   prompt_template="Frase : {text}\nGrammaticalmente corretto : {label}",
+   prompt_label_mapping=dict(correct="si", incorrect="no"),
+   instruction_prompt="Frase: {text}\n\nStabilite se la frase è grammaticalmente "
+   "corretta o meno. Rispondete con 'si' se la frase è corretta e con 'no' se "
+   "non lo è, e nient'altro.",
+   num_few_shot_examples=12,
+   max_generated_tokens=5,
+)
+
 DUTCH_COLA_CONFIG = DatasetConfig(
    name="dutch-cola",
    pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -1139,10 +1275,26 @@ SQUAD_NL_CONFIG = DatasetConfig(
    max_generated_tokens=32,
 )

+SQUAD_IT_CONFIG = DatasetConfig(
+   name="squad-it",
+   pretty_name="the truncated version of the Italian reading comprehension dataset "
+   "SQuAD-it, translated from the English SQuAD dataset",
+   huggingface_id="EuroEval/squad-it-mini",
+   task=RC,
+   languages=[IT],
+   labels=["start_positions", "end_positions"],
+   prompt_prefix="I testi che seguono sono accompagnati da domande e risposte.",
+   prompt_template="Testo: {text}\nDomanda: {question}\nRispondere in massimo "
+   "3 parole: {label}",
+   instruction_prompt="Testo: {text}\n\nRispondi alla seguente domanda sul "
+   "in un massimo di 3 parole.\n\nDomanda: {question}",
+   num_few_shot_examples=4,
+   max_generated_tokens=32,
+)
+
 ICELANDIC_QA_CONFIG = DatasetConfig(
    name="icelandic-qa",
-   pretty_name="the Icelandic reading comprehension dataset
-   "and history",
+   pretty_name="the Icelandic reading comprehension dataset IcelandicQA",
    huggingface_id="EuroEval/icelandic-qa",
    task=RC,
    languages=[IS],
@@ -1352,6 +1504,20 @@ ORANGE_SUM_CONFIG = DatasetConfig(
    max_generated_tokens=256,
 )

+ILPOST_SUM_CONFIG = DatasetConfig(
+   name="ilpost-sum",
+   pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+   huggingface_id="EuroEval/ilpost-sum",
+   task=SUMM,
+   languages=[IT],
+   prompt_prefix="Di seguito sono riportati gli articoli con i relativi riassunti.",
+   prompt_template="Articolo di cronaca: {text}\nSintesi: {target_text}",
+   instruction_prompt="Articolo di cronaca: {text}\n\nScrivete un riassunto "
+   "dell'articolo sopra citato.",
+   num_few_shot_examples=1,
+   max_generated_tokens=256,
+)
+
 # TODO: Faroese summarization


@@ -1377,7 +1543,7 @@ DANSKE_TALEMAADER_CONFIG = DatasetConfig(
 DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
    name="danish-citizen-tests",
    pretty_name="the Danish knowledge dataset Danish Citizen Tests",
-   huggingface_id="EuroEval/danish-citizen-tests",
+   huggingface_id="EuroEval/danish-citizen-tests-updated",
    task=KNOW,
    languages=[DA],
    labels=["a", "b", "c", "d"],
@@ -1390,6 +1556,22 @@ DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+NRK_QUIZ_QA_CONFIG = DatasetConfig(
+   name="nrk-quiz-qa",
+   pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
+   huggingface_id="EuroEval/nrk-quiz-qa-mini",
+   task=KNOW,
+   languages=[NB, NN, NO],
+   labels=["a", "b", "c", "d"],
+   prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
+   prompt_template="Spørsmål: {text}\nSvar: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+   instruction_prompt="Spørsmål: {text}\n\nBesvar følgende spørsmål med 'a', 'b', "
+   "'c' eller 'd', og ikke noe annet.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 MMLU_NO_CONFIG = DatasetConfig(
    name="mmlu-no",
    pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
@@ -1405,6 +1587,7 @@ MMLU_NO_CONFIG = DatasetConfig(
    "'c' eller 'd', og ikke noe annet.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
+   unofficial=True,
 )

 MMLU_SV_CONFIG = DatasetConfig(
@@ -1444,7 +1627,8 @@ MMLU_IS_CONFIG = DatasetConfig(

 ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
    name="icelandic-knowledge",
-   pretty_name="the
+   pretty_name="the Icelandic knowledge dataset IcelandicKnowledge, derived from the "
+   "IcelandicQA dataset",
    huggingface_id="EuroEval/icelandic-knowledge",
    task=KNOW,
    languages=[IS],
@@ -1456,7 +1640,6 @@ ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
    "'b', 'c' eða 'd'.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
-   unofficial=True,
 )

 MMLU_DE_CONFIG = DatasetConfig(
@@ -1545,6 +1728,23 @@ MMLU_FR_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+MMLU_IT_CONFIG = DatasetConfig(
+   name="mmlu-it",
+   pretty_name="the truncated version of the Italian knowledge dataset MMLU-it, "
+   "translated from the English MMLU dataset",
+   huggingface_id="EuroEval/mmlu-it-mini",
+   task=KNOW,
+   languages=[IT],
+   labels=["a", "b", "c", "d"],
+   prompt_prefix="Le seguenti sono domande a scelta multipla (con relative risposte).",
+   prompt_template="Domanda: {text}\nRéponse: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+   instruction_prompt="Domanda: {text}\n\nRispondete alla domanda precedente con "
+   "'a', 'b', 'c' o 'd' e nient'altro.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 ARC_DA_CONFIG = DatasetConfig(
    name="arc-da",
    pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
@@ -1614,6 +1814,7 @@ ARC_IS_CONFIG = DatasetConfig(
    "'b', 'c' eða 'd', og engu öðru.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
+   unofficial=True,
 )

 ARC_DE_CONFIG = DatasetConfig(
@@ -1691,6 +1892,23 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
+   name="nor-common-sense-qa",
+   pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+   "NorCommonSenseQA",
+   huggingface_id="EuroEval/nor-common-sense-qa",
+   task=COMMON_SENSE,
+   languages=[NB, NN, NO],
+   labels=["a", "b", "c", "d", "e"],
+   prompt_prefix="Følgende er flervalgsspørsmål (med svar).",
+   prompt_template="Spørsmål: {text}\nSvar: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d", e="e"),
+   instruction_prompt="Spørsmål: {text}\n\nBesvar følgende spørsmål med 'a', 'b', "
+   "'c' eller 'd', og ikke noe annet.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 HELLASWAG_NO_CONFIG = DatasetConfig(
    name="hellaswag-no",
    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
@@ -1706,6 +1924,7 @@ HELLASWAG_NO_CONFIG = DatasetConfig(
    "'c' eller 'd', og ikke noe annet.",
    num_few_shot_examples=5,
    max_generated_tokens=5,
+   unofficial=True,
 )

 HELLASWAG_SV_CONFIG = DatasetConfig(
@@ -1829,6 +2048,23 @@ HELLASWAG_FR_CONFIG = DatasetConfig(
    max_generated_tokens=5,
 )

+HELLASWAG_IT_CONFIG = DatasetConfig(
+   name="hellaswag-it",
+   pretty_name="the truncated version of the Italian common-sense reasoning dataset "
+   "HellaSwag-it, translated from the English HellaSwag dataset",
+   huggingface_id="EuroEval/hellaswag-it-mini",
+   task=COMMON_SENSE,
+   languages=[IT],
+   labels=["a", "b", "c", "d"],
+   prompt_prefix="Le seguenti sono domande a scelta multipla (con relative risposte).",
+   prompt_template="Domanda: {text}\nRéponse: {label}",
+   prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+   instruction_prompt="Domanda: {text}\n\nRispondete alla domanda precedente con "
+   "'a', 'b', 'c' o 'd' e nient'altro.",
+   num_few_shot_examples=5,
+   max_generated_tokens=5,
+)
+
 # TODO: Faroese common sense reasoning

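Each new config follows the same scheme: `prompt_prefix` introduces the task, `prompt_template` renders one example, and `prompt_label_mapping` localises the label names. As a rough illustration of how such fields can compose into a few-shot prompt (a sketch only; `build_few_shot_prompt` is hypothetical and not EuroEval's actual prompt builder):

```python
def build_few_shot_prompt(
    prefix: str,
    template: str,
    label_mapping: dict[str, str],
    examples: list[tuple[str, str]],
    new_text: str,
) -> str:
    # Render each few-shot example with its localised label, then append
    # the new text with the label left blank for the model to complete.
    shots = [
        template.format(text=text, label=label_mapping[label])
        for text, label in examples
    ]
    query = template.format(text=new_text, label="").rstrip()
    return "\n\n".join([prefix, *shots, query])


prompt = build_few_shot_prompt(
    prefix="Di seguito sono riportati i testi e il loro sentimento.",
    template="Tweet: {text}\nSentimento: {label}",
    label_mapping=dict(positive="positivo", neutral="neutro", negative="negativo"),
    examples=[("Che bella giornata!", "positive")],
    new_text="Il servizio è stato pessimo.",
)
print(prompt)
```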
euroeval/task_utils/question_answering.py CHANGED
@@ -8,7 +8,7 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers.trainer import Trainer

 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -21,12 +21,8 @@ if t.TYPE_CHECKING:
    import torch.nn as nn
    from datasets.arrow_dataset import Dataset
    from transformers import (
-       BaseImageProcessor,
        EvalPrediction,
-       FeatureExtractionMixin,
        PreTrainedModel,
-       PreTrainedTokenizerBase,
-       ProcessorMixin,
        TrainerCallback,
        TrainingArguments,
    )
@@ -65,7 +61,7 @@ class QuestionAnsweringTrainer(Trainer):

        # Get the CLS token id for the tokenizer
        if self.tokenizer is not None:
-           assert isinstance(self.tokenizer,
+           assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
            special_token_metadata = get_special_token_metadata(self.tokenizer)
            self.cls_token_id = special_token_metadata["cls_token_id"]

@@ -147,7 +143,7 @@ class QuestionAnsweringTrainer(Trainer):


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -167,6 +163,13 @@ compute_metrics(
        values.
    """
    model_outputs, labels = model_outputs_and_labels
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]
+
+   assert not isinstance(model_outputs, tuple)
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
euroeval/task_utils/sequence_classification.py CHANGED
@@ -13,6 +13,8 @@ from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..utils import log_once, raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
+   from transformers import EvalPrediction
+
    from ..data_models import DatasetConfig
    from ..types import Labels, Predictions

@@ -21,7 +23,7 @@ logger = logging.getLogger("euroeval")


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -42,7 +44,11 @@ compute_metrics(
    """
    model_outputs, labels = model_outputs_and_labels
    label2id = {label: idx for idx, label in dataset_config.id2label.items()}
-
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]

    metrics = {
        metric_cfg.name: (
@@ -61,6 +67,9 @@ compute_metrics(
    else:
        predictions = model_outputs

+   assert not isinstance(model_outputs, tuple)
+   raise_if_model_output_contains_nan_values(model_output=model_outputs)
+
    prompt_label_to_label_mapping = {
        prompt_label: label
        for label, prompt_label in dataset_config.prompt_label_mapping.items()
euroeval/task_utils/text_to_text.py CHANGED
@@ -17,6 +17,8 @@ from ..utils import (
 )

 if t.TYPE_CHECKING:
+   from transformers import EvalPrediction
+
    from ..types import Labels, Predictions


@@ -24,7 +26,7 @@ logger = logging.getLogger("euroeval")


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
@@ -44,6 +46,13 @@ compute_metrics(
        values.
    """
    model_outputs, labels = model_outputs_and_labels
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]
+
+   assert not isinstance(model_outputs, tuple)
    raise_if_model_output_contains_nan_values(model_output=model_outputs)

    metrics = {
euroeval/task_utils/token_classification.py CHANGED
@@ -16,7 +16,7 @@ from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
 from ..utils import raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
-   from transformers import BatchEncoding
+   from transformers import BatchEncoding, EvalPrediction

    from ..types import Labels, Predictions

@@ -28,7 +28,7 @@ logger = logging.getLogger("euroeval")


 def compute_metrics(
-   model_outputs_and_labels: tuple[
+   model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    has_misc_tags: bool,
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
@@ -51,7 +51,11 @@ compute_metrics(
        values.
    """
    model_outputs, labels = model_outputs_and_labels
-
+
+   # If the model outputs is a pair, then the first element corresponds to the model
+   # predictions
+   if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
+       model_outputs = model_outputs[0]

    metrics = {
        metric_cfg.name: (
@@ -93,6 +97,8 @@ compute_metrics(
    else:
        predictions = model_outputs  # type: ignore[assignment]

+   raise_if_model_output_contains_nan_values(model_output=predictions)
+
    # Replace predicted tag with either MISC or O tags if they are not part of the
    # dataset
    labels_without_misc = {
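The same normalisation now appears in all four task modules: `transformers` may pass `compute_metrics` an `EvalPrediction` whose predictions field is itself a tuple of logits plus auxiliary outputs, so the first element is taken as the actual predictions. A standalone sketch of the unpacking logic:

```python
import numpy as np


def normalise_model_outputs(model_outputs_and_labels):
    # Works for both a plain (predictions, labels) pair and an EvalPrediction
    # without an `inputs` field, since that also unpacks into two elements.
    model_outputs, labels = model_outputs_and_labels

    # If the model outputs are a pair, the first element holds the actual
    # predictions and the rest is auxiliary output.
    if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
        model_outputs = model_outputs[0]

    return model_outputs, labels


preds, labels = normalise_model_outputs(
    ((np.zeros((2, 3)), None), np.array([0, 1]))
)
assert preds.shape == (2, 3)
```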
euroeval/utils.py CHANGED
@@ -21,7 +21,7 @@ import requests
 import torch
 from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers import logging as tf_logging

 from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -231,7 +231,7 @@ def internet_connection_available() -> bool:
    return False


-def get_special_token_metadata(tokenizer: "
+def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
    """Get the special token metadata for a tokenizer.

    Args:
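Annotating with `PreTrainedTokenizerBase` instead of `PreTrainedTokenizer` lets the helper accept fast (Rust-backed) tokenizers as well, since both slow and fast tokenizers inherit from that base class. A minimal sketch of this kind of metadata lookup (a simplified stand-in, not the real `get_special_token_metadata`):

```python
from transformers import AutoTokenizer, PreTrainedTokenizerBase


def special_token_ids(tokenizer: PreTrainedTokenizerBase) -> dict:
    # Both slow and fast tokenizers expose these attributes through
    # PreTrainedTokenizerBase, so the wider annotation is accurate.
    return {
        "cls_token_id": tokenizer.cls_token_id,
        "sep_token_id": tokenizer.sep_token_id,
    }


tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
print(special_token_ids(tokenizer))
```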
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.3.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -49,6 +49,7 @@ Requires-Dist: sacremoses>=0.1.1
 Requires-Dist: scikit-learn<1.6.0
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
+Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.3.0
@@ -76,6 +77,8 @@ Description-Content-Type: text/markdown

 ### The robust European language model benchmark.

+_(formerly known as ScandEval)_
+
 ______________________________________________________________________
 [](https://euroeval.com)
 [](https://pypi.org/project/euroeval/)
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/RECORD CHANGED
@@ -1,12 +1,12 @@
 euroeval/__init__.py,sha256=3od9_ucHlILSbe4WCR8k5PbeorvmUr-VjOKXJ01I0fA,2165
 euroeval/benchmark_config_factory.py,sha256=pi4Lu--ySKZRd9ItG6VKS6BPLis64vL-7UE99VSXq5Y,12534
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=__DdnOvI9CNpgqPT1hsTl0GZFTyQ6KRfiQowCuh36sc,46534
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
-euroeval/data_loading.py,sha256=
+euroeval/data_loading.py,sha256=RoatBJMpGurP_y5O3KrEvly8Z_yYEapQnnMZ_tWWrlc,3272
 euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
-euroeval/dataset_configs.py,sha256=
+euroeval/dataset_configs.py,sha256=Cj3McxA0JTC7RKzXofzpJfmIhoXAfF756f_1SZUaPlw,84391
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
 euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=K4z2IQilLJo6Cf8bzM46PYTaylDv6bYi7FRbHTbZulE,18736
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
-euroeval/benchmark_modules/fresh.py,sha256=
+euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
 euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
 euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=cw7onFYXQ66cr2c4WTB90VYtQYc47lkwz6A25FW8sBs,43444
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
-euroeval/task_utils/question_answering.py,sha256=
-euroeval/task_utils/sequence_classification.py,sha256=
-euroeval/task_utils/text_to_text.py,sha256
-euroeval/task_utils/token_classification.py,sha256=
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
+euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
+euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
+euroeval/task_utils/token_classification.py,sha256=yT1YvZzmqNaVSRZ67BvyURhlkgTm3ltWPft4HxodZAE,17983
+euroeval-15.3.1.dist-info/METADATA,sha256=elF7s_zt2tj9Hl1EMMDfNoMtskYK5Xh9i-N36vvzfQs,10263
+euroeval-15.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.3.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.3.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.3.1.dist-info/RECORD,,
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/WHEEL
File without changes
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/entry_points.txt
File without changes
{euroeval-15.2.0.dist-info → euroeval-15.3.1.dist-info}/licenses/LICENSE
File without changes