EuroEval 15.5.0__py3-none-any.whl → 15.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (53)
  1. euroeval/benchmark_modules/base.py +3 -2
  2. euroeval/benchmark_modules/fresh.py +8 -6
  3. euroeval/benchmark_modules/hf.py +33 -31
  4. euroeval/benchmark_modules/litellm.py +120 -56
  5. euroeval/benchmark_modules/vllm.py +41 -26
  6. euroeval/benchmarker.py +23 -21
  7. euroeval/callbacks.py +2 -2
  8. euroeval/constants.py +1 -1
  9. euroeval/data_models.py +257 -42
  10. euroeval/dataset_configs/__init__.py +61 -0
  11. euroeval/dataset_configs/danish.py +120 -0
  12. euroeval/dataset_configs/dutch.py +123 -0
  13. euroeval/dataset_configs/english.py +88 -0
  14. euroeval/dataset_configs/faroese.py +53 -0
  15. euroeval/dataset_configs/french.py +83 -0
  16. euroeval/dataset_configs/german.py +91 -0
  17. euroeval/dataset_configs/icelandic.py +148 -0
  18. euroeval/dataset_configs/italian.py +81 -0
  19. euroeval/dataset_configs/norwegian.py +178 -0
  20. euroeval/dataset_configs/spanish.py +78 -0
  21. euroeval/dataset_configs/swedish.py +100 -0
  22. euroeval/exceptions.py +10 -10
  23. euroeval/finetuning.py +6 -10
  24. euroeval/generation.py +1 -0
  25. euroeval/human_evaluation.py +2 -2
  26. euroeval/languages.py +20 -13
  27. euroeval/model_cache.py +1 -1
  28. euroeval/model_loading.py +1 -12
  29. euroeval/prompt_templates/__init__.py +8 -0
  30. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  31. euroeval/prompt_templates/multiple_choice.py +97 -0
  32. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  33. euroeval/prompt_templates/reading_comprehension.py +118 -0
  34. euroeval/prompt_templates/sentiment_classification.py +137 -0
  35. euroeval/prompt_templates/summarization.py +97 -0
  36. euroeval/speed_benchmark.py +1 -1
  37. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  38. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  39. euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
  40. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  41. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  42. euroeval/tasks.py +54 -0
  43. euroeval/tokenization_utils.py +343 -0
  44. euroeval/types.py +3 -1
  45. euroeval/utils.py +2 -347
  46. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/METADATA +30 -9
  47. euroeval-15.6.0.dist-info/RECORD +59 -0
  48. euroeval/dataset_configs.py +0 -2408
  49. euroeval-15.5.0.dist-info/RECORD +0 -40
  50. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  51. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  52. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  53. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,178 @@
1
+ """All Norwegian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import NB, NN, NO
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ NOREC_CONFIG = DatasetConfig(
10
+ name="norec",
11
+ pretty_name="the truncated version of the Norwegian sentiment classification "
12
+ "dataset NoReC",
13
+ huggingface_id="EuroEval/norec-mini",
14
+ task=SENT,
15
+ languages=[NB, NN, NO],
16
+ )
17
+
18
+ SCALA_NB_CONFIG = DatasetConfig(
19
+ name="scala-nb",
20
+ pretty_name="the Bokmål part of the linguistic acceptability dataset ScaLA",
21
+ huggingface_id="EuroEval/scala-nb",
22
+ task=LA,
23
+ languages=[NB, NO],
24
+ )
25
+
26
+ SCALA_NN_CONFIG = DatasetConfig(
27
+ name="scala-nn",
28
+ pretty_name="the Nynorsk part of the linguistic acceptability dataset ScaLA",
29
+ huggingface_id="EuroEval/scala-nn",
30
+ task=LA,
31
+ languages=[NN],
32
+ )
33
+
34
+ NORNE_NB_CONFIG = DatasetConfig(
35
+ name="norne-nb",
36
+ pretty_name="the truncated version of the Bokmål part of the Norwegian named "
37
+ "entity recognition dataset NorNE",
38
+ huggingface_id="EuroEval/norne-nb-mini",
39
+ task=NER,
40
+ languages=[NB, NO],
41
+ )
42
+
43
+ NORNE_NN_CONFIG = DatasetConfig(
44
+ name="norne-nn",
45
+ pretty_name="the truncated version of the Nynorsk part of the Norwegian named "
46
+ "entity recognition dataset NorNE",
47
+ huggingface_id="EuroEval/norne-nn-mini",
48
+ task=NER,
49
+ languages=[NN],
50
+ )
51
+
52
+ NORQUAD_CONFIG = DatasetConfig(
53
+ name="norquad",
54
+ pretty_name="the truncated version of the Norwegian question answering "
55
+ "dataset NorQuAD",
56
+ huggingface_id="EuroEval/norquad-mini",
57
+ task=RC,
58
+ languages=[NB, NN, NO],
59
+ _num_few_shot_examples=2,
60
+ )
61
+
62
+ NO_SAMMENDRAG_CONFIG = DatasetConfig(
63
+ name="no-sammendrag",
64
+ pretty_name="the truncated version of the Norwegian summarisation dataset "
65
+ "Norske Sammendrag",
66
+ huggingface_id="EuroEval/no-sammendrag-mini",
67
+ task=SUMM,
68
+ languages=[NB, NN, NO],
69
+ )
70
+
71
+ NRK_QUIZ_QA_CONFIG = DatasetConfig(
72
+ name="nrk-quiz-qa",
73
+ pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
74
+ huggingface_id="EuroEval/nrk-quiz-qa-mini",
75
+ task=KNOW,
76
+ languages=[NB, NN, NO],
77
+ )
78
+
79
+ NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
80
+ name="nor-common-sense-qa",
81
+ pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
82
+ "NorCommonSenseQA",
83
+ huggingface_id="EuroEval/nor-common-sense-qa",
84
+ task=COMMON_SENSE,
85
+ languages=[NB, NN, NO],
86
+ )
87
+
88
+
89
+ ### Unofficial datasets ###
90
+
91
+ NO_COLA_CONFIG = DatasetConfig(
92
+ name="no-cola",
93
+ pretty_name="the truncated version of the Norwegian linguistic acceptability "
94
+ "dataset NoCoLA",
95
+ huggingface_id="EuroEval/no-cola-mini",
96
+ task=LA,
97
+ languages=[NB, NO],
98
+ unofficial=True,
99
+ )
100
+
101
+ NORGLM_MULTI_QA = DatasetConfig(
102
+ name="norglm-multi-qa",
103
+ pretty_name="the question answering part of the Norwegian NorGLM multi-task human "
104
+ "annotated dataset NO-Multi-QA-Sum",
105
+ huggingface_id="EuroEval/norglm-multi-qa",
106
+ task=RC,
107
+ languages=[NB, NN, NO],
108
+ _num_few_shot_examples=2,
109
+ unofficial=True,
110
+ )
111
+
112
+ NORGLM_MULTI_SUM = DatasetConfig(
113
+ name="norglm-multi-sum",
114
+ pretty_name="the summarisation part of the Norwegian NorGLM multi-task human "
115
+ "annotated dataset NO-Multi-QA-Sum",
116
+ huggingface_id="EuroEval/norglm-multi-sum",
117
+ task=SUMM,
118
+ languages=[NB, NN, NO],
119
+ unofficial=True,
120
+ )
121
+
122
+ SCHIBSTED_NO_CONFIG = DatasetConfig(
123
+ name="schibsted-no",
124
+ pretty_name="the Norwegian summarisation dataset Schibsted-no",
125
+ huggingface_id="EuroEval/schibsted-article-summaries-no",
126
+ task=SUMM,
127
+ languages=[NB, NN, NO],
128
+ unofficial=True,
129
+ )
130
+
131
+ PERSONAL_SUM_CONFIG = DatasetConfig(
132
+ name="personal-sum",
133
+ pretty_name="the Norwegian summarisation dataset personal-sum",
134
+ huggingface_id="EuroEval/personal-sum",
135
+ task=SUMM,
136
+ languages=[NB, NN, NO],
137
+ unofficial=True,
138
+ )
139
+
140
+ MMLU_NO_CONFIG = DatasetConfig(
141
+ name="mmlu-no",
142
+ pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
143
+ "translated from the English MMLU dataset",
144
+ huggingface_id="EuroEval/mmlu-no-mini",
145
+ task=KNOW,
146
+ languages=[NB, NN, NO],
147
+ unofficial=True,
148
+ )
149
+
150
+ ARC_NO_CONFIG = DatasetConfig(
151
+ name="arc-no",
152
+ pretty_name="the truncated version of the Norwegian knowledge dataset ARC-no, "
153
+ "translated from the English ARC dataset",
154
+ huggingface_id="EuroEval/arc-no-mini",
155
+ task=KNOW,
156
+ languages=[NB, NN, NO],
157
+ unofficial=True,
158
+ )
159
+
160
+ HELLASWAG_NO_CONFIG = DatasetConfig(
161
+ name="hellaswag-no",
162
+ pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
163
+ "HellaSwag-no, translated from the English HellaSwag dataset",
164
+ huggingface_id="EuroEval/hellaswag-no-mini",
165
+ task=COMMON_SENSE,
166
+ languages=[NB, NN, NO],
167
+ unofficial=True,
168
+ )
169
+
170
+ BELEBELE_NO_CONFIG = DatasetConfig(
171
+ name="belebele-no",
172
+ pretty_name="the Norwegian multiple choice reading comprehension dataset "
173
+ "BeleBele-no, translated from the English BeleBele dataset",
174
+ huggingface_id="EuroEval/belebele-no-mini",
175
+ task=MCRC,
176
+ languages=[NB, NN, NO],
177
+ unofficial=True,
178
+ )
@@ -0,0 +1,78 @@
1
+ """All Spanish dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import ES
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
10
+ name="sentiment-headlines-es",
11
+ pretty_name="the truncated version of the Spanish sentiment headlines dataset",
12
+ huggingface_id="EuroEval/sentiment-headlines-es",
13
+ task=SENT,
14
+ languages=[ES],
15
+ )
16
+
17
+ SCALA_ES_CONFIG = DatasetConfig(
18
+ name="scala-es",
19
+ pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
20
+ huggingface_id="EuroEval/scala-es",
21
+ task=LA,
22
+ languages=[ES],
23
+ )
24
+
25
+ CONLL_ES_CONFIG = DatasetConfig(
26
+ name="conll-es",
27
+ pretty_name="the Spanish part of the truncated version of the named entity "
28
+ "recognition dataset CoNLL 2002",
29
+ huggingface_id="EuroEval/conll-es-mini",
30
+ task=NER,
31
+ languages=[ES],
32
+ )
33
+
34
+ MLQA_ES_CONFIG = DatasetConfig(
35
+ name="mlqa-es",
36
+ pretty_name="the Spanish version of the MLQA reading comprehension dataset",
37
+ huggingface_id="EuroEval/mlqa-es",
38
+ task=RC,
39
+ languages=[ES],
40
+ )
41
+
42
+ MLSUM_ES_CONFIG = DatasetConfig(
43
+ name="mlsum-es",
44
+ pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
45
+ huggingface_id="EuroEval/mlsum-es-mini",
46
+ task=SUMM,
47
+ languages=[ES],
48
+ )
49
+
50
+ MMLU_ES_CONFIG = DatasetConfig(
51
+ name="mmlu-es",
52
+ pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
53
+ "translated from the English MMLU dataset",
54
+ huggingface_id="EuroEval/mmlu-es-mini",
55
+ task=KNOW,
56
+ languages=[ES],
57
+ )
58
+
59
+ HELLASWAG_ES_CONFIG = DatasetConfig(
60
+ name="hellaswag-es",
61
+ pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
62
+ "HellaSwag-es, translated from the English HellaSwag dataset",
63
+ huggingface_id="EuroEval/hellaswag-es-mini",
64
+ task=COMMON_SENSE,
65
+ languages=[ES],
66
+ )
67
+
68
+
69
+ ### Unofficial datasets ###
70
+
71
+ XQUAD_ES_CONFIG = DatasetConfig(
72
+ name="xquad-es",
73
+ pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
74
+ huggingface_id="EuroEval/xquad-es",
75
+ task=RC,
76
+ languages=[ES],
77
+ unofficial=True,
78
+ )
@@ -0,0 +1,100 @@
1
+ """All Swedish dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import SV
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ SWEREC_CONFIG = DatasetConfig(
10
+ name="swerec",
11
+ pretty_name="the truncated version of the Swedish sentiment classification "
12
+ "dataset SweReC",
13
+ huggingface_id="EuroEval/swerec-mini",
14
+ task=SENT,
15
+ languages=[SV],
16
+ )
17
+
18
+ SCALA_SV_CONFIG = DatasetConfig(
19
+ name="scala-sv",
20
+ pretty_name="The Swedish part of the linguistic acceptability dataset ScaLA",
21
+ huggingface_id="EuroEval/scala-sv",
22
+ task=LA,
23
+ languages=[SV],
24
+ )
25
+
26
+ SUC3_CONFIG = DatasetConfig(
27
+ name="suc3",
28
+ pretty_name="the truncated version of the Swedish named entity recognition "
29
+ "dataset SUC 3.0",
30
+ huggingface_id="EuroEval/suc3-mini",
31
+ task=NER,
32
+ languages=[SV],
33
+ )
34
+
35
+ SCANDIQA_SV_CONFIG = DatasetConfig(
36
+ name="scandiqa-sv",
37
+ pretty_name="the Swedish part of the truncated version of the question answering "
38
+ "dataset ScandiQA",
39
+ huggingface_id="EuroEval/scandiqa-sv-mini",
40
+ task=RC,
41
+ languages=[SV],
42
+ )
43
+
44
+ SWEDN_CONFIG = DatasetConfig(
45
+ name="swedn",
46
+ pretty_name="the truncated version of the Swedish summarisation dataset SweDN",
47
+ huggingface_id="EuroEval/swedn-mini",
48
+ task=SUMM,
49
+ languages=[SV],
50
+ )
51
+
52
+ MMLU_SV_CONFIG = DatasetConfig(
53
+ name="mmlu-sv",
54
+ pretty_name="the truncated version of the Swedish knowledge dataset MMLU-sv, "
55
+ "translated from the English MMLU dataset",
56
+ huggingface_id="EuroEval/mmlu-sv-mini",
57
+ task=KNOW,
58
+ languages=[SV],
59
+ )
60
+
61
+ HELLASWAG_SV_CONFIG = DatasetConfig(
62
+ name="hellaswag-sv",
63
+ pretty_name="the truncated version of the Swedish common-sense reasoning dataset "
64
+ "HellaSwag-sv, translated from the English HellaSwag dataset",
65
+ huggingface_id="EuroEval/hellaswag-sv-mini",
66
+ task=COMMON_SENSE,
67
+ languages=[SV],
68
+ )
69
+
70
+
71
+ ### Unofficial datasets ###
72
+
73
+ SCHIBSTED_SV_CONFIG = DatasetConfig(
74
+ name="schibsted-sv",
75
+ pretty_name="the Swedish summarisation dataset Schibsted-sv",
76
+ huggingface_id="EuroEval/schibsted-article-summaries-sv",
77
+ task=SUMM,
78
+ languages=[SV],
79
+ unofficial=True,
80
+ )
81
+
82
+ ARC_SV_CONFIG = DatasetConfig(
83
+ name="arc-sv",
84
+ pretty_name="the truncated version of the Swedish knowledge dataset ARC-sv, "
85
+ "translated from the English ARC dataset",
86
+ huggingface_id="EuroEval/arc-sv-mini",
87
+ task=KNOW,
88
+ languages=[SV],
89
+ unofficial=True,
90
+ )
91
+
92
+ BELEBELE_SV_CONFIG = DatasetConfig(
93
+ name="belebele-sv",
94
+ pretty_name="the Swedish multiple choice reading comprehension dataset "
95
+ "BeleBele-sv, translated from the English BeleBele dataset",
96
+ huggingface_id="EuroEval/belebele-sv-mini",
97
+ task=MCRC,
98
+ languages=[SV],
99
+ unofficial=True,
100
+ )
euroeval/exceptions.py CHANGED
@@ -7,7 +7,7 @@ class InvalidBenchmark(Exception):
7
7
  def __init__(
8
8
  self, message: str = "This model cannot be benchmarked on the given dataset."
9
9
  ) -> None:
10
- """Initialize the exception.
10
+ """Initialise the exception.
11
11
 
12
12
  Args:
13
13
  message:
@@ -23,7 +23,7 @@ class InvalidModel(Exception):
23
23
  def __init__(
24
24
  self, message: str = "The model cannot be benchmarked on any datasets."
25
25
  ) -> None:
26
- """Initialize the exception.
26
+ """Initialise the exception.
27
27
 
28
28
  Args:
29
29
  message:
@@ -39,7 +39,7 @@ class HuggingFaceHubDown(Exception):
39
39
  def __init__(
40
40
  self, message: str = "The Hugging Face Hub is currently down."
41
41
  ) -> None:
42
- """Initialize the exception.
42
+ """Initialise the exception.
43
43
 
44
44
  Args:
45
45
  message:
@@ -55,7 +55,7 @@ class NoInternetConnection(Exception):
55
55
  def __init__(
56
56
  self, message: str = "There is currently no internet connection."
57
57
  ) -> None:
58
- """Initialize the exception.
58
+ """Initialise the exception.
59
59
 
60
60
  Args:
61
61
  message:
@@ -71,7 +71,7 @@ class NaNValueInModelOutput(Exception):
71
71
  def __init__(
72
72
  self, message: str = "There is a NaN value in the model output."
73
73
  ) -> None:
74
- """Initialize the exception.
74
+ """Initialise the exception.
75
75
 
76
76
  Args:
77
77
  message:
@@ -93,7 +93,7 @@ class FlashAttentionNotInstalled(Exception):
93
93
  "pip install flash-attn --no-build-isolation`."
94
94
  ),
95
95
  ) -> None:
96
- """Initialize the exception.
96
+ """Initialise the exception.
97
97
 
98
98
  Args:
99
99
  message:
@@ -107,7 +107,7 @@ class NeedsExtraInstalled(InvalidModel):
107
107
  """The evaluation requires extra to be installed."""
108
108
 
109
109
  def __init__(self, extra: str) -> None:
110
- """Initialize the exception.
110
+ """Initialise the exception.
111
111
 
112
112
  Args:
113
113
  extra:
@@ -126,7 +126,7 @@ class NeedsManualDependency(InvalidModel):
126
126
  """The evaluation requires a dependency to be manually installed."""
127
127
 
128
128
  def __init__(self, package: str) -> None:
129
- """Initialize the exception.
129
+ """Initialise the exception.
130
130
 
131
131
  Args:
132
132
  package:
@@ -146,7 +146,7 @@ class NeedsAdditionalArgument(InvalidModel):
146
146
  def __init__(
147
147
  self, cli_argument: str, script_argument: str, run_with_cli: bool
148
148
  ) -> None:
149
- """Initialize the exception.
149
+ """Initialise the exception.
150
150
 
151
151
  Args:
152
152
  cli_argument:
@@ -177,7 +177,7 @@ class NeedsEnvironmentVariable(InvalidModel):
177
177
  """The evaluation requires an environment variable to be set."""
178
178
 
179
179
  def __init__(self, env_var: str) -> None:
180
- """Initialize the exception.
180
+ """Initialise the exception.
181
181
 
182
182
  Args:
183
183
  env_var:
euroeval/finetuning.py CHANGED
@@ -7,14 +7,13 @@ import typing as t
7
7
  import torch
8
8
  from datasets import DatasetDict
9
9
  from tqdm.auto import tqdm
10
- from transformers import (
10
+ from transformers.trainer_callback import (
11
11
  EarlyStoppingCallback,
12
- IntervalStrategy,
13
12
  PrinterCallback,
14
13
  ProgressCallback,
15
- TrainingArguments,
16
14
  )
17
- from transformers.trainer import OptimizerNames
15
+ from transformers.trainer_utils import IntervalStrategy
16
+ from transformers.training_args import OptimizerNames, TrainingArguments
18
17
 
19
18
  from .benchmark_modules import BenchmarkModule
20
19
  from .callbacks import NeverLeaveProgressCallback
@@ -67,9 +66,6 @@ def finetune(
67
66
  else:
68
67
  dtype = DataType.FP32
69
68
 
70
- # TEMP
71
- dtype = DataType.FP32
72
-
73
69
  bs: int = benchmark_config.batch_size
74
70
  scores: list[dict[str, float]] = list()
75
71
  for idx in tqdm(
@@ -212,7 +208,7 @@ def finetune_single_iteration(
212
208
 
213
209
  if not benchmark_config.verbose:
214
210
 
215
- def no_logging(logs: dict[str, float]) -> None:
211
+ def no_logging(logs: dict[str, float], start_time: float | None = None) -> None:
216
212
  return
217
213
 
218
214
  trainer.log = no_logging
@@ -292,7 +288,7 @@ def get_training_args(
292
288
 
293
289
  training_args = TrainingArguments(
294
290
  output_dir=model_config.model_cache_dir,
295
- evaluation_strategy=IntervalStrategy.STEPS,
291
+ eval_strategy=IntervalStrategy.STEPS,
296
292
  logging_strategy=logging_strategy,
297
293
  save_strategy=IntervalStrategy.STEPS,
298
294
  eval_steps=30,
@@ -304,11 +300,11 @@ def get_training_args(
304
300
  save_total_limit=1,
305
301
  per_device_train_batch_size=batch_size,
306
302
  per_device_eval_batch_size=batch_size,
303
+ optim=OptimizerNames.ADAMW_TORCH,
307
304
  learning_rate=2e-5,
308
305
  warmup_ratio=0.01,
309
306
  gradient_accumulation_steps=32 // batch_size,
310
307
  load_best_model_at_end=True,
311
- optim=OptimizerNames.ADAMW_TORCH,
312
308
  seed=4242 + iteration_idx,
313
309
  fp16=dtype == DataType.FP16,
314
310
  bf16=dtype == DataType.BF16,
euroeval/generation.py CHANGED
@@ -133,6 +133,7 @@ def generate_single_iteration(
133
133
  all_preds: list[str] = list()
134
134
 
135
135
  if len(non_cached_dataset) > 0:
136
+ itr: t.Iterable
136
137
  match model.batching_preference:
137
138
  case BatchingPreference.SINGLE_SAMPLE:
138
139
  itr = tqdm(iterable=non_cached_dataset, leave=False)
@@ -17,7 +17,7 @@ from .dataset_configs import SPEED_CONFIG, get_all_dataset_configs
17
17
  from .enums import GenerativeType, TaskGroup
18
18
  from .exceptions import NeedsExtraInstalled
19
19
  from .scores import aggregate_scores
20
- from .task_utils import (
20
+ from .task_group_utils import (
21
21
  question_answering,
22
22
  sequence_classification,
23
23
  text_to_text,
@@ -44,7 +44,7 @@ class HumanEvaluator:
44
44
  description: str,
45
45
  dummy_model_id: str = "mistralai/Mistral-7B-v0.1",
46
46
  ) -> None:
47
- """Initialize the HumanEvaluator.
47
+ """Initialise the HumanEvaluator.
48
48
 
49
49
  Args:
50
50
  annotator_id:
euroeval/languages.py CHANGED
@@ -17,6 +17,26 @@ def get_all_languages() -> dict[str, Language]:
17
17
  return {cfg.code: cfg for cfg in globals().values() if isinstance(cfg, Language)}
18
18
 
19
19
 
20
+ ### Currently Supported Languages ###
21
+ DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
22
+ NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
23
+ EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
24
+ FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
25
+ FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
26
+ DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
27
+ IS = Language(code="is", name="Icelandic", _and_separator="og", _or_separator="eða")
28
+ IT = Language(code="it", name="Italian", _and_separator="e", _or_separator="o")
29
+ NO = Language(code="no", name="Norwegian", _and_separator="og", _or_separator="eller")
30
+ NB = Language(
31
+ code="nb", name="Norwegian Bokmål", _and_separator="og", _or_separator="eller"
32
+ )
33
+ NN = Language(
34
+ code="nn", name="Norwegian Nynorsk", _and_separator="og", _or_separator="eller"
35
+ )
36
+ ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
37
+ SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
38
+
39
+
20
40
  AB = Language(code="ab", name="Abkhazian")
21
41
  AA = Language(code="aa", name="Afar")
22
42
  AF = Language(code="af", name="Afrikaans")
@@ -52,25 +72,19 @@ CO = Language(code="co", name="Corsican")
52
72
  CR = Language(code="cr", name="Cree")
53
73
  HR = Language(code="hr", name="Croatian")
54
74
  CS = Language(code="cs", name="Czech")
55
- DA = Language(code="da", name="Danish")
56
75
  DV = Language(code="dv", name="Divehi")
57
- NL = Language(code="nl", name="Dutch")
58
76
  DZ = Language(code="dz", name="Dzongkha")
59
- EN = Language(code="en", name="English")
60
77
  EO = Language(code="eo", name="Esperanto")
61
78
  ET = Language(code="et", name="Estonian")
62
79
  EE = Language(code="ee", name="Ewe")
63
- FO = Language(code="fo", name="Faroese")
64
80
  FJ = Language(code="fj", name="Fijian")
65
81
  FI = Language(code="fi", name="Finnish")
66
- FR = Language(code="fr", name="French")
67
82
  FY = Language(code="fy", name="Western Frisian")
68
83
  FF = Language(code="ff", name="Fulah")
69
84
  GD = Language(code="gd", name="Gaelic")
70
85
  GL = Language(code="gl", name="Galician")
71
86
  LG = Language(code="lg", name="Ganda")
72
87
  KA = Language(code="ka", name="Georgian")
73
- DE = Language(code="de", name="German")
74
88
  EL = Language(code="el", name="Greek")
75
89
  KL = Language(code="kl", name="Greenlandic")
76
90
  GN = Language(code="gn", name="Guarani")
@@ -82,7 +96,6 @@ HZ = Language(code="hz", name="Herero")
82
96
  HI = Language(code="hi", name="Hindi")
83
97
  HO = Language(code="ho", name="Hiri Motu")
84
98
  HU = Language(code="hu", name="Hungarian")
85
- IS = Language(code="is", name="Icelandic")
86
99
  IO = Language(code="io", name="Ido")
87
100
  IG = Language(code="ig", name="Igbo")
88
101
  ID = Language(code="id", name="Indonesian")
@@ -91,7 +104,6 @@ IE = Language(code="ie", name="Interlingue")
91
104
  IU = Language(code="iu", name="Inuktitut")
92
105
  IK = Language(code="ik", name="Inupiaq")
93
106
  GA = Language(code="ga", name="Irish")
94
- IT = Language(code="it", name="Italian")
95
107
  JA = Language(code="ja", name="Japanese")
96
108
  KN = Language(code="kn", name="Kannada")
97
109
  KR = Language(code="kr", name="Kanuri")
@@ -130,9 +142,6 @@ ND = Language(code="nd", name="Northern Ndebele")
130
142
  NR = Language(code="nr", name="South Ndebele")
131
143
  NG = Language(code="ng", name="Ndonga")
132
144
  NE = Language(code="ne", name="Nepali")
133
- NO = Language(code="no", name="Norwegian")
134
- NB = Language(code="nb", name="Norwegian Bokmål")
135
- NN = Language(code="nn", name="Norwegian Nynorsk")
136
145
  II = Language(code="ii", name="Sichuan Yi")
137
146
  OC = Language(code="oc", name="Occitan")
138
147
  OJ = Language(code="oj", name="Ojibwa")
@@ -163,11 +172,9 @@ SK = Language(code="sk", name="Slovak")
163
172
  SL = Language(code="sl", name="Slovenian")
164
173
  SO = Language(code="so", name="Somali")
165
174
  ST = Language(code="st", name="Sotho")
166
- ES = Language(code="es", name="Spanish")
167
175
  SU = Language(code="su", name="Sundanese")
168
176
  SW = Language(code="sw", name="Swahili")
169
177
  SS = Language(code="ss", name="Swati")
170
- SV = Language(code="sv", name="Swedish")
171
178
  TL = Language(code="tl", name="Tagalog")
172
179
  TY = Language(code="ty", name="Tahitian")
173
180
  TG = Language(code="tg", name="Tajik")
euroeval/model_cache.py CHANGED
@@ -38,7 +38,7 @@ class ModelCache:
38
38
  def __init__(
39
39
  self, model_cache_dir: "Path", cache_name: str, max_generated_tokens: int
40
40
  ) -> None:
41
- """Initialize the model output cache.
41
+ """Initialise the model output cache.
42
42
 
43
43
  Args:
44
44
  model_cache_dir:
euroeval/model_loading.py CHANGED
@@ -8,9 +8,8 @@ from .benchmark_modules import (
8
8
  LiteLLMModel,
9
9
  VLLMModel,
10
10
  )
11
- from .constants import GENERATIVE_DATASET_TASK_GROUPS
12
11
  from .enums import InferenceBackend, ModelType
13
- from .exceptions import InvalidBenchmark, InvalidModel
12
+ from .exceptions import InvalidModel
14
13
 
15
14
  if t.TYPE_CHECKING:
16
15
  from .benchmark_modules import BenchmarkModule
@@ -59,16 +58,6 @@ def load_model(
59
58
  f"inference backend {model_config.inference_backend!r}."
60
59
  )
61
60
 
62
- # Refuse to benchmark non-generative models on generative tasks
63
- if (
64
- dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
65
- and not model_config.model_type == ModelType.GENERATIVE
66
- ):
67
- raise InvalidBenchmark(
68
- f"Cannot benchmark non-generative model {model_config.model_id!r} on "
69
- f"generative task {dataset_config.task.name!r}."
70
- )
71
-
72
61
  model = model_class(
73
62
  model_config=model_config,
74
63
  dataset_config=dataset_config,
@@ -0,0 +1,8 @@
1
+ """The different prompt templates used in EuroEval."""
2
+
3
+ from .linguistic_acceptability import LA_TEMPLATES
4
+ from .multiple_choice import MULTIPLE_CHOICE_TEMPLATES
5
+ from .named_entity_recognition import NER_TEMPLATES
6
+ from .reading_comprehension import RC_TEMPLATES
7
+ from .sentiment_classification import SENT_TEMPLATES
8
+ from .summarization import SUMM_TEMPLATES