EuroEval: euroeval-15.10.1-py3-none-any.whl → euroeval-15.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. euroeval/__init__.py +7 -0
  2. euroeval/benchmark_config_factory.py +7 -0
  3. euroeval/benchmark_modules/base.py +29 -29
  4. euroeval/benchmark_modules/fresh.py +31 -19
  5. euroeval/benchmark_modules/hf.py +27 -23
  6. euroeval/benchmark_modules/litellm.py +50 -30
  7. euroeval/benchmark_modules/vllm.py +22 -26
  8. euroeval/benchmarker.py +8 -1
  9. euroeval/callbacks.py +17 -13
  10. euroeval/cli.py +10 -0
  11. euroeval/data_loading.py +10 -5
  12. euroeval/data_models.py +9 -40
  13. euroeval/dataset_configs/__init__.py +1 -0
  14. euroeval/dataset_configs/english.py +13 -4
  15. euroeval/dataset_configs/norwegian.py +8 -0
  16. euroeval/dataset_configs/portuguese.py +74 -0
  17. euroeval/dataset_configs/spanish.py +4 -3
  18. euroeval/finetuning.py +9 -8
  19. euroeval/generation.py +27 -8
  20. euroeval/human_evaluation.py +14 -13
  21. euroeval/languages.py +1 -2
  22. euroeval/metrics.py +452 -0
  23. euroeval/prompt_templates/linguistic_acceptability.py +9 -1
  24. euroeval/prompt_templates/multiple_choice.py +9 -1
  25. euroeval/prompt_templates/named_entity_recognition.py +20 -1
  26. euroeval/prompt_templates/sentiment_classification.py +11 -1
  27. euroeval/prompt_templates/summarization.py +8 -1
  28. euroeval/scores.py +14 -19
  29. euroeval/speed_benchmark.py +6 -7
  30. euroeval/task_group_utils/multiple_choice_classification.py +6 -4
  31. euroeval/task_group_utils/question_answering.py +5 -28
  32. euroeval/task_group_utils/sequence_classification.py +6 -30
  33. euroeval/task_group_utils/text_to_text.py +19 -34
  34. euroeval/task_group_utils/token_classification.py +18 -30
  35. euroeval/tasks.py +11 -136
  36. euroeval/types.py +6 -4
  37. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
  38. euroeval-15.12.0.dist-info/RECORD +63 -0
  39. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
  40. euroeval-15.10.1.dist-info/RECORD +0 -61
  41. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
  42. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/benchmark_modules/vllm.py CHANGED
@@ -13,14 +13,11 @@ from pathlib import Path
  from time import sleep

  import torch
- from datasets import DatasetDict
  from huggingface_hub import snapshot_download
  from pydantic import conlist, create_model
  from tqdm.auto import tqdm
  from transformers.models.auto.configuration_auto import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer
- from transformers.tokenization_utils import PreTrainedTokenizer
- from transformers.trainer import Trainer
  from urllib3.exceptions import RequestError

  from ..constants import (
@@ -34,13 +31,7 @@ from ..constants import (
      TASKS_USING_JSON,
      VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
  )
- from ..data_models import (
-     BenchmarkConfig,
-     DatasetConfig,
-     GenerativeModelOutput,
-     ModelConfig,
-     Task,
- )
+ from ..data_models import GenerativeModelOutput, ModelConfig
  from ..enums import (
      BatchingPreference,
      GenerativeType,
@@ -94,6 +85,13 @@ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
      import ray

+ if t.TYPE_CHECKING:
+     from datasets import DatasetDict
+     from transformers.tokenization_utils import PreTrainedTokenizer
+     from transformers.trainer import Trainer
+
+     from ..data_models import BenchmarkConfig, DatasetConfig, Task
+
  logger = logging.getLogger("euroeval")


@@ -106,9 +104,9 @@ class VLLMModel(HuggingFaceEncoderModel):

      def __init__(
          self,
-         model_config: ModelConfig,
-         dataset_config: DatasetConfig,
-         benchmark_config: BenchmarkConfig,
+         model_config: "ModelConfig",
+         dataset_config: "DatasetConfig",
+         benchmark_config: "BenchmarkConfig",
      ) -> None:
          """Initialise the vLLM model.

@@ -129,8 +127,8 @@ class VLLMModel(HuggingFaceEncoderModel):
          model, tokenizer = load_model_and_tokenizer(
              model_config=model_config, benchmark_config=benchmark_config
          )
-         self._model: LLM = model
-         self._tokenizer: PreTrainedTokenizer = tokenizer
+         self._model: "LLM" = model
+         self._tokenizer: "PreTrainedTokenizer" = tokenizer
          self.end_of_reasoning_token = get_end_of_reasoning_token(
              model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
          )
@@ -230,8 +228,8 @@ class VLLMModel(HuggingFaceEncoderModel):
          )

      def prepare_dataset(
-         self, dataset: DatasetDict, task: Task, itr_idx: int
-     ) -> DatasetDict:
+         self, dataset: "DatasetDict", task: "Task", itr_idx: int
+     ) -> "DatasetDict":
          """Prepare the dataset for the model.

          This includes things like tokenisation.
@@ -293,7 +291,7 @@ class VLLMModel(HuggingFaceEncoderModel):

          return dataset

-     def generate(self, inputs: dict) -> GenerativeModelOutput:
+     def generate(self, inputs: dict) -> "GenerativeModelOutput":
          """Generate outputs from the model.

          Args:
@@ -524,7 +522,7 @@ class VLLMModel(HuggingFaceEncoderModel):

      @classmethod
      def model_exists(
-         cls, model_id: str, benchmark_config: BenchmarkConfig
+         cls, model_id: str, benchmark_config: "BenchmarkConfig"
      ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
          """Check if a model exists.

@@ -558,8 +556,8 @@ class VLLMModel(HuggingFaceEncoderModel):

      @classmethod
      def get_model_config(
-         cls, model_id: str, benchmark_config: BenchmarkConfig
-     ) -> ModelConfig:
+         cls, model_id: str, benchmark_config: "BenchmarkConfig"
+     ) -> "ModelConfig":
          """Fetch the model configuration.

          Args:
@@ -628,8 +626,8 @@ class VLLMModel(HuggingFaceEncoderModel):


  def load_model_and_tokenizer(
-     model_config: ModelConfig, benchmark_config: BenchmarkConfig
- ) -> "tuple[LLM, PreTrainedTokenizer]":
+     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+ ) -> tuple["LLM", "PreTrainedTokenizer"]:
      """Load the model and tokenizer.

      Args:
@@ -759,7 +757,7 @@ def load_model_and_tokenizer(
      model = LLM(
          model=model_id,
          tokenizer=model_id,
-         gpu_memory_utilization=0.9,
+         gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
          max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
          download_dir=download_dir,
          trust_remote_code=benchmark_config.trust_remote_code,
@@ -1017,7 +1015,6 @@ def get_custom_stop_tokens(
      """
      candidate_stop_tokens = CUSTOM_STOP_TOKENS

-     # Create a prompt to check if the model uses the reasoning tokens
      prompt = "Hello"
      if tokenizer.chat_template is not None:
          templated_prompt = tokenizer.apply_chat_template(
@@ -1028,7 +1025,6 @@ def get_custom_stop_tokens(
          assert isinstance(templated_prompt, str)
          prompt = templated_prompt

-     # Check that the beginning-of-reasoning token is actually used by the model
      max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
      completion = (
          model.generate(
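Note: the import changes above (and the matching ones in callbacks.py, data_loading.py, data_models.py and finetuning.py below) all apply the same pattern: imports needed only for type annotations move behind a `t.TYPE_CHECKING` guard and the annotations become strings, so those modules are no longer imported at runtime. A minimal, self-contained sketch of the pattern (the `num_test_rows` helper is illustrative, not part of EuroEval):

    import typing as t

    if t.TYPE_CHECKING:
        # Only evaluated by static type checkers, so the heavy `datasets`
        # dependency is not imported when this module is loaded at runtime.
        from datasets import DatasetDict


    def num_test_rows(dataset: "DatasetDict") -> int:
        # The quoted annotation means `DatasetDict` need not exist at runtime.
        return len(dataset["test"])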
euroeval/benchmarker.py CHANGED
@@ -78,6 +78,7 @@ class Benchmarker:
          num_iterations: int = 10,
          api_base: str | None = None,
          api_version: str | None = None,
+         gpu_memory_utilization: float = 0.9,
          debug: bool = False,
          run_with_cli: bool = False,
          only_allow_safetensors: bool = False,
@@ -145,6 +146,11 @@ class Benchmarker:
                  to a model on an inference API. Defaults to None.
              api_version:
                  The version of the API to use. Defaults to None.
+             gpu_memory_utilization:
+                 The GPU memory utilization to use for vLLM. Only relevant if the model
+                 is generative. A larger value will result in faster evaluation, but at
+                 the risk of running out of GPU memory. Only reduce this if you are
+                 running out of GPU memory. Defaults to 0.9.
              debug:
                  Whether to output debug information. Defaults to False.
              run_with_cli:
@@ -192,6 +198,7 @@ class Benchmarker:
              num_iterations=num_iterations,
              api_base=api_base,
              api_version=api_version,
+             gpu_memory_utilization=gpu_memory_utilization,
              debug=debug,
              run_with_cli=run_with_cli,
              only_allow_safetensors=only_allow_safetensors,
@@ -767,7 +774,7 @@ class Benchmarker:

          results = log_scores(
              dataset_name=dataset_config.pretty_name,
-             metric_configs=dataset_config.task.metrics,
+             metrics=dataset_config.task.metrics,
              scores=scores,
              model_id=model_config.model_id,
              model_revision=model_config.revision,
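The new `gpu_memory_utilization` argument is forwarded straight to vLLM (see the `LLM(...)` call in the vllm.py hunk above). A hedged usage sketch of the Python API, assuming the documented `Benchmarker.benchmark()` signature and using an arbitrary example model id together with the `life-in-the-uk` dataset added in this release:

    from euroeval import Benchmarker

    # Lower the vLLM GPU memory utilisation from the default 0.9 only if
    # evaluation runs out of GPU memory; higher values evaluate faster.
    benchmarker = Benchmarker(gpu_memory_utilization=0.8)
    benchmarker.benchmark(model="mistralai/Mistral-7B-v0.1", dataset="life-in-the-uk")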
euroeval/callbacks.py CHANGED
@@ -1,12 +1,16 @@
  """Callbacks for the Hugging Face Trainer."""

  import sys
+ import typing as t
  from collections.abc import Sized

- from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
- from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState
- from transformers.training_args import TrainingArguments
+ from transformers.trainer_callback import ProgressCallback
+
+ if t.TYPE_CHECKING:
+     from torch.utils.data import DataLoader
+     from transformers.trainer_callback import TrainerControl, TrainerState
+     from transformers.training_args import TrainingArguments


  class NeverLeaveProgressCallback(ProgressCallback):
@@ -20,9 +24,9 @@ class NeverLeaveProgressCallback(ProgressCallback):

      def on_train_begin(
          self,
-         args: TrainingArguments,
-         state: TrainerState,
-         control: TrainerControl,
+         args: "TrainingArguments",
+         state: "TrainerState",
+         control: "TrainerControl",
          **kwargs: str,
      ) -> None:
          """Callback actions when training begins."""
@@ -38,9 +42,9 @@ class NeverLeaveProgressCallback(ProgressCallback):

      def on_step_end(
          self,
-         args: TrainingArguments,
-         state: TrainerState,
-         control: TrainerControl,
+         args: "TrainingArguments",
+         state: "TrainerState",
+         control: "TrainerControl",
          **kwargs: str,
      ) -> None:
          """Callback actions when a training step ends."""
@@ -50,10 +54,10 @@ class NeverLeaveProgressCallback(ProgressCallback):

      def on_prediction_step(
          self,
-         args: TrainingArguments,
-         state: TrainerState,
-         control: TrainerControl,
-         eval_dataloader: DataLoader | None = None,
+         args: "TrainingArguments",
+         state: "TrainerState",
+         control: "TrainerControl",
+         eval_dataloader: "DataLoader | None" = None,
          **kwargs: str,
      ) -> None:
          """Callback actions when a prediction step ends."""
euroeval/cli.py CHANGED
@@ -186,6 +186,14 @@ from .tasks import get_all_tasks
      help="The version of the API to use. Only relevant if `model` refers to a model on "
      "an inference API.",
  )
+ @click.option(
+     "--gpu-memory-utilization",
+     default=0.9,
+     show_default=True,
+     help="The GPU memory utilization to use for vLLM. A larger value will result in "
+     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
+     "if you are running out of GPU memory. Only relevant if the model is generative.",
+ )
  @click.option(
      "--debug/--no-debug",
      default=False,
@@ -223,6 +231,7 @@ def benchmark(
      num_iterations: int,
      api_base: str | None,
      api_version: str | None,
+     gpu_memory_utilization: float,
      debug: bool,
      only_allow_safetensors: bool,
  ) -> None:
@@ -258,6 +267,7 @@ def benchmark(
          num_iterations=num_iterations,
          api_base=api_base,
          api_version=api_version,
+         gpu_memory_utilization=gpu_memory_utilization,
          debug=debug,
          run_with_cli=True,
          only_allow_safetensors=only_allow_safetensors,
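For the command line, the same setting is exposed as a flag; assuming the package's existing `euroeval` entry point, an invocation would look like `euroeval --model <model-id> --gpu-memory-utilization 0.8`, with the default of 0.9 applied when the flag is omitted.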
euroeval/data_loading.py CHANGED
@@ -3,23 +3,28 @@
  import logging
  import sys
  import time
+ import typing as t

  import requests
- from datasets import Dataset, DatasetDict, load_dataset
+ from datasets import DatasetDict, load_dataset
  from datasets.exceptions import DatasetsError
  from huggingface_hub.errors import HfHubHTTPError
  from numpy.random import Generator

- from .data_models import BenchmarkConfig, DatasetConfig
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
  from .utils import unscramble

+ if t.TYPE_CHECKING:
+     from datasets import Dataset
+
+     from .data_models import BenchmarkConfig, DatasetConfig
+
  logger = logging.getLogger("euroeval")


  def load_data(
      rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
- ) -> list[DatasetDict]:
+ ) -> list["DatasetDict"]:
      """Load the raw bootstrapped datasets.

      Args:
@@ -56,7 +61,7 @@ def load_data(
          dataset["test"] = dataset["test"].select(range(1))

      # Bootstrap the splits
-     bootstrapped_splits: dict[str, list[Dataset]] = dict()
+     bootstrapped_splits: dict[str, list["Dataset"]] = dict()
      for split in ["train", "val", "test"]:
          bootstrap_indices = rng.integers(
              0,
@@ -80,7 +85,7 @@ def load_data(
      return datasets


- def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
      """Load the raw dataset.

      Args:
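As the `load_data` hunk shows, the bootstrapped datasets are produced by resampling each split with replacement using the seeded `numpy` generator passed in as `rng`. A minimal sketch of that idea, using a plain list as a stand-in for a `Dataset` split (names and seed are illustrative):

    import numpy as np

    rng = np.random.default_rng(seed=4242)
    split = ["ex-0", "ex-1", "ex-2", "ex-3"]  # stand-in for a Dataset split

    # Draw len(split) indices with replacement, then select those rows.
    bootstrap_indices = rng.integers(0, len(split), size=len(split))
    bootstrapped_split = [split[i] for i in bootstrap_indices]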
euroeval/data_models.py CHANGED
@@ -1,6 +1,5 @@
  """Data models used in EuroEval."""

- import collections.abc as c
  import json
  import pathlib
  import re
@@ -11,48 +10,11 @@ import pydantic
  import torch

  from .enums import Device, InferenceBackend, ModelType, TaskGroup
+ from .metrics import Metric
  from .types import ScoreDict
  from .utils import get_package_version


- @dataclass
- class MetricConfig:
-     """Configuration for a metric.
-
-     Attributes:
-         name:
-             The name of the metric.
-         pretty_name:
-             A longer prettier name for the metric, which allows cases and spaces. Used
-             for logging.
-         huggingface_id:
-             The Hugging Face ID of the metric.
-         results_key:
-             The name of the key used to extract the metric scores from the results
-             dictionary.
-         compute_kwargs:
-             Keyword arguments to pass to the metric's compute function. Defaults to
-             an empty dictionary.
-         postprocessing_fn:
-             A function to apply to the metric scores after they are computed, taking
-             the score to the postprocessed score along with its string representation.
-             Defaults to x -> (100 * x, f"{x:.2%}").
-     """
-
-     name: str
-     pretty_name: str
-     huggingface_id: str
-     results_key: str
-     compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
-     postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
-         default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
-     )
-
-     def __hash__(self) -> int:
-         """Return a hash of the metric configuration."""
-         return hash(self.name)
-
-
  @dataclass
  class Language:
      """A benchmarkable language.
@@ -147,7 +109,7 @@ class Task:
      name: str
      task_group: TaskGroup
      template_dict: dict["Language", "PromptConfig"]
-     metrics: list[MetricConfig]
+     metrics: list[Metric]
      default_num_few_shot_examples: int
      default_max_generated_tokens: int
      default_labels: list[str]
@@ -206,6 +168,11 @@ class BenchmarkConfig:
          api_version:
              The version of the API to use. Only relevant if `model` refers to a model on
              an inference API.
+         gpu_memory_utilization:
+             The GPU memory utilization to use for vLLM. A larger value will result in
+             faster evaluation, but at the risk of running out of GPU memory. Only reduce
+             this if you are running out of GPU memory. Only relevant if the model is
+             generative.
          debug:
              Whether to run the benchmark in debug mode.
          run_with_cli:
@@ -234,6 +201,7 @@ class BenchmarkConfig:
      num_iterations: int
      api_base: str | None
      api_version: str | None
+     gpu_memory_utilization: float
      debug: bool
      run_with_cli: bool
      only_allow_safetensors: bool
@@ -265,6 +233,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
      num_iterations: int
      api_base: str | None
      api_version: str | None
+     gpu_memory_utilization: float
      debug: bool
      run_with_cli: bool
      only_allow_safetensors: bool
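The `MetricConfig` dataclass removed here is superseded by the new `Metric` abstraction in euroeval/metrics.py. For reference, its default `postprocessing_fn` mapped a raw score in [0, 1] to a percentage together with its string form, i.e. it behaved like this standalone sketch:

    def postprocess(raw_score: float) -> tuple[float, str]:
        # Same behaviour as the removed default: scale to percent and format.
        return 100 * raw_score, f"{raw_score:.2%}"

    assert postprocess(0.875) == (87.5, "87.50%")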
euroeval/dataset_configs/__init__.py CHANGED
@@ -13,6 +13,7 @@ from .german import * # noqa: F403
  from .icelandic import * # noqa: F403
  from .italian import * # noqa: F403
  from .norwegian import * # noqa: F403
+ from .portuguese import * # noqa: F403
  from .spanish import * # noqa: F403
  from .swedish import * # noqa: F403

euroeval/dataset_configs/english.py CHANGED
@@ -49,10 +49,10 @@ CNN_DAILYMAIL_CONFIG = DatasetConfig(
      languages=[EN],
  )

- MMLU_CONFIG = DatasetConfig(
-     name="mmlu",
-     pretty_name="the truncated version of the English knowledge dataset MMLU",
-     huggingface_id="EuroEval/mmlu-mini",
+ LIFE_IN_THE_UK_CONFIG = DatasetConfig(
+     name="life-in-the-uk",
+     pretty_name="the English knowledge dataset Life in the UK",
+     huggingface_id="EuroEval/life-in-the-uk",
      task=KNOW,
      languages=[EN],
  )
@@ -86,3 +86,12 @@ BELEBELE_CONFIG = DatasetConfig(
      languages=[EN],
      unofficial=True,
  )
+
+ MMLU_CONFIG = DatasetConfig(
+     name="mmlu",
+     pretty_name="the truncated version of the English knowledge dataset MMLU",
+     huggingface_id="EuroEval/mmlu-mini",
+     task=KNOW,
+     languages=[EN],
+     unofficial=True,
+ )
euroeval/dataset_configs/norwegian.py CHANGED
@@ -76,6 +76,14 @@ NRK_QUIZ_QA_CONFIG = DatasetConfig(
      languages=[NB, NN, NO],
  )

+ IDIOMS_NO_CONFIG = DatasetConfig(
+     name="idioms-no",
+     pretty_name="the Norwegian knowledge dataset Idioms-no",
+     huggingface_id="EuroEval/idioms-no",
+     task=KNOW,
+     languages=[NB, NN, NO],
+ )
+
  NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
      name="nor-common-sense-qa",
      pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
euroeval/dataset_configs/portuguese.py ADDED
@@ -0,0 +1,74 @@
+ """All Portuguese dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import PT
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
+
+ ### Official datasets ###
+
+ SST2_PT_CONFIG = DatasetConfig(
+     name="sst2-pt",
+     pretty_name="the truncated version of the Portuguese sentiment classification "
+     "dataset SST2-pt, translated from the English SST2 dataset",
+     huggingface_id="EuroEval/sst2-pt-mini",
+     task=SENT,
+     languages=[PT],
+     _labels=["positive", "negative"],
+ )
+
+
+ MMLU_PT_CONFIG = DatasetConfig(
+     name="mmlu-pt",
+     pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
+     "translated from the English MMLU dataset",
+     huggingface_id="EuroEval/mmlu-pt-mini",
+     task=KNOW,
+     languages=[PT],
+ )
+
+
+ GOLDENSWAG_PT_CONFIG = DatasetConfig(
+     name="goldenswag-pt",
+     pretty_name="the truncated version of the Portuguese common-sense reasoning "
+     "dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
+     huggingface_id="EuroEval/goldenswag-pt-mini",
+     task=COMMON_SENSE,
+     languages=[PT],
+ )
+
+
+ SCALA_PT = DatasetConfig(
+     name="scala-pt",
+     pretty_name="the Portuguese part of the linguistic acceptability dataset ScaLA",
+     huggingface_id="EuroEval/scala-pt",
+     task=LA,
+     languages=[PT],
+ )
+
+ HAREM_CONFIG = DatasetConfig(
+     name="harem",
+     pretty_name="the Portuguese named entity recognition dataset HAREM",
+     huggingface_id="EuroEval/harem",
+     task=NER,
+     languages=[PT],
+ )
+
+ PUBLICO_CONFIG = DatasetConfig(
+     name="publico",
+     pretty_name="the truncated version of the Portuguese summarisation dataset Público",
+     huggingface_id="EuroEval/publico-mini",
+     task=SUMM,
+     languages=[PT],
+ )
+
+
+ ### Unofficial datasets ###
+
+ BOOLQ_PT_CONFIG = DatasetConfig(
+     name="boolq-pt",
+     pretty_name="the Portuguese multiple choice reading comprehension dataset "
+     "BoolQ-pt, translated from the English BoolQ dataset",
+     huggingface_id="EuroEval/boolq-pt",
+     task=MCRC,
+     languages=[PT],
+ )
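With these configurations in place, the new Portuguese datasets are selectable by their `name` fields. A hedged usage sketch, assuming the documented `Benchmarker.benchmark()` signature and an arbitrary example model id:

    from euroeval import Benchmarker

    # "scala-pt" is the `name` field of the ScaLA-pt config above;
    # the model id is only an example.
    Benchmarker().benchmark(model="google/gemma-2-2b-it", dataset="scala-pt")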
euroeval/dataset_configs/spanish.py CHANGED
@@ -8,7 +8,8 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

  SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
      name="sentiment-headlines-es",
-     pretty_name="the truncated version of the Spanish sentiment headlines dataset",
+     pretty_name="the truncated version of the Spanish sentiment classification dataset "
+     "SentimentHeadlines",
      huggingface_id="EuroEval/sentiment-headlines-es",
      task=SENT,
      languages=[ES],
@@ -33,7 +34,7 @@ CONLL_ES_CONFIG = DatasetConfig(

  MLQA_ES_CONFIG = DatasetConfig(
      name="mlqa-es",
-     pretty_name="the Spanish version of the MLQA reading comprehension dataset",
+     pretty_name="the Spanish version of the reading comprehension dataset MLQA",
      huggingface_id="EuroEval/mlqa-es",
      task=RC,
      languages=[ES],
@@ -70,7 +71,7 @@ HELLASWAG_ES_CONFIG = DatasetConfig(

  XQUAD_ES_CONFIG = DatasetConfig(
      name="xquad-es",
-     pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
+     pretty_name="the Spanish version of the reading comprehension dataset XQuAD",
      huggingface_id="EuroEval/xquad-es",
      task=RC,
      languages=[ES],
euroeval/finetuning.py CHANGED
@@ -5,7 +5,6 @@ import sys
  import typing as t

  import torch
- from datasets import DatasetDict
  from tqdm.auto import tqdm
  from transformers.trainer_callback import (
      EarlyStoppingCallback,
@@ -15,7 +14,6 @@ from transformers.trainer_callback import (
  from transformers.trainer_utils import IntervalStrategy
  from transformers.training_args import OptimizerNames, TrainingArguments

- from .benchmark_modules import BenchmarkModule
  from .callbacks import NeverLeaveProgressCallback
  from .enums import DataType
  from .exceptions import InvalidBenchmark, NaNValueInModelOutput
@@ -28,14 +26,17 @@ from .utils import (
  )

  if t.TYPE_CHECKING:
+     from datasets import DatasetDict
+
+     from .benchmark_modules import BenchmarkModule
      from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

  logger = logging.getLogger("euroeval")


  def finetune(
-     model: BenchmarkModule,
-     datasets: list[DatasetDict],
+     model: "BenchmarkModule",
+     datasets: list["DatasetDict"],
      model_config: "ModelConfig",
      dataset_config: "DatasetConfig",
      benchmark_config: "BenchmarkConfig",
@@ -155,9 +156,9 @@ def finetune(


  def finetune_single_iteration(
-     model: BenchmarkModule | None,
-     dataset: DatasetDict,
-     training_args: TrainingArguments,
+     model: "BenchmarkModule | None",
+     dataset: "DatasetDict",
+     training_args: "TrainingArguments",
      model_config: "ModelConfig",
      dataset_config: "DatasetConfig",
      benchmark_config: "BenchmarkConfig",
@@ -254,7 +255,7 @@ def get_training_args(
      iteration_idx: int,
      dtype: DataType,
      batch_size: int | None = None,
- ) -> TrainingArguments:
+ ) -> "TrainingArguments":
      """Get the training arguments for the current iteration.

      Args: