euroeval-15.10.1-py3-none-any.whl → euroeval-15.12.0-py3-none-any.whl
- euroeval/__init__.py +7 -0
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +22 -26
- euroeval/benchmarker.py +8 -1
- euroeval/callbacks.py +17 -13
- euroeval/cli.py +10 -0
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +9 -40
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +27 -8
- euroeval/human_evaluation.py +14 -13
- euroeval/languages.py +1 -2
- euroeval/metrics.py +452 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
- euroeval-15.12.0.dist-info/RECORD +63 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
- euroeval-15.10.1.dist-info/RECORD +0 -61
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -13,14 +13,11 @@ from pathlib import Path
 from time import sleep
 
 import torch
-from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -34,13 +31,7 @@ from ..constants import (
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
+from ..data_models import GenerativeModelOutput, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -94,6 +85,13 @@ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray
 
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+
 logger = logging.getLogger("euroeval")
 
 
@@ -106,9 +104,9 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the vLLM model.
 
@@ -129,8 +127,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         model, tokenizer = load_model_and_tokenizer(
             model_config=model_config, benchmark_config=benchmark_config
         )
-        self._model: LLM = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "LLM" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
@@ -230,8 +228,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: "Task", itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.
 
         This includes things like tokenisation.
@@ -293,7 +291,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         return dataset
 
-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.
 
         Args:
@@ -524,7 +522,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -558,8 +556,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -628,8 +626,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig
-) -> tuple[LLM, PreTrainedTokenizer]:
+    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+) -> tuple["LLM", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.
 
     Args:
@@ -759,7 +757,7 @@ def load_model_and_tokenizer(
     model = LLM(
         model=model_id,
         tokenizer=model_id,
-        gpu_memory_utilization=
+        gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
         max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -1017,7 +1015,6 @@ def get_custom_stop_tokens(
     """
     candidate_stop_tokens = CUSTOM_STOP_TOKENS
 
-    # Create a prompt to check if the model uses the reasoning tokens
     prompt = "Hello"
    if tokenizer.chat_template is not None:
         templated_prompt = tokenizer.apply_chat_template(
@@ -1028,7 +1025,6 @@ def get_custom_stop_tokens(
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    # Check that the beginning-of-reasoning token is actually used by the model
     max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
     completion = (
         model.generate(
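Most of the edits in this module (and the same pattern recurs in callbacks.py, data_loading.py, data_models.py and finetuning.py below) move imports that are only needed for type hints behind `typing.TYPE_CHECKING` and quote the corresponding annotations, so the heavy dependencies are no longer imported at runtime. A minimal, self-contained sketch of the pattern, using an illustrative helper that is not part of EuroEval:

import typing as t

if t.TYPE_CHECKING:
    # Only evaluated by static type checkers; skipped at runtime, so importing
    # this module does not pull in the `datasets` package.
    from datasets import DatasetDict


def first_split(dataset: "DatasetDict") -> str:
    """Return the name of the first split (illustrative helper, not EuroEval API)."""
    # The quoted annotation is resolved lazily, so the module stays importable
    # even though `DatasetDict` is undefined here at runtime.
    return next(iter(dataset))
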
euroeval/benchmarker.py
CHANGED

@@ -78,6 +78,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
+        gpu_memory_utilization: float = 0.9,
         debug: bool = False,
         run_with_cli: bool = False,
         only_allow_safetensors: bool = False,
@@ -145,6 +146,11 @@ class Benchmarker:
                 to a model on an inference API. Defaults to None.
             api_version:
                 The version of the API to use. Defaults to None.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to 0.9.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -192,6 +198,7 @@ class Benchmarker:
             num_iterations=num_iterations,
             api_base=api_base,
             api_version=api_version,
+            gpu_memory_utilization=gpu_memory_utilization,
             debug=debug,
             run_with_cli=run_with_cli,
             only_allow_safetensors=only_allow_safetensors,
@@ -767,7 +774,7 @@ class Benchmarker:
 
         results = log_scores(
             dataset_name=dataset_config.pretty_name,
-
+            metrics=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
euroeval/callbacks.py
CHANGED

@@ -1,12 +1,16 @@
 """Callbacks for the Hugging Face Trainer."""
 
 import sys
+import typing as t
 from collections.abc import Sized
 
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers.trainer_callback import ProgressCallback
-
+from transformers.trainer_callback import ProgressCallback
+
+if t.TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+    from transformers.trainer_callback import TrainerControl, TrainerState
+    from transformers.training_args import TrainingArguments
 
 
 class NeverLeaveProgressCallback(ProgressCallback):
@@ -20,9 +24,9 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_train_begin(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when training begins."""
@@ -38,9 +42,9 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_step_end(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when a training step ends."""
@@ -50,10 +54,10 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_prediction_step(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        eval_dataloader: DataLoader | None = None,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
+        eval_dataloader: "DataLoader | None" = None,
         **kwargs: str,
     ) -> None:
         """Callback actions when a prediction step ends."""
euroeval/cli.py
CHANGED

@@ -186,6 +186,14 @@ from .tasks import get_all_tasks
     help="The version of the API to use. Only relevant if `model` refers to a model on "
     "an inference API.",
 )
+@click.option(
+    "--gpu-memory-utilization",
+    default=0.9,
+    show_default=True,
+    help="The GPU memory utilization to use for vLLM. A larger value will result in "
+    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
+    "if you are running out of GPU memory. Only relevant if the model is generative.",
+)
 @click.option(
     "--debug/--no-debug",
     default=False,
@@ -223,6 +231,7 @@ def benchmark(
     num_iterations: int,
     api_base: str | None,
     api_version: str | None,
+    gpu_memory_utilization: float,
     debug: bool,
     only_allow_safetensors: bool,
 ) -> None:
@@ -258,6 +267,7 @@ def benchmark(
         num_iterations=num_iterations,
         api_base=api_base,
         api_version=api_version,
+        gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=True,
         only_allow_safetensors=only_allow_safetensors,
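Taken together with the benchmarker.py change above, the new option is exposed both through the Python API and through the `--gpu-memory-utilization` CLI flag added here. A hedged usage sketch (the model ID is a placeholder, and the `benchmark` entry point is assumed from EuroEval's usual API rather than shown in this diff):

from euroeval import Benchmarker

# Lower vLLM's GPU memory utilisation from the 0.9 default only when the
# default configuration runs out of GPU memory.
benchmarker = Benchmarker(gpu_memory_utilization=0.8)
benchmarker.benchmark(model="<generative-model-id>")
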
euroeval/data_loading.py
CHANGED

@@ -3,23 +3,28 @@
 import logging
 import sys
 import time
+import typing as t
 
 import requests
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
-from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble
 
+if t.TYPE_CHECKING:
+    from datasets import Dataset
+
+    from .data_models import BenchmarkConfig, DatasetConfig
+
 logger = logging.getLogger("euroeval")
 
 
 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
-) -> list[DatasetDict]:
+) -> list["DatasetDict"]:
     """Load the raw bootstrapped datasets.
 
     Args:
@@ -56,7 +61,7 @@ def load_data(
         dataset["test"] = dataset["test"].select(range(1))
 
     # Bootstrap the splits
-    bootstrapped_splits: dict[str, list[Dataset]] = dict()
+    bootstrapped_splits: dict[str, list["Dataset"]] = dict()
     for split in ["train", "val", "test"]:
         bootstrap_indices = rng.integers(
             0,
@@ -80,7 +85,7 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
euroeval/data_models.py
CHANGED

@@ -1,6 +1,5 @@
 """Data models used in EuroEval."""
 
-import collections.abc as c
 import json
 import pathlib
 import re
@@ -11,48 +10,11 @@ import pydantic
 import torch
 
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
+from .metrics import Metric
 from .types import ScoreDict
 from .utils import get_package_version
 
 
-@dataclass
-class MetricConfig:
-    """Configuration for a metric.
-
-    Attributes:
-        name:
-            The name of the metric.
-        pretty_name:
-            A longer prettier name for the metric, which allows cases and spaces. Used
-            for logging.
-        huggingface_id:
-            The Hugging Face ID of the metric.
-        results_key:
-            The name of the key used to extract the metric scores from the results
-            dictionary.
-        compute_kwargs:
-            Keyword arguments to pass to the metric's compute function. Defaults to
-            an empty dictionary.
-        postprocessing_fn:
-            A function to apply to the metric scores after they are computed, taking
-            the score to the postprocessed score along with its string representation.
-            Defaults to x -> (100 * x, f"{x:.2%}").
-    """
-
-    name: str
-    pretty_name: str
-    huggingface_id: str
-    results_key: str
-    compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
-    postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
-        default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
-    )
-
-    def __hash__(self) -> int:
-        """Return a hash of the metric configuration."""
-        return hash(self.name)
-
-
 @dataclass
 class Language:
     """A benchmarkable language.
@@ -147,7 +109,7 @@ class Task:
     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[MetricConfig]
+    metrics: list[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
@@ -206,6 +168,11 @@ class BenchmarkConfig:
        api_version:
            The version of the API to use. Only relevant if `model` refers to a model on
            an inference API.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
         debug:
             Whether to run the benchmark in debug mode.
         run_with_cli:
@@ -234,6 +201,7 @@ class BenchmarkConfig:
     num_iterations: int
     api_base: str | None
     api_version: str | None
+    gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
     only_allow_safetensors: bool
@@ -265,6 +233,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     num_iterations: int
     api_base: str | None
     api_version: str | None
+    gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
     only_allow_safetensors: bool
euroeval/dataset_configs/__init__.py
CHANGED

@@ -13,6 +13,7 @@ from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
 
euroeval/dataset_configs/english.py
CHANGED

@@ -49,10 +49,10 @@ CNN_DAILYMAIL_CONFIG = DatasetConfig(
     languages=[EN],
 )
 
-
-    name="
-    pretty_name="the
-    huggingface_id="EuroEval/
+LIFE_IN_THE_UK_CONFIG = DatasetConfig(
+    name="life-in-the-uk",
+    pretty_name="the English knowledge dataset Life in the UK",
+    huggingface_id="EuroEval/life-in-the-uk",
     task=KNOW,
     languages=[EN],
 )
@@ -86,3 +86,12 @@ BELEBELE_CONFIG = DatasetConfig(
     languages=[EN],
     unofficial=True,
 )
+
+MMLU_CONFIG = DatasetConfig(
+    name="mmlu",
+    pretty_name="the truncated version of the English knowledge dataset MMLU",
+    huggingface_id="EuroEval/mmlu-mini",
+    task=KNOW,
+    languages=[EN],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -76,6 +76,14 @@ NRK_QUIZ_QA_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
 )
 
+IDIOMS_NO_CONFIG = DatasetConfig(
+    name="idioms-no",
+    pretty_name="the Norwegian knowledge dataset Idioms-no",
+    huggingface_id="EuroEval/idioms-no",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
 NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
     name="nor-common-sense-qa",
     pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
euroeval/dataset_configs/portuguese.py
ADDED

@@ -0,0 +1,74 @@
+"""All Portuguese dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import PT
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
+
+### Official datasets ###
+
+SST2_PT_CONFIG = DatasetConfig(
+    name="sst2-pt",
+    pretty_name="the truncated version of the Portuguese sentiment classification "
+    "dataset SST2-pt, translated from the English SST2 dataset",
+    huggingface_id="EuroEval/sst2-pt-mini",
+    task=SENT,
+    languages=[PT],
+    _labels=["positive", "negative"],
+)
+
+
+MMLU_PT_CONFIG = DatasetConfig(
+    name="mmlu-pt",
+    pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-pt-mini",
+    task=KNOW,
+    languages=[PT],
+)
+
+
+GOLDENSWAG_PT_CONFIG = DatasetConfig(
+    name="goldenswag-pt",
+    pretty_name="the truncated version of the Portuguese common-sense reasoning "
+    "dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pt-mini",
+    task=COMMON_SENSE,
+    languages=[PT],
+)
+
+
+SCALA_PT = DatasetConfig(
+    name="scala-pt",
+    pretty_name="the Portuguese part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pt",
+    task=LA,
+    languages=[PT],
+)
+
+HAREM_CONFIG = DatasetConfig(
+    name="harem",
+    pretty_name="the Portuguese named entity recognition dataset HAREM",
+    huggingface_id="EuroEval/harem",
+    task=NER,
+    languages=[PT],
+)
+
+PUBLICO_CONFIG = DatasetConfig(
+    name="publico",
+    pretty_name="the truncated version of the Portuguese summarisation dataset Público",
+    huggingface_id="EuroEval/publico-mini",
+    task=SUMM,
+    languages=[PT],
+)
+
+
+### Unofficial datasets ###
+
+BOOLQ_PT_CONFIG = DatasetConfig(
+    name="boolq-pt",
+    pretty_name="the Portuguese multiple choice reading comprehension dataset "
+    "BoolQ-pt, translated from the English BoolQ dataset",
+    huggingface_id="EuroEval/boolq-pt",
+    task=MCRC,
+    languages=[PT],
+)
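With these configurations registered in dataset_configs/__init__.py above, the new Portuguese datasets should be selectable by the names defined here. A hedged sketch (placeholder model ID; the `dataset` keyword and list form are assumed from EuroEval's usual API, not shown in this diff):

from euroeval import Benchmarker

# Evaluate a model on a subset of the new Portuguese datasets; the dataset
# names come from the configurations above.
Benchmarker().benchmark(
    model="<model-id>", dataset=["sst2-pt", "scala-pt", "harem"]
)
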
euroeval/dataset_configs/spanish.py
CHANGED

@@ -8,7 +8,8 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
     name="sentiment-headlines-es",
-    pretty_name="the truncated version of the Spanish sentiment
+    pretty_name="the truncated version of the Spanish sentiment classification dataset "
+    "SentimentHeadlines",
     huggingface_id="EuroEval/sentiment-headlines-es",
     task=SENT,
     languages=[ES],
@@ -33,7 +34,7 @@ CONLL_ES_CONFIG = DatasetConfig(
 
 MLQA_ES_CONFIG = DatasetConfig(
     name="mlqa-es",
-    pretty_name="the Spanish version of the
+    pretty_name="the Spanish version of the reading comprehension dataset MLQA",
     huggingface_id="EuroEval/mlqa-es",
     task=RC,
     languages=[ES],
@@ -70,7 +71,7 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
 
 XQUAD_ES_CONFIG = DatasetConfig(
     name="xquad-es",
-    pretty_name="the Spanish version of the
+    pretty_name="the Spanish version of the reading comprehension dataset XQuAD",
     huggingface_id="EuroEval/xquad-es",
     task=RC,
     languages=[ES],
euroeval/finetuning.py
CHANGED

@@ -5,7 +5,6 @@ import sys
 import typing as t
 
 import torch
-from datasets import DatasetDict
 from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
@@ -15,7 +14,6 @@ from transformers.trainer_callback import (
 from transformers.trainer_utils import IntervalStrategy
 from transformers.training_args import OptimizerNames, TrainingArguments
 
-from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
@@ -28,14 +26,17 @@ from .utils import (
 )
 
 if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 logger = logging.getLogger("euroeval")
 
 
 def finetune(
-    model: BenchmarkModule,
-    datasets: list[DatasetDict],
+    model: "BenchmarkModule",
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -155,9 +156,9 @@ def finetune(
 
 
 def finetune_single_iteration(
-    model: BenchmarkModule | None,
-    dataset: DatasetDict,
-    training_args: TrainingArguments,
+    model: "BenchmarkModule | None",
+    dataset: "DatasetDict",
+    training_args: "TrainingArguments",
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -254,7 +255,7 @@ def get_training_args(
     iteration_idx: int,
     dtype: DataType,
     batch_size: int | None = None,
-) -> TrainingArguments:
+) -> "TrainingArguments":
     """Get the training arguments for the current iteration.
 
     Args: