EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/finetuning.py
CHANGED
@@ -1,12 +1,12 @@
 """Functions related to the finetuning of models."""

+import collections.abc as c
 import logging
 import sys
 import typing as t
 from functools import partial

 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -18,13 +18,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -32,16 +28,14 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

-logger = logging.getLogger("euroeval")
-

 def finetune(
     model: "BenchmarkModule",
-    datasets:
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) ->
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through finetuning.

     Args:
@@ -58,6 +52,10 @@ def finetune(

     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -70,7 +68,7 @@ def finetune(

     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -80,7 +78,7 @@ def finetune(
         model_already_initialized = idx == 0

         # Run a loop here to deal with automatic reduction of batch size
-
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -112,7 +110,10 @@
                 )

                 scores.append(itr_scores)
-
+                log(
+                    f"Test scores for iteration {idx}: {itr_scores}",
+                    level=logging.DEBUG,
+                )

                 break

@@ -123,9 +124,10 @@
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
@@ -151,7 +153,12 @@
                 model_already_initialized = False

                 bs //= 2
-
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )

     return scores

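The recurring change across these hunks is that the module-level logging.getLogger("euroeval") logger and the direct tqdm progress bar are replaced by helpers from the new euroeval/logging_utils.py module (log, log_once, get_pbar, block_terminal_output), whose implementation (+250 lines) is not part of this diff. A minimal sketch of what the two helpers used here could look like, with both signatures inferred from the call sites above rather than taken from the real module:

# Hypothetical sketch only: the real euroeval/logging_utils.py is not shown in
# this diff, so these signatures are assumptions inferred from the call sites
# log(msg, level=...) and get_pbar(iterable=..., desc=..., disable=...).
import logging
import typing as t

from tqdm.auto import tqdm

_logger = logging.getLogger("euroeval")  # assumed: one shared project logger


def log(message: str, level: int = logging.INFO) -> None:
    """Send a message to the shared 'euroeval' logger at the given level."""
    _logger.log(level, message)


def get_pbar(iterable: t.Iterable, **tqdm_kwargs: t.Any) -> tqdm:
    """Wrap an iterable in a tqdm progress bar, forwarding keyword arguments."""
    return tqdm(iterable=iterable, **tqdm_kwargs)

Centralising logging and progress bars in one module lets verbosity and progress-bar behaviour be adjusted in a single place instead of configuring a logger in every file, which is consistent with the per-file logger lines being deleted here.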
euroeval/generation.py
CHANGED
@@ -1,5 +1,6 @@
 """Functions related to text generation of models."""

+import collections.abc as c
 import logging
 import sys
 import typing as t
@@ -11,12 +12,13 @@ from tqdm.auto import tqdm

 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory
+from .utils import clear_memory

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -29,16 +31,14 @@ if t.TYPE_CHECKING:
         ModelConfig,
     )

-logger = logging.getLogger("euroeval")
-

 def generate(
     model: "BenchmarkModule",
-    datasets:
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) ->
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through generation.

     Args:
@@ -78,7 +78,7 @@ def generate(
     )

     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
         iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -90,7 +90,7 @@ def generate(
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
         scores.append(test_scores)
         clear_memory()

@@ -142,14 +142,14 @@ def generate_single_iteration(
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr =
+            itr = get_pbar(iterable=non_cached_dataset)
         case BatchingPreference.ALL_AT_ONCE:
             itr = [non_cached_dataset[:]]
         case _:
             num_batches = len(non_cached_dataset) // benchmark_config.batch_size
             if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                 num_batches += 1
-            itr =
+            itr = get_pbar(
                 iterable=mit.batched(
                     iterable=non_cached_dataset, n=benchmark_config.batch_size
                 ),
@@ -254,7 +254,7 @@ def generate_single_iteration(
 def debug_log(
     batch: dict[str, t.Any],
     model_output: "GenerativeModelOutput",
-    extracted_labels:
+    extracted_labels: c.Sequence[dict | str | c.Sequence[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
     """Log inputs and outputs for debugging purposes.
@@ -297,7 +297,7 @@ def debug_log(
                 + "\n"
                 + "\t".join(labels)
             )
-
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return

         case (
@@ -332,7 +332,7 @@ def debug_log(
     else:
         input_texts = batch["text"]

-    metadata_keys:
+    metadata_keys: c.Sequence[str] = [
         key
         for key in batch.keys()
         if key not in ["text", "messages", "label", "labels", "target_text"]
@@ -347,6 +347,7 @@ def debug_log(
         if labels[idx]:
            data_to_log["Label"] = labels[idx]
        data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
-
-            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
+        log(
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+            level=logging.DEBUG,
         )
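In generate_single_iteration, the default batching branch computes the number of batches with floor division plus a remainder correction before wrapping the batched iterator in a progress bar. A self-contained illustration of that counting logic, with a plain list standing in for the real dataset and "mit" assumed to be the conventional "import more_itertools as mit" alias:

# Illustration of the batch-count logic from the `case _` branch above; the
# dataset here is a stand-in list, not a Hugging Face dataset.
import more_itertools as mit

non_cached_dataset = list(range(10))  # stand-in for the non-cached split
batch_size = 4

num_batches = len(non_cached_dataset) // batch_size
if len(non_cached_dataset) % batch_size != 0:
    num_batches += 1  # one extra, smaller batch for the leftover samples

batches = list(mit.batched(iterable=non_cached_dataset, n=batch_size))
assert len(batches) == num_batches  # 3 batches: (0..3), (4..7), (8, 9)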
euroeval/generation_utils.py
CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to generative models."""

+import collections.abc as c
 import itertools as it
 import json
 import logging
@@ -9,8 +10,9 @@ import typing as t

 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels
+from .utils import extract_multiple_choice_labels

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -18,15 +20,13 @@ if t.TYPE_CHECKING:

     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

-logger = logging.getLogger("euroeval")
-

 def extract_few_shot_examples(
     dataset: "DatasetDict",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
     itr_idx: int,
-) ->
+) -> c.Sequence[dict[str, t.Any]]:
     """Extract few-shot examples from a dataset.

     This will always extract the examples from the training split.
@@ -79,7 +79,7 @@ def extract_few_shot_examples(
                 lambda example: len(example["text"]) < max_num_tokens
             )
             num_short_examples = len(train_with_short_examples)
-            if num_short_examples >=
+            if num_short_examples >= num_few_shots:
                 break
         else:
             raise InvalidBenchmark(
@@ -144,7 +144,7 @@ def extract_few_shot_examples(
                 lambda example: len(example["context"]) < max_num_tokens
             )
             num_short_examples = len(train_with_short_examples)
-            if num_short_examples >=
+            if num_short_examples >= num_few_shots:
                 break
         else:
             raise InvalidBenchmark(
@@ -171,7 +171,7 @@ def extract_few_shot_examples(

 def apply_prompt(
     examples: dict[str, t.Any],
-    few_shot_examples:
+    few_shot_examples: c.Sequence[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     generative_type: GenerativeType | None,
@@ -432,7 +432,7 @@ def apply_prompt(


 def raise_if_wrong_params(
-    model_config: "ModelConfig", allowed_params: dict[re.Pattern,
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, c.Sequence[str]]
 ) -> None:
     """Raise an error if the model configuration has invalid parameters.

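Most of the annotation churn across all three files follows the same pattern: parameter and return types are written with collections.abc generics imported as c (c.Sequence[...]), which accepts any sequence rather than committing callers to a concrete list. A small, self-contained example of that style; the helper below is invented for illustration and is not EuroEval code:

# Illustrative only: mirrors the "import collections.abc as c" annotation style
# adopted in these hunks. score_summary is a made-up helper, not EuroEval code.
import collections.abc as c


def score_summary(scores: c.Sequence[dict[str, float]]) -> dict[str, float]:
    """Average each metric over a sequence of per-iteration score dicts."""
    totals: dict[str, float] = {}
    for score in scores:
        for metric, value in score.items():
            totals[metric] = totals.get(metric, 0.0) + value
    return {metric: total / len(scores) for metric, total in totals.items()}


# Both lists and tuples satisfy c.Sequence, so either works at the call site:
print(score_summary([{"f1": 0.5}, {"f1": 0.7}]))  # {'f1': 0.6}
print(score_summary(({"f1": 1.0},)))              # {'f1': 1.0}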