EuroEval 15.10.1__py3-none-any.whl → 15.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +22 -26
- euroeval/benchmarker.py +8 -1
- euroeval/callbacks.py +17 -13
- euroeval/cli.py +10 -0
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +9 -40
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +27 -8
- euroeval/human_evaluation.py +14 -13
- euroeval/languages.py +1 -2
- euroeval/metrics.py +452 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
- euroeval-15.12.0.dist-info/RECORD +63 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
- euroeval-15.10.1.dist-info/RECORD +0 -61
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/__init__.py
CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 
+# Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
+# The error comes from the `aiohttp` package, and this environment variable forces the
+# use of `httpx` instead.
+# Link: https://github.com/BerriAI/litellm/issues/11657#issuecomment-3038984975
+os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
+
+
 # Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
 # but XGrammar does not support having a maximal amount of elements in lists
 os.environ["VLLM_USE_V1"] = "0"
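A minimal standalone sketch of the same workaround, assuming (as its placement at package import time suggests) that the variable must be set before LiteLLM creates its first client session; only the ordering is being illustrated:

    import os

    # Force LiteLLM to route requests through httpx instead of aiohttp; set this
    # before anything imports or calls LiteLLM.
    os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"

    import litellm  # noqa: E402  (imported only after the environment is configured)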
euroeval/benchmark_config_factory.py
CHANGED

@@ -42,6 +42,7 @@ def build_benchmark_config(
     num_iterations: int,
     api_base: str | None,
     api_version: str | None,
+    gpu_memory_utilization: float,
     debug: bool,
     run_with_cli: bool,
     only_allow_safetensors: bool,
@@ -102,6 +103,11 @@ def build_benchmark_config(
             model on an inference API.
         api_version:
             The version of the API to use for a given inference API.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
         debug:
             Whether to run the benchmark in debug mode.
         run_with_cli:
@@ -154,6 +160,7 @@ def build_benchmark_config(
         num_iterations=num_iterations,
         api_base=api_base,
         api_version=api_version,
+        gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=run_with_cli,
         only_allow_safetensors=only_allow_safetensors,
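The new option is documented as a vLLM setting, so it presumably maps onto vLLM's parameter of the same name. A minimal sketch of that underlying knob, assuming vLLM is installed; the model id and the 0.8 value are arbitrary illustrations:

    from vllm import LLM

    # Fraction of each GPU's memory that vLLM may reserve for model weights and the
    # KV cache: higher values give faster evaluation, lower values avoid OOM errors.
    llm = LLM(model="mistralai/Mistral-7B-v0.1", gpu_memory_utilization=0.8)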
euroeval/benchmark_modules/base.py
CHANGED

@@ -10,17 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-
-from
-
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
-from ..enums import BatchingPreference, GenerativeType, TaskGroup
+
+from ..enums import TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
 from ..task_group_utils import (
     question_answering,
@@ -28,9 +19,22 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 from ..utils import log_once
 
+if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+        Task,
+    )
+    from ..enums import BatchingPreference, GenerativeType
+    from ..types import ComputeMetricsFunction, ExtractLabelsFunction
+
 logger = logging.getLogger("euroeval")
 
 
@@ -49,14 +53,14 @@ class BenchmarkModule(ABC):
     """
 
     fresh_model: bool
-    batching_preference: BatchingPreference
+    batching_preference: "BatchingPreference"
     high_priority: bool
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the benchmark module.
 
@@ -138,7 +142,7 @@ class BenchmarkModule(ABC):
 
     @property
     @abstractmethod
-    def generative_type(self) -> GenerativeType | None:
+    def generative_type(self) -> "GenerativeType | None":
         """Get the generative type of the model.
 
         Returns:
@@ -177,7 +181,7 @@ class BenchmarkModule(ABC):
         ...
 
     @property
-    def compute_metrics(self) -> ComputeMetricsFunction:
+    def compute_metrics(self) -> "ComputeMetricsFunction":
         """The function used to compute the metrics.
 
         Returns:
@@ -188,13 +192,11 @@ class BenchmarkModule(ABC):
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return partial(
@@ -207,13 +209,11 @@
                     token_classification.compute_metrics,
                     has_misc_tags=self.buffer.get("has_misc_tags", True),
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.QUESTION_ANSWERING:
                 return partial(
                     question_answering.compute_metrics,
                     dataset_config=self.dataset_config,
-                    benchmark_config=self.benchmark_config,
                 )
             case _:
                 raise NotImplementedError(
@@ -222,7 +222,7 @@
 
     @property
     @abstractmethod
-    def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+    def extract_labels_from_generation(self) -> "ExtractLabelsFunction":
         """The function used to extract the labels from the generated output.
 
         Returns:
@@ -241,7 +241,7 @@
         ...
 
     def prepare_datasets(
-        self, datasets: list[DatasetDict], task: Task
+        self, datasets: list[DatasetDict], task: "Task"
     ) -> list[DatasetDict]:
         """Prepare the datasets for the model.
 
@@ -283,7 +283,7 @@
 
     @abstractmethod
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
+        self, dataset: DatasetDict, task: "Task", itr_idx: int
     ) -> DatasetDict:
         """Prepare the dataset for the model.
 
@@ -302,7 +302,7 @@
         """
         ...
 
-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.
 
         Args:
@@ -320,7 +320,7 @@
     @classmethod
     @abstractmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -339,8 +339,8 @@
     @classmethod
     @abstractmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
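The change running through this file (and repeated in fresh.py, hf.py and litellm.py below) is the standard `typing.TYPE_CHECKING` idiom: imports needed only for annotations are moved into a block that type checkers evaluate but the interpreter never runs, and the affected annotations become strings. A minimal self-contained sketch of the idiom, not taken from the package:

    import typing as t

    if t.TYPE_CHECKING:
        # Evaluated by mypy/pyright only; the heavy transformers import is skipped
        # entirely at runtime.
        from transformers.modeling_utils import PreTrainedModel


    def count_parameters(model: "PreTrainedModel") -> int:
        # The quoted annotation is never evaluated at runtime, so the missing runtime
        # import is harmless; type checkers still resolve it via the block above.
        return sum(parameter.numel() for parameter in model.parameters())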
euroeval/benchmark_modules/fresh.py
CHANGED

@@ -1,11 +1,10 @@
 """Freshly initialised encoder models."""
 
 import os
+import typing as t
 from functools import cached_property
 from json import JSONDecodeError
 
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from transformers.models.electra import (
@@ -18,9 +17,8 @@ from transformers.models.xlm_roberta import (
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
-from transformers.tokenization_utils import PreTrainedTokenizer
 
-from ..data_models import
+from ..data_models import ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
 from ..exceptions import (
     InvalidBenchmark,
@@ -35,6 +33,13 @@ from .hf import (
     setup_model_for_question_answering,
 )
 
+if t.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.modeling_utils import PreTrainedModel
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig
+
 
 class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
@@ -43,9 +48,9 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the model.
 
@@ -67,8 +72,8 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
             model_max_length=self.model_max_length,
         )
-        self._model: PreTrainedModel = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "PreTrainedModel" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
 
         self._model, self._tokenizer = align_model_and_tokenizer(
             model=self._model,
@@ -141,7 +146,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -160,8 +165,8 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -190,11 +195,11 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig,
-    dataset_config: DatasetConfig,
-    benchmark_config: BenchmarkConfig,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     model_max_length: int,
-) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 
     Args:
@@ -248,12 +253,19 @@ def load_model_and_tokenizer(
     )
     model_cls = model_cls_mapping[model_id]
 
+    # Special case where there is a mismatch between the labels during training and
+    # testing
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        id2label = {0: "0", 1: "1"}
+    else:
+        id2label = dataset_config.id2label
+
     config = AutoConfig.from_pretrained(
         real_model_id,
         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-        num_labels=
-        id2label=
-        label2id=
+        num_labels=len(id2label),
+        id2label=id2label,
+        label2id={label: id_ for id_, label in id2label.items()},
         cache_dir=model_config.model_cache_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
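For context on the new special case in `load_model_and_tokenizer`: for multiple-choice classification the encoder head is configured with a fixed binary {0: "0", 1: "1"} mapping rather than the dataset's own labels, since the labels seen during finetuning differ from those used at test time. A standalone sketch of just that mapping logic; the helper name and the plain-dict stand-in for `dataset_config` are illustrative only:

    # Hypothetical helper mirroring the id2label branch added in the diff above.
    def build_label_mappings(
        task_group: str, dataset_id2label: dict[int, str]
    ) -> tuple[int, dict[int, str], dict[str, int]]:
        if task_group == "multiple-choice-classification":
            # Fixed binary head used during finetuning, regardless of dataset labels.
            id2label = {0: "0", 1: "1"}
        else:
            id2label = dataset_id2label
        label2id = {label: id_ for id_, label in id2label.items()}
        return len(id2label), id2label, label2id


    num_labels, id2label, label2id = build_label_mappings(
        task_group="multiple-choice-classification",
        dataset_id2label={0: "a", 1: "b", 2: "c", 3: "d"},
    )
    assert (num_labels, label2id) == (2, {"0": 0, "1": 1})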
euroeval/benchmark_modules/hf.py
CHANGED
@@ -24,7 +24,6 @@ from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers.configuration_utils import PretrainedConfig
 from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
@@ -33,8 +32,6 @@ from transformers.modelcard import TASK_MAPPING
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
@@ -45,7 +42,7 @@ from ..constants import (
     MAX_CONTEXT_LENGTH,
     MERGE_TAGS,
 )
-from ..data_models import
+from ..data_models import HFModelInfo, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -67,7 +64,6 @@ from ..task_group_utils (
     token_classification,
 )
 from ..tokenization_utils import get_bos_token, get_eos_token
-from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
@@ -77,6 +73,14 @@ from ..utils import (
 )
 from .base import BenchmarkModule
 
+if t.TYPE_CHECKING:
+    from transformers.configuration_utils import PretrainedConfig
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import BatchEncoding
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+    from ..types import ExtractLabelsFunction
+
 logger = logging.getLogger("euroeval")
 
 
@@ -89,9 +93,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the model.
 
@@ -108,8 +112,8 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-        self._model: PreTrainedModel = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "PreTrainedModel" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
 
         self._model, self._tokenizer = align_model_and_tokenizer(
             model=self._model,
@@ -291,7 +295,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return None
 
     @property
-    def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+    def extract_labels_from_generation(self) -> "ExtractLabelsFunction":
         """The function used to extract the labels from the generated output.
 
         Returns:
@@ -328,7 +332,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
+        self, dataset: DatasetDict, task: "Task", itr_idx: int
     ) -> DatasetDict:
         """Prepare the dataset for the model.
 
@@ -361,7 +365,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             )
             return examples
 
-        def tokenise(examples: dict) -> BatchEncoding:
+        def tokenise(examples: dict) -> "BatchEncoding":
            return self._tokenizer(text=examples["text"], truncation=True, padding=True)
 
         match task.task_group:
@@ -481,7 +485,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -508,8 +512,8 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -556,10 +560,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig,
-    dataset_config: DatasetConfig,
-    benchmark_config: BenchmarkConfig,
-) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.
 
     Args:
@@ -618,7 +622,7 @@ def load_model_and_tokenizer(
     # These are used when a timeout occurs
     attempts_left = 5
 
-    model: PreTrainedModel | None = None
+    model: "PreTrainedModel | None" = None
     while True:
         # Get the model class associated with the task group
         model_cls_or_none: t.Type["PreTrainedModel"] | None = get_class_by_name(
@@ -703,8 +707,8 @@ def load_model_and_tokenizer(
 
 
 def get_model_repo_info(
-    model_id: str, revision: str, benchmark_config: BenchmarkConfig
-) -> HFModelInfo | None:
+    model_id: str, revision: str, benchmark_config: "BenchmarkConfig"
+) -> "HFModelInfo | None":
     """Get the information about the model from the HF Hub or a local directory.
 
     Args:
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -11,7 +11,6 @@ from time import sleep
 
 import litellm
 import ollama
-from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub.errors import (
     HFValidationError,
@@ -31,12 +30,11 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.utils import ChoiceLogprobs
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
-from transformers.trainer import Trainer
 
 from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
 from ..data_models import (
@@ -78,6 +76,11 @@ from ..utils import (
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
 
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from litellm.types.utils import ModelResponse
+    from transformers.trainer import Trainer
+
 logger = logging.getLogger("euroeval")
 
 
@@ -140,18 +143,15 @@ NUM_PARAMS_MAPPING = {
 
 ALLOWED_PARAMS = {
     # OpenAI models
-    r"
-    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
-    r"(anthropic/)?claude-3-
-    r"(anthropic/)?claude-
-    r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
+    r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
+    r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
-    r"(gemini/)?gemini
+    r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
+    r"(gemini/)?gemini-2.5-flash-[0-9].*": ["no-thinking", "thinking"],
     # xAI models
-    r"(xai/)?grok-
-    r"(xai/)?grok-3(-fast)?(-beta)?": [],
-    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
+    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
 
 
@@ -170,18 +170,6 @@ class LiteLLMModel(BenchmarkModule):
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False
 
-    _handleable_exceptions = (
-        BadRequestError,
-        RateLimitError,
-        APIError,
-        APIConnectionError,
-        Timeout,
-        ServiceUnavailableError,
-        InternalServerError,
-        SystemError,
-        AuthenticationError,
-    )
-
     def __init__(
         self,
         model_config: ModelConfig,
@@ -240,6 +228,8 @@ class LiteLLMModel(BenchmarkModule):
             )
         elif self.model_config.revision in {"thinking"}:
             type_ = GenerativeType.REASONING
+        elif self.model_config.revision in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
@@ -370,7 +360,13 @@ class LiteLLMModel(BenchmarkModule):
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.revision
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(type="disabled", budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"low", "medium", "high"}:
             generation_kwargs["reasoning_effort"] = self.model_config.revision
             log_once(
                 f"Enabling reasoning effort {self.model_config.revision!r} for model "
@@ -381,7 +377,7 @@
         # Drop generation kwargs that are not supported by the model
         litellm.drop_params = True
 
-        all_responses: dict[int, ModelResponse] = {}
+        all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
         )
@@ -477,6 +473,10 @@ class LiteLLMModel(BenchmarkModule):
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
+        thinking_budget_pattern = re.compile(
+            r"the thinking budget [0-9]+ is invalid. please choose a value between "
+            r"[0-9]+ and ([0-9]+)\."
+        )
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -537,6 +537,26 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs["response_format"] = dict(type="json_object")
             return
+        elif thinking_match := thinking_budget_pattern.search(string=error_msg):
+            thinking_budget = int(thinking_match.group(1))
+            if thinking_budget >= REASONING_MAX_TOKENS:
+                raise InvalidBenchmark(
+                    f"The model {model_id!r} has an upper thinking budget of "
+                    f"{thinking_budget:,} tokens, which is within the limit of "
+                    f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
+                    f"error message was: {error_msg}."
+                )
+            log_once(
+                f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
+                "for reasoning, which is less than the default of "
+                f"{REASONING_MAX_TOKENS:,} tokens. Setting the thinking budget to "
+                f"{thinking_budget:,} tokens.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=thinking_budget - 1
+            )
+            return
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
@@ -581,7 +601,7 @@ class LiteLLMModel(BenchmarkModule):
         model_id: str,
         conversations: list[list[litellm.AllMessageValues]],
         **generation_kwargs,
-    ) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
+    ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.
 
         Args:
@@ -641,7 +661,7 @@ class LiteLLMModel(BenchmarkModule):
 
     @staticmethod
     def _create_model_output(
-        model_responses: list[ModelResponse], model_id: str
+        model_responses: list["ModelResponse"], model_id: str
    ) -> GenerativeModelOutput:
         """Create a GenerativeModelOutput object from a list of ModelResponse objects.
 
@@ -1123,8 +1143,8 @@ class LiteLLMModel(BenchmarkModule):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: Task, itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.
 
         This includes things like tokenisation.
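The new error-handling branch above recovers the provider's maximum thinking budget from the error message and retries with a budget just below it. A standalone sketch of only the regex extraction; the error string below is invented for illustration:

    import re

    # Same pattern as in the diff above.
    thinking_budget_pattern = re.compile(
        r"the thinking budget [0-9]+ is invalid. please choose a value between "
        r"[0-9]+ and ([0-9]+)\."
    )

    # Made-up example of the provider error this is meant to parse.
    error_msg = (
        "the thinking budget 10000 is invalid. please choose a value between "
        "1024 and 8192."
    )

    if match := thinking_budget_pattern.search(error_msg):
        # The provider's reported maximum; the handler above then retries with
        # budget_tokens set to this value minus one.
        thinking_budget = int(match.group(1))
        print(thinking_budget)  # prints 8192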