EuroEval 15.4.2__py3-none-any.whl → 15.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic. See the registry's release page for more details.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/hf.py +11 -2
- euroeval/benchmark_modules/litellm.py +204 -74
- euroeval/benchmark_modules/vllm.py +59 -34
- euroeval/benchmarker.py +35 -6
- euroeval/constants.py +8 -1
- euroeval/data_models.py +1 -2
- euroeval/dataset_configs.py +1 -1
- euroeval/task_utils/sequence_classification.py +44 -9
- euroeval/utils.py +100 -4
- {euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/METADATA +5 -4
- {euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/RECORD +15 -15
- {euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -4,6 +4,7 @@
### Block unwanted terminal output that happens on importing external modules ###

import logging
+ import os
import sys
import warnings

@@ -14,7 +15,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("httpx").setLevel(logging.CRITICAL)
logging.getLogger("datasets").setLevel(logging.CRITICAL)
logging.getLogger("vllm").setLevel(logging.CRITICAL)
-
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

# Set up logging
fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -29,7 +30,6 @@ logging.basicConfig(
### Set the rest up ###

import importlib.metadata # noqa: E402
- import os # noqa: E402

from dotenv import load_dotenv # noqa: E402

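The `__init__.py` change above does two things: it moves `import os` up into the output-suppression block and sets `VLLM_CONFIGURE_LOGGING` to "0" so that vLLM does not install its own logging configuration when it is later imported. A minimal sketch of the same pattern outside EuroEval (assuming vLLM is installed; the variable has to be set before vLLM is first imported):

    import logging
    import os

    # vLLM checks this variable when it is imported, so set it before `import vllm`.
    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
    logging.getLogger("vllm").setLevel(logging.CRITICAL)

    try:
        import vllm  # noqa: F401
    except ImportError:
        # vLLM is an optional dependency (the "generative" extra), so this is fine.
        pass
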
euroeval/benchmark_modules/hf.py
CHANGED
@@ -44,6 +44,7 @@ from ..constants import (
DUMMY_FILL_VALUE,
GENERATIVE_PIPELINE_TAGS,
LOCAL_MODELS_REQUIRED_FILES,
+ MAX_CONTEXT_LENGTH,
MERGE_TAGS,
)
from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
@@ -245,6 +246,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
max_length for max_length in all_max_lengths if max_length >= 128
]

+ # We remove the upper cap of maximum context length for the model, as it is
+ # highly unlikely that this is the model's actual maximum context length - we
+ # would rather not report a value than report an incorrect one.
+ all_max_lengths = [
+ max_length
+ for max_length in all_max_lengths
+ if max_length != MAX_CONTEXT_LENGTH
+ ]
+
if len(list(all_max_lengths)) > 0:
model_max_length = min(list(all_max_lengths))
else:
@@ -1140,8 +1150,7 @@ def align_model_and_tokenizer(
Returns:
The fixed model and tokenizer.
"""
-
- model_max_length = min(model_max_length, 5_000)
+ model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)

if model_max_length > 0:
tokenizer.model_max_length = model_max_length

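To see the intent of the `hf.py` hunk in isolation: the benchmark-wide cap (5,000 tokens in 15.5.0) is filtered out of the candidate maximum lengths before the smallest one is reported, so the cap is never mistaken for the model's own limit. A standalone sketch, with the constant inlined and a `-1` fallback standing in for the "unknown" branch that the diff does not show:

    MAX_CONTEXT_LENGTH = 5_000  # mirrors euroeval/constants.py in 15.5.0

    def resolve_model_max_length(all_max_lengths: list[int]) -> int:
        """Pick the smallest plausible maximum length, ignoring the benchmark cap."""
        candidates = [length for length in all_max_lengths if length >= 128]
        # The cap is almost certainly not the model's real maximum context length,
        # so a value equal to it is dropped rather than reported.
        candidates = [length for length in candidates if length != MAX_CONTEXT_LENGTH]
        return min(candidates) if candidates else -1

    print(resolve_model_max_length([5_000, 8_192]))  # -> 8192
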
euroeval/benchmark_modules/litellm.py
CHANGED
@@ -27,20 +27,17 @@ from litellm.exceptions import (
BadRequestError,
InternalServerError,
NotFoundError,
+ RateLimitError,
ServiceUnavailableError,
Timeout,
)
+ from litellm.llms.vertex_ai.common_utils import VertexAIError
from litellm.types.utils import ModelResponse
from requests.exceptions import RequestException
from tqdm.auto import tqdm
from transformers import Trainer

- from ..constants import (
- MAX_LOGPROBS,
- REASONING_MAX_TOKENS,
- TASK_GROUPS_USING_LOGPROBS,
- TASKS_USING_JSON,
- )
+ from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
from ..data_models import (
BenchmarkConfig,
DatasetConfig,
@@ -69,7 +66,7 @@ from ..task_utils import (
token_classification,
)
from ..types import ExtractLabelsFunction
- from ..utils import create_model_cache_dir, log_once
+ from ..utils import create_model_cache_dir, get_first_label_token_mapping, log_once
from .base import BenchmarkModule
from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer

@@ -78,64 +75,80 @@ logger = logging.getLogger("euroeval")

VOCAB_SIZE_MAPPING = {
# OpenAI models
- "(
- "
- "gpt-
- "gpt-4-(
- "gpt-
- "gpt-
- "
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
- "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
- "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
+ r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
+ r"gpt-4-[0-9]{4}-preview": 100_256,
+ r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
+ r"gpt-4-(vision|turbo)(-preview)?": 100_256,
+ r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
+ r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
# Anthropic models
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
+ r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
+ # Gemini models
+ r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
+ # xAI models
+ r"(xai/)?grok.*": -1,
}


MODEL_MAX_LENGTH_MAPPING = {
# OpenAI models
- "
- "
- "
- "gpt-
- "gpt-
- "gpt-3.5-turbo-
- "gpt-
- "
- "
- "
- "gpt-4-(vision|turbo)(-preview)?": 128_000,
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
- "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
- "o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
- "o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
- "o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+ r"gpt-4(-[0-9]{4})?": 8_191,
+ r"gpt-4-32k(-[0-9]{4})?": 32_767,
+ r"gpt-4-[0-9]{4}-preview": 128_000,
+ r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
+ r"gpt-4-(vision|turbo)(-preview)?": 128_000,
+ r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
+ r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
+ r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
+ r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+ r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
# Anthropic models
- "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+ r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+ # Gemini models
+ r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
+ r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
+ r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
+ # xAI models
+ r"(xai/)?grok.*": 131_072,
}


NUM_PARAMS_MAPPING = {
# OpenAI models
- "
- "(
- "(text-)?curie(-001)?": 13_000_000_000,
- "((text|code)-)?davinci(-00[1-9])?": 175_000_000_000,
- "gpt-(3.5|4)-turbo-((16|32)k)?(-[0-9]{4})?": -1,
- "gpt-4-[0-9]{4}-preview": -1,
- "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
- "gpt-4-(vision|turbo)(-preview)?": -1,
- "gpt-3.5-turbo-instruct(-[0-9]{4})?": -1,
- "gpt-4o(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
- "gpt-4o-mini(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
- "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
+ r"gpt-4.*": -1,
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
# Anthropic models
- "
+ r"(anthropic/)?claude-*": -1,
+ # Gemini models
+ r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
+ r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
+ r"(gemini/)?gemini-2.(0|5).*": -1,
+ # xAI models
+ r"(xai/)?grok.*": -1,
}


-
+ ALLOWED_PARAMS = {
+ # OpenAI models
+ r"gpt-4.*": [],
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
+ # Anthropic models
+ r"(anthropic/)?claude-3-.*": [],
+ r"(anthropic/)?claude-3.5-.*": [],
+ r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
+ # Gemini models
+ r"(gemini/)?gemini-.*": [],
+ # xAI models
+ r"(xai/)?grok.*": [],
+ }
+
+
+ REASONING_MODELS = [
+ r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
+ r"(gemini/)?gemini.*thinking.*",
+ r"(gemini/)?gemini-2.5-pro.*",
+ ]


class LiteLLMModel(BenchmarkModule):
@@ -167,12 +180,18 @@ class LiteLLMModel(BenchmarkModule):
"ollama/"
) or model_config.model_id.startswith("ollama_chat/")

+ raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
+
super().__init__(
model_config=model_config,
dataset_config=dataset_config,
benchmark_config=benchmark_config,
)

+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+ dataset_config=self.dataset_config, tokenizer=None
+ )
+
@property
def generative_type(self) -> GenerativeType | None:
"""Get the generative type of the model.
@@ -180,7 +199,9 @@ class LiteLLMModel(BenchmarkModule):
Returns:
The generative type of the model, or None if it has not been set yet.
"""
- if
+ if self.model_config.revision == "thinking":
+ return GenerativeType.REASONING
+ elif re.fullmatch(
pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
):
return GenerativeType.REASONING
@@ -218,7 +239,13 @@ class LiteLLMModel(BenchmarkModule):
api_version=self.benchmark_config.api_version,
)

-
+ # Get the mapping from labels to the first token in the label. We call this each
+ # time we generate a new dataset since the dataset config can change
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+ dataset_config=self.dataset_config, tokenizer=None
+ )
+
+ if self.buffer["first_label_token_mapping"]:
generation_kwargs["logprobs"] = True
generation_kwargs["top_logprobs"] = MAX_LOGPROBS

@@ -227,6 +254,27 @@ class LiteLLMModel(BenchmarkModule):
"Prompt must contain 'json' for JSON tasks."
)
generation_kwargs["response_format"] = dict(type="json_object")
+ log_once(
+ "Enabling JSON response format for model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+
+ if self.model_config.revision == "thinking":
+ generation_kwargs["thinking"] = dict(
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS
+ )
+ log_once(
+ f"Enabling thinking mode for model {self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+ elif self.model_config.revision in {"low", "high"}:
+ generation_kwargs["reasoning_effort"] = self.model_config.revision
+ log_once(
+ f"Enabling reasoning effort {self.model_config.revision!r} for model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )

# This drops generation kwargs that are not supported by the model
litellm.drop_params = True
@@ -235,39 +283,60 @@ class LiteLLMModel(BenchmarkModule):
# handle using newlines as stop sequences, so we try both.
num_attempts = 10
for _ in range(num_attempts):
+ stop_messages = ["stop_sequences"]
+ logprobs_messages = [
+ "you are not allowed to request logprobs",
+ "you've reached the maximum number of requests with logprobs",
+ "logprobs is not supported",
+ "logprobs is not enabled",
+ ]
+ temperature_messages = [
+ "'temperature' is not supported with this model.",
+ "temperature is not supported with this model",
+ ]
try:
model_response = litellm.completion(
messages=messages, max_retries=3, **generation_kwargs
)
break
- except BadRequestError as e:
- if
+ except (BadRequestError, RateLimitError) as e:
+ if any(msg.lower() in str(e).lower() for msg in stop_messages):
generation_kwargs["stop"] = None
- elif "you are not allowed to request logprobs" in str(e).lower():
- generation_kwargs.pop("logprobs")
- generation_kwargs.pop("top_logprobs")
elif (
-
+ any(msg.lower() in str(e).lower() for msg in logprobs_messages)
+ # Special case for Vertex AI models, since they have strict rate
+ # limits on using logprobs. They also have a cap of 5 logprobs, but
+ # we ignore this since the rate limiting makes it unusable anyway.
+ or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
):
+ generation_kwargs.pop("logprobs")
+ generation_kwargs.pop("top_logprobs")
+ elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
generation_kwargs.pop("temperature")
+ elif isinstance(e, RateLimitError):
+ raise InvalidModel(
+ "You have encountered your rate limit for model "
+ f"{self.model_config.model_id!r}. The error message was: {e}"
+ )
else:
raise InvalidBenchmark(
f"Failed to generate text. The error message was: {e}"
)
+ except APIError as e:
+ raise InvalidBenchmark(
+ f"Failed to generate text. The error message was: {e}"
+ )
except (
+ APIConnectionError,
Timeout,
ServiceUnavailableError,
- APIConnectionError,
InternalServerError,
- ):
+ ) as e:
logger.debug(
- "Service temporarily unavailable.
+ f"Service temporarily unavailable. The error message was: {e}. "
+ f"Retrying in 5 seconds..."
)
sleep(5)
- except APIError as e:
- raise InvalidBenchmark(
- f"Failed to generate text. The error message was: {e}"
- )
except AuthenticationError:
raise NeedsAdditionalArgument(
cli_argument="--api-key",
@@ -280,6 +349,15 @@ class LiteLLMModel(BenchmarkModule):
)

assert isinstance(model_response, ModelResponse)
+ if not model_response.choices:
+ # This happens for reasoning models, when they don't finish thinking and run
+ # out of tokens. Happens quite rarely, but we need to handle it.
+ logger.warning(
+ f"The model {self.model_config.model_id!r} did not end up generating "
+ "any text. This is likely because the model ran out of tokens while "
+ "reasoning. Returning an empty string."
+ )
+ return GenerativeModelOutput(sequences=[""])
model_response_choices = model_response.choices[0]
assert isinstance(model_response_choices, litellm.Choices)
generation_output = model_response_choices.message["content"] or ""
@@ -314,7 +392,7 @@ class LiteLLMModel(BenchmarkModule):
# If it is an Ollama model then we can get the number of parameters from the
# Ollama Python SDK
if self.is_ollama:
- ollama_model_id = self.model_config.model_id.split("/")[
+ ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
model_info = ollama.show(ollama_model_id).modelinfo
if model_info is not None:
num_params = model_info.get("general.parameter_count")
@@ -334,7 +412,7 @@ class LiteLLMModel(BenchmarkModule):
num_labels=self.dataset_config.num_labels,
id2label=self.dataset_config.id2label,
label2id=self.dataset_config.label2id,
- revision=
+ revision="main",
model_cache_dir=self.model_config.model_cache_dir,
api_key=self.benchmark_config.api_key,
trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -345,7 +423,7 @@ class LiteLLMModel(BenchmarkModule):
try:
repo_info = hf_api.model_info(
repo_id=model_id,
- revision=
+ revision="main",
token=os.getenv("HUGGINGFACE_API_KEY")
or self.benchmark_config.api_key
or True,
@@ -398,7 +476,7 @@ class LiteLLMModel(BenchmarkModule):
num_labels=self.dataset_config.num_labels,
id2label=self.dataset_config.id2label,
label2id=self.dataset_config.label2id,
- revision=
+ revision="main",
model_cache_dir=self.model_config.model_cache_dir,
api_key=self.benchmark_config.api_key,
trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -442,7 +520,7 @@ class LiteLLMModel(BenchmarkModule):
# If it is an Ollama model then we can get the maximum length from the Ollama
# Python SDK
if self.is_ollama:
- ollama_model_id = self.model_config.model_id.split("/")[
+ ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
model_info = ollama.show(ollama_model_id).modelinfo
if model_info is not None:
context_length_keys = [
@@ -478,7 +556,7 @@ class LiteLLMModel(BenchmarkModule):
num_labels=self.dataset_config.num_labels,
id2label=self.dataset_config.id2label,
label2id=self.dataset_config.label2id,
- revision=
+ revision="main",
model_cache_dir=self.model_config.model_cache_dir,
api_key=self.benchmark_config.api_key,
trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -563,6 +641,7 @@ class LiteLLMModel(BenchmarkModule):
return partial(
sequence_classification.extract_labels_from_generation,
dataset_config=self.dataset_config,
+ first_label_token_mapping=self.buffer["first_label_token_mapping"],
)
case TaskGroup.TEXT_TO_TEXT:
return text_to_text.extract_labels_from_generation
@@ -605,12 +684,13 @@ class LiteLLMModel(BenchmarkModule):
Whether the model exists, or an error describing why we cannot check
whether the model exists.
"""
+ model_id, _ = model_id.split("@") if "@" in model_id else (model_id, "main")
if model_id in litellm.model_list:
return True

# If it is an Ollama model then try to download it
if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
- ollama_model_id = model_id.split("/")[
+ ollama_model_id = "/".join(model_id.split("/")[1:])
downloaded_ollama_models: list[str] = [
model_obj.model
for model_obj in ollama.list().models
@@ -657,12 +737,29 @@ class LiteLLMModel(BenchmarkModule):
api_version=benchmark_config.api_version,
)
return True
+ except (
+ APIConnectionError,
+ Timeout,
+ ServiceUnavailableError,
+ InternalServerError,
+ ) as e:
+ logger.debug(
+ f"Service temporarily unavailable. The error message was: {e}. "
+ "Retrying in 10 seconds..."
+ )
+ sleep(5)
+ except RateLimitError:
+ logger.warning(
+ f"Rate limit exceeded for model {model_id!r}. Retrying in 10 "
+ "seconds..."
+ )
+ sleep(10)
except APIError as e:
if "'503 Service Unavailable" not in str(e):
raise e
logger.warning(
- f"Failed to check if model {model_id!r} exists. Retrying in "
-
+ f"Failed to check if model {model_id!r} exists. Retrying in 10 "
+ "seconds..."
)
sleep(10)
except (BadRequestError, NotFoundError):
@@ -708,9 +805,10 @@ class LiteLLMModel(BenchmarkModule):
Returns:
The model configuration.
"""
+ model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
return ModelConfig(
model_id=model_id,
- revision=
+ revision=revision,
task="text-generation",
languages=list(),
merge=False,
@@ -1025,3 +1123,35 @@ class LiteLLMModel(BenchmarkModule):

examples["messages"] = messages_list
return examples
+
+
+ def raise_if_wrong_params(
+ model_config: ModelConfig, allowed_params: dict[str, list[str]]
+ ) -> None:
+ """Raise an error if the model configuration has invalid parameters.
+
+ Args:
+ model_config:
+ The model configuration.
+ allowed_params:
+ The allowed parameters for the model.
+
+ Raises:
+ InvalidModel:
+ If the model configuration has invalid parameters.
+ """
+ param = model_config.revision
+ if param == "":
+ return
+ for model_regex, allowed_params_list in allowed_params.items():
+ if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+ if param not in allowed_params_list:
+ msg = (
+ f"Invalid parameter {param!r} for model {model_config.model_id!r}."
+ )
+ if allowed_params_list:
+ msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
+ else:
+ msg += " No parameters are allowed."
+ raise InvalidModel(msg)
+ return

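The new `litellm.py` plumbing hangs together as follows: a model ID may carry a parameter after an `@` sign (for example a reasoning-effort level or a "thinking" switch), the parameter is stored in the config's `revision` field, and `raise_if_wrong_params` rejects parameters that a matching model pattern does not allow. A self-contained sketch of that flow; the patterns, allowed values and error type here are simplified stand-ins rather than the exact EuroEval ones:

    import re

    ALLOWED_PARAMS = {
        r"o[1-9].*": ["low", "high"],                       # reasoning effort levels
        r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
    }

    def split_model_id(raw: str) -> tuple[str, str]:
        """Split "model@param" into (model_id, param); param is empty when absent."""
        model_id, _, param = raw.partition("@")
        return model_id, param

    def check_param(model_id: str, param: str) -> None:
        """Reject a parameter that the matching model pattern does not allow."""
        if not param:
            return
        for pattern, allowed in ALLOWED_PARAMS.items():
            if re.fullmatch(pattern, model_id) and param not in allowed:
                raise ValueError(f"Invalid parameter {param!r} for model {model_id!r}")

    model_id, param = split_model_id("o3-mini@high")
    check_param(model_id, param)  # passes; "o3-mini@medium" would raise
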
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -25,10 +25,10 @@ from urllib3.exceptions import RequestError

from ..constants import (
GENERATIVE_PIPELINE_TAGS,
+ MAX_CONTEXT_LENGTH,
MAX_LOGPROBS,
MERGE_TAGS,
REASONING_MAX_TOKENS,
- TASK_GROUPS_USING_LOGPROBS,
TASKS_USING_JSON,
VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
)
@@ -66,6 +66,7 @@ from ..utils import (
get_bos_token,
get_end_of_chat_token_ids,
get_eos_token,
+ get_first_label_token_mapping,
get_min_cuda_compute_capability,
log_once,
should_prompts_be_stripped,
@@ -122,11 +123,8 @@ class VLLMModel(HuggingFaceEncoderModel):
):
raise NeedsExtraInstalled(extra="generative")

- output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
model, tokenizer = load_model_and_tokenizer(
- model_config=model_config,
- benchmark_config=benchmark_config,
- output_scores=output_scores,
+ model_config=model_config, benchmark_config=benchmark_config
)
self._model: LLM = model
self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -142,8 +140,12 @@ class VLLMModel(HuggingFaceEncoderModel):
benchmark_config=benchmark_config,
)

- self.buffer
-
+ self.buffer |= dict(
+ instruction_model=self._tokenizer.chat_template is not None,
+ first_label_token_mapping=get_first_label_token_mapping(
+ dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ ),
+ )
if self.model_config.adapter_base_model_id is not None:
adapter_path = snapshot_download(
repo_id=self.model_config.model_id,
@@ -185,6 +187,7 @@ class VLLMModel(HuggingFaceEncoderModel):
return partial(
sequence_classification.extract_labels_from_generation,
dataset_config=self.dataset_config,
+ first_label_token_mapping=self.buffer["first_label_token_mapping"],
)
case TaskGroup.TEXT_TO_TEXT:
return text_to_text.extract_labels_from_generation
@@ -338,6 +341,12 @@ class VLLMModel(HuggingFaceEncoderModel):
else:
logits_processor = None

+ # Get the mapping from labels to the first token in the label. We call this each
+ # time we generate a new dataset since the dataset config can change
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+ dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ )
+
# Define the parameters used for vLLM generation
max_tokens: int = (
REASONING_MAX_TOKENS
@@ -346,7 +355,7 @@ class VLLMModel(HuggingFaceEncoderModel):
)
sampling_params = SamplingParams(
max_tokens=max_tokens,
- logprobs=MAX_LOGPROBS if self.buffer["
+ logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
temperature=0.0,
stop=[stop_token for stop_token in stop_tokens if stop_token],
logits_processors=[logits_processor] if logits_processor else None,
@@ -416,7 +425,7 @@ class VLLMModel(HuggingFaceEncoderModel):
completions = [completion.strip() for completion in completions]

# Add logprobs scores to the output
- if self.buffer["
+ if self.buffer["first_label_token_mapping"]:
scores: list[list[list[tuple[str, float]]]] = [
[
[
@@ -846,7 +855,7 @@ class VLLMModel(HuggingFaceEncoderModel):


def load_model_and_tokenizer(
- model_config: ModelConfig, benchmark_config: BenchmarkConfig
+ model_config: ModelConfig, benchmark_config: BenchmarkConfig
) -> "tuple[LLM, PreTrainedTokenizer]":
"""Load the model and tokenizer.

@@ -855,11 +864,9 @@ def load_model_and_tokenizer(
The model configuration.
benchmark_config:
The benchmark configuration.
- output_scores:
- Whether to output scores.

Returns:
-
+ A pair (model, tokenizer), with the loaded model and tokenizer
"""
# Prefer base model ID if the model is an adapter - the adapter will be added on
# during inference in this case
@@ -893,7 +900,27 @@ def load_model_and_tokenizer(
if quantization == "awq" and importlib.util.find_spec("awq") is None:
raise NeedsExtraInstalled(extra="quantization")

+ # Start with dtype being the "auto" vLLM dtype
dtype: str | torch.dtype = "auto"
+
+ # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+ if hf_model_config.torch_dtype == torch.float32:
+ if torch.cuda.is_bf16_supported():
+ logger.info(
+ "You are loading a model with dtype FP32, which we will convert to "
+ "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+ "GPU."
+ )
+ dtype = torch.bfloat16
+ else:
+ logger.info(
+ "You are loading a model with dtype FP32, which we will convert to "
+ "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+ "your GPU."
+ )
+ dtype = torch.float16
+
+ # If the model is a quantized model, we need to set the dtype to float16
if quantization is not None and hf_model_config.torch_dtype != torch.float16:
logger.info(
"You are loading a quantized model with dtype "
@@ -902,6 +929,7 @@ def load_model_and_tokenizer(
)
dtype = torch.float16

+ # If the model is a bf16 model, we need to check the CUDA compute capability
if hf_model_config.torch_dtype == torch.bfloat16:
min_cuda_compute_capability = get_min_cuda_compute_capability()
required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
@@ -940,7 +968,17 @@ def load_model_and_tokenizer(
if len(true_max_model_len_candidates) > 0:
true_max_model_len = min(true_max_model_len_candidates)
else:
- true_max_model_len =
+ true_max_model_len = MAX_CONTEXT_LENGTH
+
+ tokenizer = load_tokenizer(
+ model_id=model_config.model_id,
+ revision=model_config.revision,
+ adapter_base_model_id=model_config.adapter_base_model_id,
+ trust_remote_code=benchmark_config.trust_remote_code,
+ model_max_length=true_max_model_len,
+ model_cache_dir=model_config.model_cache_dir,
+ token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+ )

clear_vllm()

@@ -951,7 +989,7 @@ def load_model_and_tokenizer(
model=model_id,
tokenizer=model_id,
gpu_memory_utilization=0.95,
- max_model_len=min(true_max_model_len,
+ max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
download_dir=download_dir,
trust_remote_code=benchmark_config.trust_remote_code,
revision=revision,
@@ -962,7 +1000,6 @@ def load_model_and_tokenizer(
quantization=quantization,
dtype=dtype,
enforce_eager=True,
- max_logprobs=MAX_LOGPROBS if output_scores else None,
# TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
# so we disable it for now
enable_prefix_caching=False,
@@ -988,16 +1025,6 @@ def load_model_and_tokenizer(
model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
model.config = hf_model_config

- tokenizer = load_tokenizer(
- model_id=model_config.model_id,
- revision=model_config.revision,
- adapter_base_model_id=model_config.adapter_base_model_id,
- trust_remote_code=benchmark_config.trust_remote_code,
- model_max_length=true_max_model_len,
- model_cache_dir=model_config.model_cache_dir,
- token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
- )
-
return model, tokenizer


@@ -1157,15 +1184,13 @@ def get_end_of_reasoning_token_id(

# Generate a completion and remove the BOS token from it, to not confuse it with the
# potential reasoning token
-
-
-
-
- use_tqdm=False,
- )[0]
- .outputs[0]
- .text
+ model_output = model.generate(
+ prompts=[prompt],
+ sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+ use_tqdm=False,
)
+ completion = model_output[0].outputs[0].text
+
if tokenizer.bos_token is not None:
if isinstance(tokenizer.bos_token, str):
prompt = prompt.replace(tokenizer.bos_token, "").strip()

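The dtype logic added to `load_model_and_tokenizer` is worth reading on its own: vLLM does not run FP32 checkpoints, so they are downcast, preferring BF16 when the GPU supports it, and quantized checkpoints are forced to FP16. A rough standalone sketch of that decision, taking the checkpoint dtype as a plain argument instead of reading it from a Hugging Face config:

    import torch

    def choose_vllm_dtype(
        checkpoint_dtype: "torch.dtype | None", quantized: bool
    ) -> "str | torch.dtype":
        """Default to vLLM's "auto" dtype, downcast FP32, force FP16 when quantized."""
        dtype: "str | torch.dtype" = "auto"
        if checkpoint_dtype == torch.float32:
            bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            dtype = torch.bfloat16 if bf16_ok else torch.float16
        if quantized and checkpoint_dtype != torch.float16:
            dtype = torch.float16
        return dtype

    print(choose_vllm_dtype(torch.float32, quantized=False))
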
euroeval/benchmarker.py
CHANGED
@@ -366,14 +366,18 @@ class Benchmarker:
dataset_names=benchmark_config.datasets
)

+ total_benchmarks = len(model_ids) * len(dataset_configs)
+ num_finished_benchmarks = 0
+
current_benchmark_results: list[BenchmarkResult] = list()
- for
+ for model_id in model_ids:
try:
model_config = get_model_config(
- model_id=
+ model_id=model_id, benchmark_config=benchmark_config
)
except InvalidModel as e:
logger.info(e.message)
+ num_finished_benchmarks += len(dataset_configs)
continue

loaded_model: BenchmarkModule | None = None
@@ -381,16 +385,18 @@ class Benchmarker:
# Skip if we have already benchmarked this model on this dataset and
# we are not forcing the benchmark
if not benchmark_config.force and model_has_been_benchmarked(
- model_id=
+ model_id=model_id,
dataset=dataset_config.name,
few_shot=benchmark_config.few_shot,
validation_split=not benchmark_config.evaluate_test_split,
benchmark_results=self.benchmark_results,
):
logger.debug(
- f"Skipping benchmarking {
- " as it
+ f"Skipping benchmarking {model_id} on "
+ f"{dataset_config.pretty_name}, as it "
+ "has already been benchmarked."
)
+ num_finished_benchmarks += 1
continue

# We do not re-initialise generative models as their architecture is not
@@ -413,6 +419,15 @@ class Benchmarker:
if benchmark_config.raise_errors:
raise e
logger.info(e.message)
+
+ # Add the remaining number of benchmarks for the model to
+ # our benchmark counter, since we're skipping the
+ # rest of them
+ num_finished_benchmarks += (
+ len(dataset_configs)
+ - dataset_configs.index(dataset_config)
+ - 1
+ )
break
else:
loaded_model.dataset_config = dataset_config
@@ -435,16 +450,24 @@ class Benchmarker:
if benchmark_config.raise_errors:
raise benchmark_output_or_err
logger.info(
- f"{
+ f"{model_id} could not be benchmarked on "
f"{dataset_config.pretty_name}. Skipping. The error message "
f"raised was {benchmark_output_or_err.message!r}."
)
+ num_finished_benchmarks += 1
continue

elif isinstance(benchmark_output_or_err, InvalidModel):
if benchmark_config.raise_errors:
raise benchmark_output_or_err
logger.info(benchmark_output_or_err.message)
+
+ # Add the remaining number of benchmarks for the model to
+ # our benchmark counter, since we're skipping the
+ # rest of them
+ num_finished_benchmarks += (
+ len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+ )
break

else:
@@ -453,6 +476,12 @@ class Benchmarker:
if benchmark_config.save_results:
record.append_to_results(results_path=self.results_path)

+ num_finished_benchmarks += 1
+ logger.info(
+ f"Finished {num_finished_benchmarks} out of "
+ f"{total_benchmarks} benchmarks."
+ )
+
if benchmark_config.clear_model_cache:
clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

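The counter threaded through the benchmarking loop above is plain bookkeeping over the model × dataset grid: a finished run adds one, and any early exit adds however many combinations it skips. A compact sketch of the accounting, detached from the real Benchmarker class and using made-up model and dataset names:

    model_ids = ["model-a", "model-b"]
    dataset_configs = ["dataset-1", "dataset-2", "dataset-3"]

    total_benchmarks = len(model_ids) * len(dataset_configs)
    num_finished_benchmarks = 0

    for model_id in model_ids:
        for idx, dataset in enumerate(dataset_configs):
            if model_id == "model-b" and dataset == "dataset-2":
                # A model-level failure: count the datasets we will no longer run.
                num_finished_benchmarks += len(dataset_configs) - idx - 1
                break
            num_finished_benchmarks += 1
            print(f"Finished {num_finished_benchmarks} out of {total_benchmarks} benchmarks.")
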
euroeval/constants.py
CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
DUMMY_FILL_VALUE = 100


+ # This is the maximum allowed context length for models for the purpose of this
+ # benchmark. We will still report the models' true maximum context length in the
+ # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+ # all tokens in the context.
+ MAX_CONTEXT_LENGTH = 5_000
+
+
# We need to raise the amount of tokens generated for reasoning models, to give them
# time to think
REASONING_MAX_TOKENS = 8_192
@@ -47,7 +54,7 @@ TASK_GROUPS_USING_LOGPROBS = [
MAX_LOGPROBS = 10


- # We make sure to remove these metric
+ # We make sure to remove these metric attributes after each iteration, to avoid memory
# leaks
METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]

euroeval/data_models.py
CHANGED
@@ -10,10 +10,9 @@ from dataclasses import dataclass, field
import pydantic
import torch

- from euroeval.utils import get_package_version
-
from .enums import Device, InferenceBackend, ModelType, TaskGroup
from .types import ScoreDict
+ from .utils import get_package_version


@dataclass

euroeval/dataset_configs.py
CHANGED
@@ -1643,7 +1643,7 @@ ORANGE_SUM_CONFIG = DatasetConfig(

ILPOST_SUM_CONFIG = DatasetConfig(
name="ilpost-sum",
- pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+ pretty_name="the truncated version of the Italian summarisation dataset IlPost-Sum",
huggingface_id="EuroEval/ilpost-sum",
task=SUMM,
languages=[IT],

euroeval/task_utils/sequence_classification.py
CHANGED
@@ -10,6 +10,7 @@ import numpy as np
from evaluate import EvaluationModule

from ..data_models import BenchmarkConfig, GenerativeModelOutput
+ from ..exceptions import InvalidBenchmark
from ..utils import log_once, raise_if_model_output_contains_nan_values

if t.TYPE_CHECKING:
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
input_batch: dict[str, list],
model_output: GenerativeModelOutput,
dataset_config: "DatasetConfig",
+ first_label_token_mapping: dict[str, str] | bool,
) -> list[str]:
"""Extract the predicted labels from the generated output.

@@ -121,13 +123,19 @@
The raw generated output of the model.
dataset_config:
The configuration of the dataset.
+ first_label_token_mapping:
+ A mapping from labels to the first token in each label, or alternatively a
+ Boolean value indicating whether the model should output scores (if the
+ mapping is outputted then the model will always output scores).

Returns:
The predicted labels.
"""
if model_output.scores is not None:
return get_closest_logprobs_labels(
- generation_logprobs=model_output.scores,
+ generation_logprobs=model_output.scores,
+ dataset_config=dataset_config,
+ first_label_token_mapping=first_label_token_mapping,
)
else:
return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@
def get_closest_logprobs_labels(
generation_logprobs: list[list[list[tuple[str, float]]]],
dataset_config: "DatasetConfig",
+ first_label_token_mapping: dict[str, str] | bool,
) -> list[str]:
"""Get the labels with the highest predicted logprob value.

@@ -152,6 +161,10 @@
(batch_size, num_tokens, num_logprobs).
dataset_config:
The configuration of the dataset.
+ first_label_token_mapping:
+ A mapping from labels to the first token in each label, or alternatively a
+ Boolean value indicating whether the model should output scores (if the
+ mapping is outputted then the model will always output scores).

Returns:
The predicted labels.
@@ -185,11 +198,29 @@
generated_label = "".join(previously_generated_labels) + generated_label

# Get the candidate labels that starts with the generated label
-
-
-
-
-
+ if isinstance(first_label_token_mapping, dict):
+ if any(
+ candidate_label not in first_label_token_mapping
+ for candidate_label in candidate_labels
+ ):
+ raise InvalidBenchmark(
+ "There is a label not present in the first label token "
+ "mapping - this should never happen! Please report this "
+ "issue to the EuroEval team at "
+ "github.com/EuroEval/EuroEval/issues."
+ )
+
+ candidate_output_labels = {
+ candidate_label
+ for candidate_label in candidate_labels
+ if generated_label == first_label_token_mapping[candidate_label]
+ }
+ else:
+ candidate_output_labels = {
+ candidate_label
+ for candidate_label in candidate_labels
+ if candidate_label.startswith(generated_label)
+ }

# If we can uniquely determine the output label, we break the loop. If
# there are multiple possible labels then we store the current one, and
@@ -206,7 +237,7 @@
else:
output_label = candidate_output_labels.pop()
candidate_output_labels.add(output_label)
-
+ raise InvalidBenchmark(
"Multiple candidate labels found for the generated label "
f"{generated_label!r}: {candidate_output_labels}. Since "
"this is not the first generated label, we cannot "
@@ -214,9 +245,13 @@
f"forced to use the arbitrary {output_label!r} as the "
"output label, potentially resulting in worse performance. "
"Please report this issue to the EuroEval team at "
- "github.com/EuroEval/EuroEval/issues."
- level=logging.WARNING,
+ "github.com/EuroEval/EuroEval/issues."
)
+ elif len(candidate_output_labels) == 0:
+ logger.debug(
+ f"No candidate label found for the generated label "
+ f"{generated_label!r}. The generated label is thus ignored."
+ )

if output_label is not None:
output_labels.append(output_label)

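The rewritten candidate selection in `get_closest_logprobs_labels` reduces to two branches: with a first-token mapping, a generated token has to equal a label's first token exactly; without one, it falls back to prefix matching. A small sketch of both branches with invented labels and tokens:

    def candidate_labels_for(
        generated_token: str,
        candidate_labels: list[str],
        first_label_token_mapping: "dict[str, str] | bool",
    ) -> set[str]:
        if isinstance(first_label_token_mapping, dict):
            # Exact match against the pre-computed first token of each label.
            return {
                label
                for label in candidate_labels
                if generated_token == first_label_token_mapping[label]
            }
        # Fallback: any label that starts with the generated text.
        return {label for label in candidate_labels if label.startswith(generated_token)}

    mapping = {"positive": "posit", "negative": "negat"}
    print(candidate_labels_for("posit", ["positive", "negative"], mapping))  # {'positive'}
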
euroeval/utils.py
CHANGED
@@ -7,12 +7,12 @@ import importlib.util
import logging
import os
import random
+ import re
import sys
import typing as t
import warnings
from functools import cache
from pathlib import Path
- from types import TracebackType

import litellm
import numpy as np
@@ -20,7 +20,6 @@ import requests
import torch
from datasets.utils import disable_progress_bar
from requests.exceptions import RequestException
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
from transformers import logging as tf_logging

from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -29,6 +28,11 @@ if importlib.util.find_spec("ray") is not None:
import ray

if t.TYPE_CHECKING:
+ from types import TracebackType
+
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
+
+ from .data_models import DatasetConfig
from .types import Predictions


@@ -285,7 +289,7 @@ class HiddenPrints:
self,
exc_type: t.Type[BaseException],
exc_val: BaseException,
- exc_tb: TracebackType,
+ exc_tb: "TracebackType",
) -> None:
"""Exit the context manager."""
sys.stdout.close()
@@ -355,7 +359,6 @@ def should_prompts_be_stripped(
return strip_prompts


- # TODO: This is currently not used - maybe remove.
def should_prefix_space_be_added_to_labels(
labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
) -> bool:
@@ -576,3 +579,96 @@ def get_package_version(package_name: str) -> str | None:
return importlib.metadata.version(package_name)
except importlib.metadata.PackageNotFoundError:
return None
+
+
+ def get_first_label_token_mapping(
+ dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+ ) -> dict[str, str] | bool:
+ """Check if the model should output scores.
+
+ Args:
+ dataset_config:
+ The dataset configuration.
+ tokenizer:
+ The tokenizer, or None if not available.
+
+ Returns:
+ A mapping from labels to the first token in each label, or alternatively a
+ Boolean value indicating whether the model should output scores (if the mapping
+ is outputted then the model will always output scores).
+ """
+ # Importing here to avoid circular imports
+ from .constants import TASK_GROUPS_USING_LOGPROBS
+
+ # If we do not have any tokenizer, then we cannot check if the model should output
+ # scores and we just assume it should if the dataset supports it
+ output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
+ if tokenizer is None:
+ if output_scores:
+ log_once(
+ "The model will output scores, since the dataset supports it and no "
+ "tokenizer is available.",
+ level=logging.DEBUG,
+ )
+ else:
+ log_once(
+ "The model will not output scores, since the dataset does not support "
+ "it and no tokenizer is available.",
+ level=logging.DEBUG,
+ )
+ return output_scores
+
+ # If there are labels associated with the dataset, and that the first token of each
+ # label is distinct, then we can safely use the logprobs
+ if output_scores and dataset_config.labels:
+ local_labels = [
+ dataset_config.prompt_label_mapping[label].strip()
+ for label in dataset_config.labels
+ ]
+
+ # Get the first token of each label, where we add a prefix space if needed
+ add_prefix_space = (
+ should_prefix_space_be_added_to_labels(
+ labels_to_be_generated=local_labels, tokenizer=tokenizer
+ )
+ and tokenizer.chat_template is None
+ )
+ first_tokens = [
+ tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
+ for label in local_labels
+ ]
+ first_tokens = [
+ re.sub(
+ pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
+ )
+ for token in first_tokens
+ ]
+
+ # Build a mapping from labels to the first token in each label if the first
+ # tokens are distinct
+ if len(first_tokens) == len(set(first_tokens)):
+ log_once(
+ "The model will output scores, since the first tokens of the labels "
+ "are distinct.",
+ level=logging.DEBUG,
+ )
+ return {
+ label: first_token
+ for label, first_token in zip(local_labels, first_tokens)
+ }
+ else:
+ log_once(
+ "The model will not output scores, since the first tokens of the "
+ "labels are not distinct. The first tokens for the labels "
+ f"{local_labels} are {first_tokens}"
+ )
+ return False
+
+ # Otherwise, we assume that the model should not output scores, to avoid potential
+ # evaluation errors. This will force the label extraction to rely on word edit
+ # distance instead of logprobs.
+ log_once(
+ "The model will not output scores, since the dataset does not have labels.",
+ level=logging.DEBUG,
+ )
+ return False

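The heart of the new `get_first_label_token_mapping` utility is: tokenize each localised label, keep only its first token, strip non-letter characters from it, and only allow logprob-based extraction when those first tokens are pairwise distinct. A stripped-down sketch using a Hugging Face tokenizer; the `gpt2` checkpoint is only an example and the prefix-space handling of the real function is omitted:

    import re
    from transformers import AutoTokenizer

    def first_label_token_mapping(labels: list[str], tokenizer) -> "dict[str, str] | bool":
        first_tokens = [tokenizer.tokenize(label)[0] for label in labels]
        # Normalise away leading/trailing non-letter characters (e.g. BPE word markers).
        first_tokens = [
            re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", token.lower())
            for token in first_tokens
        ]
        if len(first_tokens) == len(set(first_tokens)):
            return dict(zip(labels, first_tokens))
        return False  # colliding first tokens make logprob extraction unsafe

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example model
    print(first_label_token_mapping(["positive", "negative"], tokenizer))
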
{euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: EuroEval
- Version: 15.4.2
+ Version: 15.5.0
Summary: The robust European language model benchmark.
Project-URL: Repository, https://github.com/EuroEval/EuroEval
Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -37,7 +37,7 @@ Requires-Dist: demjson3>=3.0.6
Requires-Dist: evaluate>=0.4.1
Requires-Dist: huggingface-hub>=0.24.0
Requires-Dist: levenshtein>=0.24.0
- Requires-Dist: litellm>=1.
+ Requires-Dist: litellm>=1.63.0
Requires-Dist: more-itertools>=10.5.0
Requires-Dist: numpy<2.0.0,>=1.23.0
Requires-Dist: ollama>=0.4.7
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
Requires-Dist: gradio>=4.26.0; extra == 'all'
Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
Provides-Extra: generative
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
Provides-Extra: human-evaluation
Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
Provides-Extra: test
@@ -218,6 +218,7 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
$ uv run src/scripts/create_allocine.py
```

+
## Special Thanks :pray:
- Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
models on the leaderboards.

{euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
- euroeval/__init__.py,sha256=
+ euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
- euroeval/benchmarker.py,sha256=
+ euroeval/benchmarker.py,sha256=8Qt1NL7k5n-AfFrhR6139wmmsVS7CgRa-QjminH0d_c,47849
euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
- euroeval/constants.py,sha256=
+ euroeval/constants.py,sha256=CJavEDvKLSKAC4uyz44sFrY1W1AnjUsxkXF63SoMjw4,1985
euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
- euroeval/data_models.py,sha256=
- euroeval/dataset_configs.py,sha256=
+ euroeval/data_models.py,sha256=QssdR_msDTmsp9yKe0cVba0iCpgBTFTOaOUn44o1cl8,14770
+ euroeval/dataset_configs.py,sha256=6WiRW-VAAMIL6-1J6Nb6pCm6mf4I-oQ087zB0es3HHs,90644
euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
- euroeval/utils.py,sha256=
+ euroeval/utils.py,sha256=bbq7WCcIrMKjBRaZ8EcnRpRAvL_F-tCxiL0We_po3QE,22397
euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
- euroeval/benchmark_modules/hf.py,sha256=
- euroeval/benchmark_modules/litellm.py,sha256=
- euroeval/benchmark_modules/vllm.py,sha256=
+ euroeval/benchmark_modules/hf.py,sha256=VcgWZmSZc4B3FgeUGC0eWQIRv97luU22-KijaBfuqU0,43602
+ euroeval/benchmark_modules/litellm.py,sha256=pbTsq6Bb8cnFbdZMUSrUs-XlNAyaCIWNcEKKRIfprx8,45161
+ euroeval/benchmark_modules/vllm.py,sha256=7AZrvcwHevrQbXvbjTCp4S6HpM0Obk6CIQLbmUWIn9s,47483
euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
- euroeval/task_utils/sequence_classification.py,sha256=
+ euroeval/task_utils/sequence_classification.py,sha256=JDZfiTj5RdwYwlhhTqVBj2mVdwmkoykZ6wJzEbWj0lo,12225
euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
- euroeval-15.
- euroeval-15.
- euroeval-15.
- euroeval-15.
- euroeval-15.
+ euroeval-15.5.0.dist-info/METADATA,sha256=T48YoPuFBEFI5sxgUadzkD3tidIB3TA1mKEKsFuh7fs,10752
+ euroeval-15.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.5.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.5.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.5.0.dist-info/RECORD,,

{euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/WHEEL: File without changes
{euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/entry_points.txt: File without changes
{euroeval-15.4.2.dist-info → euroeval-15.5.0.dist-info}/licenses/LICENSE: File without changes