EuroEval 15.6.1__py3-none-any.whl → 15.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- euroeval/benchmark_modules/litellm.py +136 -31
- euroeval/benchmark_modules/vllm.py +105 -38
- euroeval/benchmarker.py +12 -2
- euroeval/constants.py +1 -1
- euroeval/data_loading.py +48 -26
- euroeval/data_models.py +0 -8
- euroeval/dataset_configs/finnish.py +60 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +8 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/reading_comprehension.py +11 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +9 -1
- euroeval/task_group_utils/sequence_classification.py +27 -32
- euroeval/task_group_utils/text_to_text.py +10 -27
- euroeval/tasks.py +1 -1
- euroeval/tokenization_utils.py +22 -6
- {euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/METADATA +14 -2
- {euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/RECORD +22 -21
- {euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/WHEEL +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/licenses/LICENSE +0 -0

euroeval/benchmark_modules/litellm.py
CHANGED

@@ -33,6 +33,7 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.types.utils import ChoiceLogprobs, ModelResponse
+from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.auto import tqdm
 from transformers.trainer import Trainer
@@ -104,6 +105,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
     r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+    r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
     # Gemini models
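The mapping above keys maximum context lengths by regular-expression patterns rather than exact model IDs. As an illustration only (the helper name and example IDs below are not part of the package), such a mapping can be resolved with re.fullmatch, the same primitive the module later uses for its REASONING_MODELS check:

import re

# Illustrative subset of a regex-keyed context-length mapping.
MAX_LENGTHS = {
    r"gpt-4.1.*": 1_047_576,
    r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
}


def resolve_max_length(model_id: str) -> int | None:
    """Return the context length of the first pattern that fully matches the ID."""
    for pattern, max_length in MAX_LENGTHS.items():
        if re.fullmatch(pattern, model_id):
            return max_length
    return None


print(resolve_max_length("gpt-4.1-mini"))   # 1047576
print(resolve_max_length("o1-2024-12-17"))  # 200000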
@@ -135,20 +137,23 @@ ALLOWED_PARAMS = {
     r"gpt-4.*": [],
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
     # Anthropic models
-    r"(anthropic/)?claude-3
-    r"(anthropic/)?claude-3
-    r"(anthropic/)?claude-3
+    r"(anthropic/)?claude-3-(haiku|sonnet|opus).*": [],
+    r"(anthropic/)?claude-3-5-.*": [],
+    r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
     # Gemini models
     r"(gemini/)?gemini-.*": [],
     # xAI models
-    r"(xai/)?grok.*": [],
+    r"(xai/)?grok-2.*": [],
+    r"(xai/)?grok-3(-fast)?(-beta)?": [],
+    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
 }
 
 
 REASONING_MODELS = [
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
     r"(gemini/)?gemini.*thinking.*",
-    r"(gemini/)?gemini-2.5
+    r"(gemini/)?gemini-2.5.*",
+    r"(xai/)?grok-3-mini.*",
 ]
 
 
@@ -190,7 +195,10 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-            dataset_config=self.dataset_config,
+            dataset_config=self.dataset_config,
+            model_config=self.model_config,
+            tokenizer=None,
+            generative_type=self.generative_type,
         )
 
     @property
@@ -201,13 +209,20 @@ class LiteLLMModel(BenchmarkModule):
             The generative type of the model, or None if it has not been set yet.
         """
         if self.model_config.revision == "thinking":
-
+            type_ = GenerativeType.REASONING
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
-
+            type_ = GenerativeType.REASONING
         else:
-
+            type_ = GenerativeType.INSTRUCTION_TUNED
+
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     def generate(self, inputs: dict) -> GenerativeModelOutput:
         """Generate outputs from the model.
@@ -243,7 +258,10 @@ class LiteLLMModel(BenchmarkModule):
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-            dataset_config=self.dataset_config,
+            dataset_config=self.dataset_config,
+            model_config=self.model_config,
+            tokenizer=None,
+            generative_type=self.generative_type,
         )
 
         if self.buffer["first_label_token_mapping"]:
@@ -254,16 +272,41 @@ class LiteLLMModel(BenchmarkModule):
             assert "json" in messages[0]["content"].lower(), (
                 "Prompt must contain 'json' for JSON tasks."
             )
-
-
-
-
-
-
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            elif litellm.utils.supports_response_schema(
+                model=self.model_config.model_id
+            ):
+                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                generation_kwargs["response_format"] = pydantic_class
+                log_once(
+                    "Enabling structured generation for model "
+                    f"{self.model_config.model_id!r} with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
+            else:
+                generation_kwargs["response_format"] = dict(type="json_object")
+                log_once(
+                    "Enabling structured JSON generation for model "
+                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
+                    "the model does not support schemas.",
+                    level=logging.DEBUG,
+                )
 
         if self.model_config.revision == "thinking":
             generation_kwargs["thinking"] = dict(
-                type="enabled", budget_tokens=REASONING_MAX_TOKENS
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
             )
             log_once(
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
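The structured-generation branch added above builds its response format dynamically with Pydantic. A minimal sketch of that pattern, with illustrative tag names standing in for the dataset config's prompt_label_mapping values:

import typing as t

from pydantic import conlist, create_model

# Illustrative NER tag names; in the package they come from the dataset config.
ner_tag_names = ["person", "location", "organisation", "miscellaneous"]

# One field per tag, each a list of strings with at most five entries.
keys_and_their_types: dict[str, t.Any] = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# The JSON schema of the dynamically created model is what is passed on as the
# response format.
print(AnswerFormat.model_json_schema())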
@@ -280,28 +323,42 @@ class LiteLLMModel(BenchmarkModule):
         # This drops generation kwargs that are not supported by the model
         litellm.drop_params = True
 
+        # Error messages that we want to catch and handle
+        stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
+        logprobs_messages = [
+            "you are not allowed to request logprobs",
+            "you've reached the maximum number of requests with logprobs",
+            "logprobs is not supported",
+            "logprobs is not enabled",
+        ]
+        temperature_messages = [
+            "'temperature' is not supported with this model.",
+            "temperature is not supported with this model",
+        ]
+        temperature_must_be_one_messages = [
+            "`temperature` may only be set to 1",
+            "'temperature' does not support 0.0 with this model. Only the default "
+            "(1) value is supported",
+        ]
+        max_items_messages = ["'maxItems' is not permitted."]
+        no_json_schema_messages = ["Property keys should match pattern"]
+
         # Extract the generated sequences from the model response. Some APIs cannot
         # handle using newlines as stop sequences, so we try both.
         num_attempts = 10
         for _ in range(num_attempts):
-            stop_messages = ["stop_sequences"]
-            logprobs_messages = [
-                "you are not allowed to request logprobs",
-                "you've reached the maximum number of requests with logprobs",
-                "logprobs is not supported",
-                "logprobs is not enabled",
-            ]
-            temperature_messages = [
-                "'temperature' is not supported with this model.",
-                "temperature is not supported with this model",
-            ]
             try:
-                model_response = litellm.
-                    messages=messages,
+                model_response = litellm.completion_with_retries(
+                    messages=messages, **generation_kwargs
                 )
                 break
             except (BadRequestError, RateLimitError) as e:
                 if any(msg.lower() in str(e).lower() for msg in stop_messages):
+                    log_once(
+                        f"The model {self.model_config.model_id!r} does not support "
+                        "stop sequences, so disabling them.",
+                        level=logging.DEBUG,
+                    )
                     generation_kwargs["stop"] = None
                 elif (
                     any(msg.lower() in str(e).lower() for msg in logprobs_messages)
@@ -310,10 +367,55 @@ class LiteLLMModel(BenchmarkModule):
                     # we ignore this since the rate limiting makes it unusable anyway.
                     or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
                 ):
+                    log_once(
+                        f"The model {self.model_config.model_id!r} does not support "
+                        "logprobs, so disabling it.",
+                        level=logging.DEBUG,
+                    )
                     generation_kwargs.pop("logprobs")
                     generation_kwargs.pop("top_logprobs")
                 elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
+                    log_once(
+                        f"The model {self.model_config.model_id!r} does not support "
+                        "temperature, so disabling it.",
+                        level=logging.DEBUG,
+                    )
                     generation_kwargs.pop("temperature")
+                elif any(
+                    msg.lower() in str(e).lower()
+                    for msg in temperature_must_be_one_messages
+                ):
+                    log_once(
+                        f"The model {self.model_config.model_id!r} requires "
+                        "temperature to be set to 1, so setting it.",
+                        level=logging.DEBUG,
+                    )
+                    generation_kwargs["temperature"] = 1.0
+                elif any(msg.lower() in str(e).lower() for msg in max_items_messages):
+                    log_once(
+                        f"The model {self.model_config.model_id!r} does not support "
+                        "maxItems in the JSON schema, so disabling it.",
+                        level=logging.DEBUG,
+                    )
+                    ner_tag_names = list(
+                        self.dataset_config.prompt_label_mapping.values()
+                    )
+                    keys_and_their_types = {
+                        tag_name: (list[str], ...) for tag_name in ner_tag_names
+                    }
+                    pydantic_class = create_model(
+                        "AnswerFormat", **keys_and_their_types
+                    )
+                    generation_kwargs["response_format"] = pydantic_class
+                elif any(
+                    msg.lower() in str(e).lower() for msg in no_json_schema_messages
+                ):
+                    log_once(
+                        f"The model {self.model_config.model_id!r} does not support "
+                        "JSON schemas, so using the vanilla JSON format.",
+                        level=logging.DEBUG,
+                    )
+                    generation_kwargs["response_format"] = dict(type="json_object")
                 elif isinstance(e, RateLimitError):
                     raise InvalidModel(
                         "You have encountered your rate limit for model "
@@ -332,6 +434,7 @@ class LiteLLMModel(BenchmarkModule):
                 Timeout,
                 ServiceUnavailableError,
                 InternalServerError,
+                SystemError,
             ) as e:
                 logger.debug(
                     f"Service temporarily unavailable. The error message was: {e}. "
@@ -359,9 +462,11 @@ class LiteLLMModel(BenchmarkModule):
                 "reasoning. Returning an empty string."
             )
             return GenerativeModelOutput(sequences=[""])
+
         model_response_choices = model_response.choices[0]
         assert isinstance(model_response_choices, litellm.Choices)
-
+        generated_message: litellm.Message = model_response_choices.message
+        generation_output = generated_message.content or ""
         generation_output = generation_output.strip()
 
         # Structure the model output as a GenerativeModelOutput object

euroeval/benchmark_modules/vllm.py
CHANGED

@@ -132,7 +132,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
         self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
-            model=self._model, tokenizer=self._tokenizer
+            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
 
         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +146,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         self.buffer |= dict(
             instruction_model=self._tokenizer.chat_template is not None,
             first_label_token_mapping=get_first_label_token_mapping(
-                dataset_config=self.dataset_config,
+                dataset_config=self.dataset_config,
+                model_config=self.model_config,
+                tokenizer=self._tokenizer,
+                generative_type=self.generative_type,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -332,30 +335,40 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)
 
+        logits_processor = None
         if self.dataset_config.task in TASKS_USING_JSON:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            else:
+                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                logits_processor = JSONLogitsProcessor(
+                    schema=pydantic_class,
+                    tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
+                    whitespace_pattern=r" ?",
+                )
+                log_once(
+                    "Using structured generation with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
 
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-            dataset_config=self.dataset_config,
+            dataset_config=self.dataset_config,
+            model_config=self.model_config,
+            tokenizer=self._tokenizer,
+            generative_type=self.generative_type,
         )
 
         # Define the parameters used for vLLM generation
@@ -391,7 +404,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         ) and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
         ):
-            log_once(
+            log_once(
+                f"Stripping prompts for model {self.model_config.model_id!r}.",
+                level=logging.DEBUG,
+            )
             prompts = [prompt.strip() for prompt in prompts]
 
         # Generate sequences using vLLM
@@ -411,18 +427,64 @@ class VLLMModel(HuggingFaceEncoderModel):
                     f"Encountered error during vLLM generation: {str(e)}. Retrying..."
                 )
                 sleep(1)
+            except ValueError as e:
+                # Truncate the prompts if they are too long for the model
+                truncate_error_messages = [
+                    r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                ]
+                if any(
+                    re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+                    for pattern in truncate_error_messages
+                ):
+                    logger.info(
+                        "Prompts are too long, so truncating them and trying again..."
+                    )
+                    tokenized_prompts = self._tokenizer(
+                        text=prompts,
+                        truncation=True,
+                        max_length=max(
+                            self._tokenizer.model_max_length - max_tokens, 0
+                        ),
+                    )
+                    prompts = self._tokenizer.batch_decode(
+                        sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        f"An error occurred during vLLM generation: {str(e)}"
+                    )
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
             )
 
+        # When we shorten the prompts then some residual model outputs persist, so we
+        # need to filter these out
+        num_extra_outputs = len(raw_outputs) - len(prompts)
+        if num_extra_outputs > 0:
+            raw_outputs = raw_outputs[num_extra_outputs:]
+            if not all(
+                raw_output.prompt == prompt
+                for raw_output, prompt in zip(raw_outputs, prompts)
+            ):
+                raise InvalidBenchmark(
+                    f"The prompts and the model outputs do not match. There were "
+                    f"{num_extra_outputs!r} extra outputs."
+                )
+            else:
+                logger.debug(
+                    f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+                    "which occured as we interupted the generation when we truncated "
+                    "the prompts."
+                )
+
         # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
         if self.end_of_reasoning_token_id in completion_ids[0]:
             completion_ids = [
-                token_ids[token_ids.index(self.end_of_reasoning_token_id) +
+                token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
                 if self.end_of_reasoning_token_id in token_ids
                 else token_ids
                 for token_ids in completion_ids
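The ValueError branch added above recovers from over-long prompts by re-tokenising them with truncation, leaving room for the generation budget, and decoding them back to text. A small standalone sketch of that fallback (the tokenizer name, context length and budget are illustrative):

from transformers import AutoTokenizer

# Illustrative tokenizer, context length and generation budget.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.model_max_length = 1024
max_tokens = 256

prompts = ["A very long prompt. " * 500, "A short prompt."]

# Truncate each prompt so that prompt length plus generation budget fits the model.
tokenized_prompts = tokenizer(
    text=prompts,
    truncation=True,
    max_length=max(tokenizer.model_max_length - max_tokens, 0),
)
prompts = tokenizer.batch_decode(
    sequences=tokenized_prompts.input_ids, skip_special_tokens=True
)
print([len(tokenizer(p).input_ids) for p in prompts])  # both fit within the budget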
@@ -435,6 +497,13 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         completions = [completion.strip() for completion in completions]
 
+        # Sanity check
+        if len(completions) != len(prompts):
+            breakpoint()
+            raise InvalidBenchmark(
+                f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+            )
+
         # Add logprobs scores to the output
         if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
@@ -809,7 +878,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                 if name.lower() in language_codes:
                     chat_template = candidate_template
                     log_once(
-                        f"Using the {name!r} chat template for the tokenizer
+                        f"Using the {name!r} chat template for the tokenizer for "
+                        f"model {self.model_config.model_id!r}.",
                         level=logging.DEBUG,
                     )
                     break
@@ -1169,7 +1239,7 @@ def clear_vllm() -> None:
 
 
 def get_end_of_reasoning_token_id(
-    model: "LLM", tokenizer: "PreTrainedTokenizer"
+    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
 ) -> int | None:
     """Get the end of reasoning token ID for a generative model.
 
@@ -1182,6 +1252,8 @@ def get_end_of_reasoning_token_id(
             The vLLM model.
         tokenizer:
             The tokenizer.
+        model_id:
+            The model ID.
 
     Returns:
         The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1292,8 @@ def get_end_of_reasoning_token_id(
     completion_match = re.search(pattern=r"<\w+>", string=completion)
     if completion_match is None and prompt_match is None:
         log_once(
-
-
-            "reasoning model."
-            ),
+            f"Could not find a reasoning token for model {model_id!r}, so assuming "
+            "the model is not a reasoning model.",
             level=logging.DEBUG,
         )
         return None
@@ -1249,20 +1319,17 @@ def get_end_of_reasoning_token_id(
         or end_of_reasoning_token not in special_tokens
     ):
         log_once(
-
-
-
-
-            ),
+            f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+            f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+            "them is not registered as a special token, so assuming it is not a "
+            "real reasoning token.",
             level=logging.DEBUG,
         )
         return None
 
     log_once(
-
-
-        f"token {end_of_reasoning_token!r}."
-        ),
+        f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+        f"token {end_of_reasoning_token!r} for model {model_id!r}.",
         level=logging.DEBUG,
     )
 
euroeval/benchmarker.py
CHANGED

@@ -782,7 +782,11 @@ class Benchmarker:
             dataset_languages=[
                 language.code for language in dataset_config.languages
             ],
-            model=
+            model=(
+                f"{model_config.model_id}@{model_config.revision}"
+                if model_config.revision and model_config.revision != "main"
+                else model_config.model_id
+            ),
             results=results,
             num_model_parameters=model.num_params,
             max_sequence_length=model.model_max_length,
@@ -1076,6 +1080,10 @@ def initial_logging(
         benchmark_config:
             The general benchmark configuration.
     """
+    model_id = model_config.model_id
+    if model_config.revision and model_config.revision != "main":
+        model_id += f"@{model_config.revision}"
+
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:
        if benchmark_config.few_shot:
@@ -1084,8 +1092,9 @@ def initial_logging(
             eval_type = "Zero-shot benchmarking"
         else:
             eval_type = "Benchmarking"
+
         logger.info(
-            f"{eval_type} {
+            f"{eval_type} {model_id} on the {split_type} split of "
             f"{dataset_config.pretty_name}"
         )
 
@@ -1095,6 +1104,7 @@ def initial_logging(
             "meaning that the resulting evaluation will not be included in the "
             "official leaderboard."
         )
+
     if benchmark_config.debug:
         logger.info(
             "Running in debug mode. This will output additional information, as "
euroeval/constants.py
CHANGED

@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
+REASONING_MAX_TOKENS = 32_768
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
euroeval/data_loading.py
CHANGED

@@ -39,32 +39,9 @@ def load_data(
         HuggingFaceHubDown:
             If the Hugging Face Hub is down.
     """
-
-
-
-            dataset = load_dataset(
-                path=dataset_config.huggingface_id,
-                cache_dir=benchmark_config.cache_dir,
-                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
-            )
-            break
-        except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
-            logger.warning(
-                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
-            )
-            time.sleep(1)
-            continue
-        except HfHubHTTPError:
-            raise HuggingFaceHubDown()
-    else:
-        raise InvalidBenchmark(
-            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
-            f"{num_attempts} attempts."
-        )
-
-    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-
-    dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})
+    dataset = load_raw_data(
+        dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+    )
 
     if not benchmark_config.evaluate_test_split:
         dataset["test"] = dataset["val"]
@@ -101,3 +78,48 @@
         for idx in range(benchmark_config.num_iterations)
     ]
     return datasets
+
+
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+    """Load the raw dataset.
+
+    Args:
+        dataset_config:
+            The configuration for the dataset.
+        cache_dir:
+            The directory to cache the dataset.
+
+    Returns:
+        The dataset.
+    """
+    num_attempts = 5
+    for _ in range(num_attempts):
+        try:
+            dataset = load_dataset(
+                path=dataset_config.huggingface_id,
+                cache_dir=cache_dir,
+                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
+            )
+            break
+        except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
+            logger.warning(
+                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
+            )
+            time.sleep(1)
+            continue
+        except HfHubHTTPError:
+            raise HuggingFaceHubDown()
+    else:
+        raise InvalidBenchmark(
+            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
+            f"{num_attempts} attempts."
+        )
+    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
+    required_keys = ["train", "val", "test"]
+    missing_keys = [key for key in required_keys if key not in dataset]
+    if missing_keys:
+        raise InvalidBenchmark(
+            "The dataset is missing the following required splits: "
+            f"{', '.join(missing_keys)}"
+        )
+    return DatasetDict({key: dataset[key] for key in required_keys})
euroeval/data_models.py
CHANGED

@@ -521,14 +521,6 @@ class DatasetConfig:
 
         Returns:
             The natural string representation of the labels in specified language.
-
-        Raises:
-            NotImplementedError:
-                If `and_separator` or `or_separator` are `None`, see `Language`.
-
-        Example:
-            >>> get_labels_str(language=DA)
-            "'a', 'b', 'c' eller 'd'"
         """
         main_language = self.languages[0]
 

euroeval/dataset_configs/finnish.py
ADDED

@@ -0,0 +1,60 @@
+"""All Finnish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import FI
+from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SCANDISENT_FI_CONFIG = DatasetConfig(
+    name="scandisent-fi",
+    pretty_name="the truncated version of the Finnish part of the binary sentiment "
+    "classification dataset ScandiSent",
+    huggingface_id="EuroEval/scandisent-fi-mini",
+    task=SENT,
+    languages=[FI],
+    _labels=["negative", "positive"],
+)
+
+TURKU_NER_FI_CONFIG = DatasetConfig(
+    name="turku-ner-fi",
+    pretty_name="the Finnish part of the named entity recognition dataset Turku NER",
+    huggingface_id="EuroEval/turku-ner-fi-mini",
+    task=NER,
+    languages=[FI],
+)
+
+TYDIQA_FI_CONFIG = DatasetConfig(
+    name="tydiqa-fi",
+    pretty_name="the Finnish part of the TydiQA reading comprehension dataset",
+    huggingface_id="EuroEval/tydiqa-fi-mini",
+    task=RC,
+    languages=[FI],
+)
+
+XLSUM_FI_CONFIG = DatasetConfig(
+    name="xlsum-fi",
+    pretty_name="the Finnish summarisation dataset XL-Sum",
+    huggingface_id="EuroEval/xlsum-fi-mini",
+    task=SUMM,
+    languages=[FI],
+)
+
+HELLASWAG_FI_CONFIG = DatasetConfig(
+    name="hellaswag-fi",
+    pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+    "HellaSwag-fi, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-fi-mini",
+    task=COMMON_SENSE,
+    languages=[FI],
+)
+
+SCALA_FI_CONFIG = DatasetConfig(
+    name="scala-fi",
+    pretty_name="the Finnish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-fi",
+    task=LA,
+    languages=[FI],
+)
+
+### Unofficial datasets ###

euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Linguistic Acceptability task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 LA_TEMPLATES = {
     DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
         default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
         "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
+        default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
+        "kieliopillisesti oikein.",
+        default_prompt_template="Lause: {text}\nKieliopillisesti oikein: {label}",
+        default_instruction_prompt="Lause: {text}\n\nMääritä onko lause "
+        "oikein vai ei. Vastaa {labels_str}, ja ei mitään muuta.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping=dict(correct="ja", incorrect="nei"),
         default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "

euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
 
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,13 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
+        default_prompt_template="Kysymys: {text}\nVastaus: {label}",
+        default_instruction_prompt="Kysymys: {text}\n\nVastaa yllä olevaan kysymykseen "
+        "käyttämällä {labels_str}, äläkä mitään muuta.",
+        default_prompt_label_mapping="auto",
+    ),
     FR: PromptConfig(
         default_prompt_prefix="Les questions suivantes sont des questions à choix "
         "multiples (avec réponses).",

euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 NER_TEMPLATES = {
     DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "henkilö",
+            "i-per": "henkilö",
+            "b-loc": "paikka",
+            "i-loc": "paikka",
+            "b-org": "organisaatio",
+            "i-org": "organisaatio",
+            "b-misc": "muut",
+            "i-misc": "muut",
+        },
+        default_prompt_prefix="Seuraavassa on lauseita ja JSON-sanakirjoja, jotka "
+        "sisältävät annetussa lauseessa esiintyvät nimetyt entiteetit.",
+        default_prompt_template="Lause: {text}\nNimetyt entiteetit: {label}",
+        default_instruction_prompt="Lause: {text}\n\nTunnista lauseessa olevat "
+        "entiteetit. Tulosta ne JSON-sanakirjana, jonka avaimet ovat {labels_str}. "
+        "Arvojen tulee olla listoja kyseisen tyypin nimetyistä entiteeteistä "
+        "täsmälleen siinä muodossa kuin ne esiintyvät lauseessa.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "persónur",

euroeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Reading Comprehension task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 RC_TEMPLATES = {
     DA: PromptConfig(
@@ -39,6 +39,16 @@ RC_TEMPLATES = {
         "sobre el texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavassa on tekstejä ja niihin liittyviä kysymyksiä "
+        "ja vastauksia.",
+        default_prompt_template="Teksti: {text}\nKysymys: {question} "
+        "\nVastaa enintään 3 sanalla: {label}",
+        default_instruction_prompt="Teksti: {text}\n\nVastaa seuraavaan "
+        "kysymykseen yllä olevasta tekstistä enintään 3 sanalla.\n\n"
+        "Kysymys: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     FO: PromptConfig(
         default_prompt_prefix="Hetta eru tekstir saman við spurningum og svar.",
         default_prompt_template="Tekstur: {text}\nSpurningur: {question}\nSvara við í "

euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 SENT_TEMPLATES = {
     DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiivinen", neutral="neutrali", negative="negatiivinen"
+        ),
+        default_prompt_prefix="Seuraavassa on arvosteluja ja niiden tunnesävy, joka "
+        "voi olla {labels_str}.",
+        default_prompt_template="Teksti: {text}\nTunnesävy: {label}",
+        default_instruction_prompt="Teksti: {text}\n\nLuokittele arvostelun tunnesävy. "
+        "Vastaa vain {labels_str}, ei muuta.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positivt", neutral="neutralt", negative="negativt"

euroeval/prompt_templates/summarization.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES = {
@@ -36,6 +36,14 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
+        "tiivistelmiä.",
+        default_prompt_template="Uutisartikkeli: {text}\nTiivistelmä: {target_text}",
+        default_instruction_prompt="Uutisartikkeli: {text}\n\nKirjoita tiivistelmä "
+        "yllä olevasta artikkelista.",
+        default_prompt_label_mapping=dict(),
+    ),
     FR: PromptConfig(
         default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
         default_prompt_template="Document: {text}\nRésumé: {target_text}",

euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -132,22 +132,23 @@ def extract_labels_from_generation(
         The predicted labels.
     """
     if model_output.scores is not None:
-
+        labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
             dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
         )
-
-
-
-
+        if labels is not None:
+            return labels
+    return get_closest_word_edit_labels(
+        generated_sequences=model_output.sequences, dataset_config=dataset_config
+    )
 
 
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) -> list[str]:
+) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.
 
     In case a candidate label is split into multiple tokens, we only use the first
@@ -167,7 +168,7 @@ def get_closest_logprobs_labels(
         mapping is outputted then the model will always output scores).
 
     Returns:
-        The predicted labels.
+        The predicted labels, or None if labels could not be extracted.
 
     Raises:
         InvalidBenchmark:
@@ -193,10 +194,7 @@ def get_closest_logprobs_labels(
         # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
-
-        for label_idx, generated_label in enumerate(generated_labels):
-            generated_label = "".join(previously_generated_labels) + generated_label
-
+        for generated_label in generated_labels:
            # Get the candidate labels that starts with the generated label
            if isinstance(first_label_token_mapping, dict):
                if any(
@@ -222,31 +220,28 @@ def get_closest_logprobs_labels(
                     if candidate_label.startswith(generated_label)
                 }
 
-                # If we can uniquely determine the output label, we break the loop.
-                # there are multiple possible labels then we store the current one, and
-                # concatenate it with the next generated label. We can only do this if
-                # the current one is the first one, however, since we're using greedy
-                # sampling. In case this happens for a label that is not the first one,
-                # we warn the user.
+                # If we can uniquely determine the output label, we break the loop.
                 if len(candidate_output_labels) == 1:
                     output_label = candidate_output_labels.pop()
                     break
+
+                # If we have multiple candidate labels, we cannot uniquely determine the
+                # output label, so we abandon extracting the labels using logprobs and
+                # fall back to using word edit distance.
                 elif len(candidate_output_labels) > 1:
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    "github.com/EuroEval/EuroEval/issues."
-                    )
+                    log_once(
+                        "Multiple candidate labels found for the generated label "
+                        f"{generated_label!r}: {candidate_output_labels}. This means "
+                        "that using logprobs to extract the labels is not reliable, "
+                        "and we will instead fall back to extracting the labels "
+                        "using word edit distance.",
+                        level=logging.DEBUG,
+                    )
+                    return None
+
+                # If no candidate label is found, we ignore the generated label, as it
+                # basically means that the model is just really bad at generating
+                # labels.
                 elif len(candidate_output_labels) == 0:
                     logger.debug(
                         f"No candidate label found for the generated label "

euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -10,11 +10,7 @@ from evaluate import EvaluationModule
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..utils import
-    HiddenPrints,
-    clear_memory,
-    raise_if_model_output_contains_nan_values,
-)
+from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
@@ -89,20 +85,8 @@ def compute_metrics(
             score_dict: dict[str, float] | None = metric.compute(
                 predictions=predictions, references=labels, **cfg.compute_kwargs
             )
-
-            # Clear the cache of the BERTScorer to avoid memory leaks
-            for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
-                if hasattr(metric, attribute):
-                    delattr(metric, attribute)
-
-            clear_memory()
             break
         except Exception as e:
-            # Clear the cache of the BERTScorer to avoid memory leaks
-            if hasattr(metric, "cached_bertscorer"):
-                del metric.cached_bertscorer
-            clear_memory()
-
             oom_error = [
                 "CUDA out of memory",
                 "CUDA error",
@@ -111,16 +95,7 @@ def compute_metrics(
             if not any(error in str(e) for error in oom_error):
                 raise InvalidBenchmark(str(e))
 
-            if cfg.compute_kwargs.get("
-                batch_size = cfg.compute_kwargs["batch_size"]
-                cfg.compute_kwargs["batch_size"] = batch_size // 2
-                logger.debug(
-                    "Out of memory error occurred during the computation of "
-                    f"the metric {cfg.pretty_name}. Reducing the batch size to "
-                    f"{cfg.compute_kwargs['batch_size']}."
-                )
-            elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
-                cfg.compute_kwargs["batch_size"] = 32
+            if cfg.compute_kwargs.get("device", "cpu") != "cpu":
                 cfg.compute_kwargs["device"] = "cpu"
                 logger.debug(
                     "Out of memory error occurred during the computation of "
@@ -129,6 +104,14 @@ def compute_metrics(
                 )
             else:
                 raise InvalidBenchmark(str(e))
+        finally:
+            for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
+                if hasattr(metric, attribute):
+                    logger.debug(
+                        f"Deleting the {attribute!r} attribute of the metric "
+                        f"{cfg.pretty_name} to free up memory."
+                    )
+                    delattr(metric, attribute)
 
     # The metric returns None if we are running on multi-GPU and the current
     # process is not the main process
euroeval/tasks.py
CHANGED

@@ -142,7 +142,7 @@ SUMM = Task(
         huggingface_id="bertscore",
         results_key="f1",
         compute_kwargs=dict(
-            model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=
+            model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
         ),
     ),
     MetricConfig(
euroeval/tokenization_utils.py
CHANGED

@@ -7,6 +7,7 @@ import typing as t
 import torch
 
 from .constants import TASK_GROUPS_USING_LOGPROBS
+from .enums import GenerativeType
 from .exceptions import InvalidModel
 from .utils import log_once
 
@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
-    from .data_models import DatasetConfig
+    from .data_models import DatasetConfig, ModelConfig
 
 
 logger = logging.getLogger("euroeval")
@@ -254,35 +255,50 @@ def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | N
 
 
 def get_first_label_token_mapping(
-    dataset_config: "DatasetConfig",
+    dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
+    tokenizer: "PreTrainedTokenizer | None",
+    generative_type: "GenerativeType | None",
 ) -> dict[str, str] | bool:
     """Check if the model should output scores.
 
     Args:
         dataset_config:
             The dataset configuration.
+        model_config:
+            The model configuration.
         tokenizer:
             The tokenizer, or None if not available.
+        generative_type:
+            The generative type, or None if not available.
 
     Returns:
         A mapping from labels to the first token in each label, or alternatively a
         Boolean value indicating whether the model should output scores (if the mapping
         is outputted then the model will always output scores).
     """
+    if generative_type == GenerativeType.REASONING:
+        log_once(
+            f"The model {model_config.model_id!r} is a reasoning model and "
+            "thus does not support logprobs, so we do not enable it.",
+            level=logging.DEBUG,
+        )
+        return False
+
     # If we do not have any tokenizer, then we cannot check if the model should output
     # scores and we just assume it should if the dataset supports it
     output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
     if tokenizer is None:
         if output_scores:
             log_once(
-                "The model will output scores, since the
-                "tokenizer is available.",
+                f"The model {model_config.model_id!r} will output scores, since the "
+                "dataset supports it and no tokenizer is available.",
                 level=logging.DEBUG,
             )
         else:
             log_once(
-                "The model will not output scores, since
-                "it and no tokenizer is available.",
+                f"The model {model_config.model_id!r} will not output scores, since "
+                "the dataset does not support it and no tokenizer is available.",
                 level=logging.DEBUG,
            )
     return output_scores

{euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.7.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -32,7 +32,7 @@ Requires-Python: <4.0,>=3.10
 Requires-Dist: accelerate>=0.34.2
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
-Requires-Dist: datasets>=
+Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
@@ -239,6 +239,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 
+
+### Contribute to EuroEval
+
+We welcome contributions to EuroEval! Whether you're fixing bugs, adding features, or
+contributing new datasets, your help makes this project better for everyone.
+
+- **General contributions**: Check out our [contribution guidelines](CONTRIBUTING.md)
+  for information on how to get started.
+- **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
+  a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
+
+
 ### Special Thanks
 - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
   [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).

{euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/RECORD
CHANGED

@@ -1,11 +1,11 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
 euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=gOLNpW11cBX_8AvotnlGNbejtOM4acmXS3aovNREqhA,48434
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=
-euroeval/data_loading.py,sha256=
-euroeval/data_models.py,sha256=
+euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
+euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
+euroeval/data_models.py,sha256=Nlb2s26u5OvQ2AITAt25NMpeI1IHM2_qqbpyU_bZhiY,22907
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
 euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
@@ -17,21 +17,22 @@ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
-euroeval/tasks.py,sha256=
-euroeval/tokenization_utils.py,sha256=
+euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
+euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
 euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=9Fhh7Zyn6F4JBlRoQkST1wIeb8z0YliRRrcmD5pONs4,52551
+euroeval/benchmark_modules/vllm.py,sha256=vwAE7SGRhePqkzAt1S-FKPelEqe8VMGwah9Nj2J1hLs,51295
 euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
 euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
+euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730
 euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
 euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
@@ -40,20 +41,20 @@ euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada
 euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
 euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=
-euroeval/prompt_templates/multiple_choice.py,sha256=
-euroeval/prompt_templates/named_entity_recognition.py,sha256=
-euroeval/prompt_templates/reading_comprehension.py,sha256=
-euroeval/prompt_templates/sentiment_classification.py,sha256=
-euroeval/prompt_templates/summarization.py,sha256=
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
+euroeval/prompt_templates/multiple_choice.py,sha256=6iEqiPpT-3WJN_gsyhyapnwsrcsYGdVkSkzwn-VKKxw,5101
+euroeval/prompt_templates/named_entity_recognition.py,sha256=Xd6gBJD2e1l8-We2Ujor7crRUBcbgnNeeVknBIrTMJo,12737
+euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLOS7ZrDUeasuLFt-dtqCnYk,6585
+euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
+euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
 euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
-euroeval/task_group_utils/sequence_classification.py,sha256=
-euroeval/task_group_utils/text_to_text.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
+euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.7.0.dist-info/METADATA,sha256=8oMsbhHWeO7j4KQdn4lpt-O94Nw0erwRoD_Ogk6CX2U,13669
+euroeval-15.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.7.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.7.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.7.0.dist-info/RECORD,,

{euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/WHEEL
File without changes

{euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/entry_points.txt
File without changes

{euroeval-15.6.1.dist-info → euroeval-15.7.0.dist-info}/licenses/LICENSE
File without changes