EuroEval 15.7.1__py3-none-any.whl → 15.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_config_factory.py +1 -1
- euroeval/benchmark_modules/litellm.py +341 -150
- euroeval/benchmark_modules/vllm.py +1 -1
- euroeval/benchmarker.py +24 -12
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +1 -1
- euroeval/dataset_configs/finnish.py +11 -1
- euroeval/dataset_configs/italian.py +11 -1
- euroeval/dataset_configs/spanish.py +11 -1
- euroeval/finetuning.py +29 -31
- euroeval/languages.py +1 -1
- euroeval/task_group_utils/sequence_classification.py +46 -11
- euroeval/tokenization_utils.py +52 -16
- euroeval/utils.py +41 -0
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/METADATA +1 -1
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/RECORD +19 -19
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/WHEEL +0 -0
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_config_factory.py
CHANGED

@@ -238,7 +238,7 @@ def prepare_languages(
            The default language codes of the languages to include.

    Returns:
-        The prepared
+        The prepared dataset languages.
    """
    # Create a dictionary that maps languages to their associated language objects
    language_mapping = get_all_languages()
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -32,6 +32,7 @@ from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
+from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 from transformers.trainer import Trainer

@@ -66,7 +67,12 @@ from ..task_group_utils import (
 )
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
-from ..utils import
+from ..utils import (
+    catch_coroutine_exception,
+    create_model_cache_dir,
+    log_once,
+    safe_run,
+)
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer

@@ -159,9 +165,21 @@ class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""

     fresh_model = False
-    batching_preference = BatchingPreference.
+    batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False

+    _handleable_exceptions = (
+        BadRequestError,
+        RateLimitError,
+        APIError,
+        APIConnectionError,
+        Timeout,
+        ServiceUnavailableError,
+        InternalServerError,
+        SystemError,
+        AuthenticationError,
+    )
+
     def __init__(
         self,
         model_config: ModelConfig,
@@ -233,10 +251,7 @@ class LiteLLMModel(BenchmarkModule):
             The generated model outputs.
         """
         assert "messages" in inputs, "The input must contain a 'messages' key."
-
-            "API models only support single-sample batching."
-        )
-        messages = inputs["messages"][0]
+        messages = inputs["messages"]

         generation_kwargs: dict[str, t.Any] = dict(
             model=self.model_config.model_id,
@@ -267,9 +282,20 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs["top_logprobs"] = MAX_LOGPROBS

         if self.dataset_config.task in TASKS_USING_JSON:
-
-
-
+            for msg_list in messages:
+                # msg_list is a list of {'role':…, 'content':…} dicts
+                if not msg_list:
+                    raise InvalidBenchmark(
+                        "Encountered an empty message list in 'messages'."
+                    )
+                last = msg_list[-1]
+                assert isinstance(last, dict), (
+                    f"Expected dict message, got {type(last)}"
+                )
+                assert "json" in last["content"].lower(), (
+                    "Prompt must contain 'json' for JSON tasks."
+                )
+
         if self.generative_type == GenerativeType.REASONING:
             log_once(
                 f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -321,6 +347,76 @@ class LiteLLMModel(BenchmarkModule):
         # This drops generation kwargs that are not supported by the model
         litellm.drop_params = True

+        # Extract the generated sequences from the model response. Some APIs cannot
+        # handle using newlines as stop sequences, so we try both.
+        num_attempts = 10
+
+        all_responses = {}
+        all_failures = []
+        to_run = list(enumerate(messages))
+
+        for attempt in range(num_attempts):
+            if not to_run:
+                break
+
+            batch_indices, batch_msgs = zip(*to_run)
+            model_response, failures = safe_run(
+                self._generate_async(
+                    messages=list(batch_msgs),
+                    generation_kwargs=generation_kwargs,
+                    max_retries=3,
+                    max_reruns=15,
+                )
+            )
+
+            for orig_idx, response in zip(batch_indices, model_response):
+                all_responses[orig_idx] = response
+
+            if not failures:
+                to_run = []
+                break
+
+            all_failures.extend(failures)
+            to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
+            logger.debug(
+                f"Attempt {attempt + 1}/{num_attempts}: "
+                f"retrying {len(to_run)} failed message(s)"
+            )
+
+            for _, error in failures:
+                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+        else:
+            raise InvalidBenchmark(
+                message=f"Failed to generate text, after {num_attempts} attempts."
+            )
+
+        if to_run:
+            raise InvalidBenchmark(
+                f"Failed to generate text after {num_attempts} attempts. "
+                f"Errors: {all_failures}"
+            )
+
+        ordered_responses = [all_responses[i] for i in range(len(messages))]
+        model_output = self._create_model_output(
+            model_responses=ordered_responses, model_id=self.model_config.model_id
+        )
+
+        return model_output
+
+    def _handle_exception(
+        self, error: Exception, generation_kwargs: dict[str, t.Any]
+    ) -> None:
+        """Handle an exception from the model.
+
+        Args:
+            error:
+                The exception to handle.
+            generation_kwargs:
+                The generation kwargs to pass to the model.
+        """
+        error_msg = str(error).lower()
+        model_id = self.model_config.model_id
+
         # Error messages that we want to catch and handle
         stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
         logprobs_messages = [
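For orientation, the following is a minimal, self-contained sketch of the retry pattern the new generation path uses: gather all requests concurrently, keep the successes, and resubmit only the failed indices on the next attempt. The `flaky_request` coroutine and the attempt counts are hypothetical stand-ins, not part of EuroEval.

```python
import asyncio
import random


async def flaky_request(idx: int) -> str:
    """Hypothetical stand-in for a single litellm.acompletion call."""
    if random.random() < 0.3:
        raise RuntimeError(f"transient failure for prompt {idx}")
    return f"answer to prompt {idx}"


async def run_batch(indices: list[int]) -> dict[int, object]:
    # return_exceptions=True plays the role of catch_coroutine_exception:
    # failures come back as values instead of propagating out of gather.
    results = await asyncio.gather(
        *(flaky_request(i) for i in indices), return_exceptions=True
    )
    return dict(zip(indices, results))


def generate_with_retries(num_prompts: int, num_attempts: int = 5) -> dict[int, str]:
    answers: dict[int, str] = {}
    to_run = list(range(num_prompts))
    for _ in range(num_attempts):
        if not to_run:
            break
        outcome = asyncio.run(run_batch(to_run))
        # Keep successes, queue only the failed indices for the next attempt.
        answers.update(
            {i: res for i, res in outcome.items() if not isinstance(res, Exception)}
        )
        to_run = [i for i, res in outcome.items() if isinstance(res, Exception)]
    if to_run:
        raise RuntimeError(f"prompts {to_run} still failing after {num_attempts} attempts")
    return answers


if __name__ == "__main__":
    print(generate_with_retries(num_prompts=8))
```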
@@ -341,153 +437,238 @@ class LiteLLMModel(BenchmarkModule):
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]

[removed lines 344-405 of the old file: content not captured in the extracted diff]
-            )
-            generation_kwargs["response_format"] = pydantic_class
-        elif any(
-            msg.lower() in str(e).lower() for msg in no_json_schema_messages
-        ):
-            log_once(
-                f"The model {self.model_config.model_id!r} does not support "
-                "JSON schemas, so using the vanilla JSON format.",
-                level=logging.DEBUG,
-            )
-            generation_kwargs["response_format"] = dict(type="json_object")
-        elif isinstance(e, RateLimitError):
-            raise InvalidModel(
-                "You have encountered your rate limit for model "
-                f"{self.model_config.model_id!r}. Skipping."
-            )
-        else:
-            raise InvalidBenchmark(
-                f"Failed to generate text. The error message was: {e}"
-            )
-    except APIError as e:
-        raise InvalidBenchmark(
-            f"Failed to generate text. The error message was: {e}"
-        )
-    except (
+        if any(msg.lower() in error_msg for msg in stop_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "stop sequences, so disabling them.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["stop"] = None
+            return
+        elif (
+            any(msg.lower() in error_msg for msg in logprobs_messages)
+            # Special case for Vertex AI models, since they have strict rate
+            # limits on using logprobs. They also have a cap of 5 logprobs, but
+            # we ignore this since the rate limiting makes it unusable anyway.
+            or (isinstance(error, VertexAIError) and "logprobs" in error_msg)
+        ):
+            log_once(
+                f"The model {model_id!r} does not support logprobs, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("logprobs")
+            generation_kwargs.pop("top_logprobs")
+            return
+        elif any(msg.lower() in error_msg for msg in temperature_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "temperature, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("temperature")
+            return
+        elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
+            log_once(
+                f"The model {model_id!r} requires "
+                "temperature to be set to 1, so setting it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["temperature"] = 1.0
+            return
+        elif any(msg.lower() in error_msg for msg in max_items_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "maxItems in the JSON schema, so disabling it.",
+                level=logging.DEBUG,
+            )
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types = {
+                tag_name: (list[str], ...) for tag_name in ner_tag_names
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
+            return
+        elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
+            log_once(
+                f"The model {self.model_config.model_id!r} does not support "
+                "JSON schemas, so using the vanilla JSON format.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["response_format"] = dict(type="json_object")
+            return
+        elif isinstance(
+            error,
+            (
                 APIConnectionError,
                 Timeout,
                 ServiceUnavailableError,
                 InternalServerError,
                 SystemError,
-    )
-
-
-
+            ),
+        ):
+            logger.debug(
+                f"Service temporarily unavailable. The error message was: {error}. "
+                f"Retrying in 5 seconds..."
+            )
+            sleep(5)
+            return
+
+        if isinstance(error, RateLimitError):
+            raise InvalidModel(
+                f"You have encountered your rate limit for model {model_id!r}. "
+                "Skipping."
+            )
+
+        if isinstance(error, AuthenticationError):
+            raise NeedsAdditionalArgument(
+                cli_argument="--api-key",
+                script_argument="api_key=<your-api-key>",
+                run_with_cli=self.benchmark_config.run_with_cli,
+            )
+
+        raise InvalidBenchmark(
+            f"Failed to generate text. The error message was: {error}"
+        )
+
+    async def _generate_async(
+        self,
+        messages: list[dict[str, t.Any]],
+        generation_kwargs: dict[str, t.Any],
+        max_retries: int,
+        max_reruns: int,
+    ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
+        """Generate outputs from the model asynchronously.
+
+        Args:
+            messages:
+                The messages to pass to the model.
+            generation_kwargs:
+                The generation kwargs to pass to the model.
+            max_retries:
+                The maximum number of retries to make.
+            max_reruns:
+                The maximum number of reruns to make.
+
+        Returns:
+            A tuple containing the successful responses and the failed responses.
+        """
+        success = []
+        all_failures = {}
+        to_run = list(enumerate(messages))
+        prev_fail_count = len(to_run)
+        rerun_count = 0
+
+        while to_run and rerun_count < max_reruns and prev_fail_count > 0:
+            requests = [
+                litellm.acompletion(
+                    messages=msg, max_retries=max_retries, **generation_kwargs
                 )
[removed lines 441-446 of the old file: content not captured in the extracted diff]
+                for _, msg in to_run
+            ]
+            wrapped_requests = [
+                catch_coroutine_exception(request) for request in requests
+            ]
+            responses = await tqdm_async.gather(*wrapped_requests, leave=False)
+
+            next_to_run = []
+            current_fail_count = 0
+
+            for (orig_idx, _), response in zip(to_run, responses):
+                if isinstance(response, Exception):
+                    current_fail_count += 1
+                    all_failures[orig_idx] = response
+                    next_to_run.append((orig_idx, messages[orig_idx]))
+                else:
+                    success.append(response)
+
+            if current_fail_count >= prev_fail_count:
+                logger.warning(
+                    "Retry loop aborting due to no progress: "
+                    f"current_fail_count={current_fail_count}, "
+                    f"prev_fail_count={prev_fail_count}"
                 )
[removed lines 448-451 of the old file: content not captured in the extracted diff]
+                break
+
+            prev_fail_count = current_fail_count
+            to_run = next_to_run
+            rerun_count += 1
+
+        failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
+        return success, failures

[removed lines 453-456 of the old file: content not captured in the extracted diff]
+    @staticmethod
+    def _create_model_output(
+        model_responses: list[ModelResponse], model_id: str
+    ) -> GenerativeModelOutput:
+        """Create a GenerativeModelOutput object from a list of ModelResponse objects.
+
+        Args:
+            model_responses:
+                The list of ModelResponse objects to create the GenerativeModelOutput
+                object from.
+            model_id:
+                The ID of the model.
+
+        Returns:
+            A GenerativeModelOutput object.
+        """
+        sequences = []
+        scores = []
+        for model_response in model_responses:
+            if not model_response.choices:
+                # This happens for reasoning models, when they don't finish thinking
+                # and run out of tokens. Happens quite rarely, but we need to handle it.
+                logger.warning(
+                    f"The model {model_id!r} did not end up "
+                    "generating any text. This is likely because the model ran "
+                    "out of tokens while reasoning. Returning an empty string."
+                )
+                continue
+
+            model_response_choices = model_response.choices[0]
+            assert isinstance(model_response_choices, litellm.Choices)
+            generated_message: litellm.Message = model_response_choices.message
+            generation_output = generated_message.content or ""
+            generation_output = generation_output.strip()
+
+            # Structure the model output as a GenerativeModelOutput object
+            sequences.append(generation_output)
+            if hasattr(model_response_choices, "logprobs"):
+                logprobs_obj = model_response_choices.logprobs
+                if isinstance(logprobs_obj, ChoiceLogprobs):
+                    logprobs_list: list[list[tuple[str, float]]] = [
+                        [
+                            (top_logprob.token, top_logprob.logprob)
+                            for top_logprob in content.top_logprobs
+                        ]
+                        for content in model_response_choices.logprobs.content or list()
+                    ]
+                    scores.append(logprobs_list)
+                else:
+                    log_once(
+                        "The logprobs object is malformed, so we won't use logprobs to "
+                        "determine the labels.",
+                        level=logging.WARNING,
+                    )
+
+        if not sequences:
             logger.warning(
-
-                "
-                "
+                "No sequences were generated by the model "
+                f"{model_id!r}. This may be due to the "
+                "model running out of tokens or an issue with the input data. "
+                "Returning an empty GenerativeModelOutput."
             )
-            return GenerativeModelOutput(sequences=[
-
-        model_response_choices = model_response.choices[0]
-        assert isinstance(model_response_choices, litellm.Choices)
-        generated_message: litellm.Message = model_response_choices.message
-        generation_output = generated_message.content or ""
-        generation_output = generation_output.strip()
-
-        # Structure the model output as a GenerativeModelOutput object
-        model_output = GenerativeModelOutput(sequences=[generation_output])
-        if hasattr(model_response_choices, "logprobs"):
-            logprobs_obj = model_response_choices.logprobs
-            if isinstance(logprobs_obj, ChoiceLogprobs):
-                logprobs_list: list[list[tuple[str, float]]] = [
-                    [
-                        (top_logprob.token, top_logprob.logprob)
-                        for top_logprob in content.top_logprobs
-                    ]
-                    for content in model_response_choices.logprobs.content or list()
-                ]
-                model_output.scores = [logprobs_list]
-            else:
-                log_once(
-                    "The logprobs object is malformed, so we won't use logprobs to "
-                    "determine the labels.",
-                    level=logging.WARNING,
-                )
+            return GenerativeModelOutput(sequences=[], scores=None)

-
+        if scores and len(sequences) != len(scores):
+            raise InvalidBenchmark(
+                "Sequences and scores must have the same length. "
+                f"Got {len(sequences)} sequences and {len(scores)} scores."
+            )
+
+        return GenerativeModelOutput(
+            sequences=sequences, scores=scores if scores else None
+        )

     @cached_property
     def num_params(self) -> int:
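As a reading aid for the `scores` handling above: each element of `scores` corresponds to one generated sequence and holds, for every generated token position, the list of (token, logprob) candidates. A small hedged illustration of that nesting, with made-up values:

```python
# Hypothetical illustration of the nesting used for `scores` above:
# scores[sequence_index][token_position] == [(candidate_token, logprob), ...]
scores: list[list[list[tuple[str, float]]]] = [
    [  # logprobs for the first (and only) generated sequence
        [("pos", -0.11), ("neg", -2.30)],  # candidates at token position 0
        [("itive", -0.05), ("itively", -3.10)],  # candidates at token position 1
    ]
]
sequences = ["positive"]
assert len(sequences) == len(scores)  # the invariant checked before returning
```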
@@ -1007,6 +1188,10 @@ def try_download_ollama_model(model_id: str) -> bool:

    Returns:
        Whether the model was downloaded successfully.
+
+    Raises:
+        InvalidModel:
+            If Ollama is not running or the model cannot be downloaded.
    """
    if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
        return False
@@ -1021,11 +1206,17 @@ def try_download_ollama_model(model_id: str) -> bool:
            level=logging.WARNING,
        )

-
-
-
-
-
+    try:
+        downloaded_ollama_models: list[str] = [
+            model_obj.model
+            for model_obj in ollama.list().models
+            if model_obj.model is not None
+        ]
+    except ConnectionError:
+        raise InvalidModel(
+            "Ollama does not seem to be running, so we cannot evaluate the model "
+            f"{model_id!r}. Please make sure that Ollama is running and try again."
+        )

    ollama_model_id = "/".join(model_id.split("/")[1:])
    if ollama_model_id not in downloaded_ollama_models:
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -797,7 +797,7 @@ def load_model_and_tokenizer(
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
        )
-    except (ValueError, OSError) as e:
+    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
            raise InvalidModel(
                f"The model {model_id!r} is awaiting a review from the repository "
euroeval/benchmarker.py
CHANGED

@@ -11,6 +11,7 @@ from pathlib import Path
 from shutil import rmtree
 from time import sleep

+from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group

 from .benchmark_config_factory import build_benchmark_config

@@ -27,7 +28,7 @@ from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
 from .tasks import SPEED
-from .utils import enforce_reproducibility
+from .utils import enforce_reproducibility, get_package_version

 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -164,6 +165,15 @@ class Benchmarker:
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

+        # Bail early if hf_transfer is enabled but not installed.
+        if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
+            raise ImportError(
+                "Fast download using 'hf_transfer' is enabled "
+                "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
+                "package is not available in your environment. "
+                "Try installing it with `pip install hf_transfer`."
+            )
+
        self.benchmark_config_default_params = BenchmarkConfigParams(
            progress_bar=progress_bar,
            save_results=save_results,
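The same fail-fast idea can be expressed generically: if an environment flag promises an optional acceleration package, check for it up front instead of failing mid-download. A hedged sketch follows; the flag and package names mirror the diff, while `package_version` and `check_hf_transfer` are simplified, hypothetical stand-ins for EuroEval's own helpers.

```python
import importlib.metadata
import os


def package_version(name: str) -> str | None:
    """Simplified stand-in for euroeval.utils.get_package_version."""
    try:
        return importlib.metadata.version(name)
    except importlib.metadata.PackageNotFoundError:
        return None


def check_hf_transfer() -> None:
    # HF_HUB_ENABLE_HF_TRANSFER=1 tells huggingface_hub to use the hf_transfer
    # downloader; without the package installed, downloads would fail later.
    if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER") == "1" and package_version("hf_transfer") is None:
        raise ImportError(
            "HF_HUB_ENABLE_HF_TRANSFER is set but 'hf_transfer' is not installed; "
            "install it with `pip install hf_transfer` or unset the variable."
        )
```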
@@ -372,15 +382,7 @@ class Benchmarker:

        current_benchmark_results: list[BenchmarkResult] = list()
        for model_id in model_ids:
-
-                model_config = get_model_config(
-                    model_id=model_id, benchmark_config=benchmark_config
-                )
-            except InvalidModel as e:
-                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
-                continue
-
+            model_config: ModelConfig | None = None
            loaded_model: BenchmarkModule | None = None
            for dataset_config in dataset_configs:
                # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +396,22 @@ class Benchmarker:
                ):
                    logger.debug(
                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it "
-                        "
+                        f"{dataset_config.pretty_name}, as it has already been "
+                        "benchmarked."
                    )
                    num_finished_benchmarks += 1
                    continue

+                if model_config is None:
+                    try:
+                        model_config = get_model_config(
+                            model_id=model_id, benchmark_config=benchmark_config
+                        )
+                    except InvalidModel as e:
+                        logger.info(e.message)
+                        num_finished_benchmarks += len(dataset_configs)
+                        continue
+
                # Skip if the model is an encoder model and the task is generative
                task_is_generative = (
                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
euroeval/dataset_configs/__init__.py
CHANGED

@@ -7,6 +7,7 @@ from .danish import * # noqa: F403
 from .dutch import * # noqa: F403
 from .english import * # noqa: F403
 from .faroese import * # noqa: F403
+from .finnish import * # noqa: F403
 from .french import * # noqa: F403
 from .german import * # noqa: F403
 from .icelandic import * # noqa: F403
euroeval/dataset_configs/finnish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import FI
-from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -58,3 +58,13 @@ SCALA_FI_CONFIG = DatasetConfig(
 )

 ### Unofficial datasets ###
+
+BELEBELE_FI_CONFIG = DatasetConfig(
+    name="belebele-fi",
+    pretty_name="the Finnish multiple choice reading comprehension dataset "
+    "BeleBele-fi, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-fi-mini",
+    task=MCRC,
+    languages=[FI],
+    unofficial=True,
+)
euroeval/dataset_configs/italian.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import IT
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -79,3 +79,13 @@ WIKINEURAL_IT_CONFIG = DatasetConfig(
     languages=[IT],
     unofficial=True,
 )
+
+BELEBELE_IT_CONFIG = DatasetConfig(
+    name="belebele-it",
+    pretty_name="the Italian multiple choice reading comprehension dataset "
+    "BeleBele-it, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-it-mini",
+    task=MCRC,
+    languages=[IT],
+    unofficial=True,
+)
euroeval/dataset_configs/spanish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import ES
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -76,3 +76,13 @@ XQUAD_ES_CONFIG = DatasetConfig(
     languages=[ES],
     unofficial=True,
 )
+
+BELEBELE_ES_CONFIG = DatasetConfig(
+    name="belebele-es",
+    pretty_name="the Spanish multiple choice reading comprehension dataset "
+    "BeleBele-es, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-es-mini",
+    task=MCRC,
+    languages=[ES],
+    unofficial=True,
+)
euroeval/finetuning.py
CHANGED

@@ -103,7 +103,6 @@ def finetune(
        itr_scores = finetune_single_iteration(
            model=model if model_already_initialized else None,
            dataset=datasets[idx],
-            iteration_idx=idx,
            training_args=training_args,
            model_config=model_config,
            dataset_config=dataset_config,
@@ -158,7 +157,6 @@ def finetune(
 def finetune_single_iteration(
     model: BenchmarkModule | None,
     dataset: DatasetDict,
-    iteration_idx: int,
     training_args: TrainingArguments,
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
@@ -171,8 +169,6 @@ def finetune_single_iteration(
            The model to use in the benchmark. If None then a new model will be loaded.
        dataset:
            The dataset to use for training and evaluation.
-        iteration_idx:
-            The index of the iteration.
        training_args:
            The training arguments.
        model_config:
@@ -213,41 +209,42 @@ def finetune_single_iteration(

    trainer.log = no_logging

-    # Re-block terminal output, as it gets unblocked by the `transformers`
-    #
+    # Re-block terminal output, as it gets unblocked by the `transformers` package
+    # before training
    block_terminal_output()

-    # Sort out callbacks. We remove the callbacks that are producing unnecessary
-    #
+    # Sort out callbacks. We remove the callbacks that are producing unnecessary output,
+    # to avoid cluttering the terminal output
    if not benchmark_config.verbose:
        trainer.remove_callback(PrinterCallback)
        trainer.remove_callback(ProgressCallback)
    if benchmark_config.progress_bar:
        trainer.add_callback(NeverLeaveProgressCallback)

[removed lines 228-250 of the old file: content not captured in the extracted diff]
+    # Train the model
+    trainer.train()
+
+    # Evaluate the model
+    with torch.inference_mode():
+        try:
+            test_scores = trainer.evaluate(
+                eval_dataset=dataset["test"],
+                orig_eval_dataset=dataset["original_test"],
+                metric_key_prefix="test",
+            )
+        except TypeError:
+            test_scores = trainer.evaluate(
+                eval_dataset=dataset["test"], metric_key_prefix="test"
+            )
+        except NaNValueInModelOutput as e:
+            del trainer
+            del model
+            clear_memory()
+            raise e
+        except (RuntimeError, ValueError, IndexError) as e:
+            raise InvalidBenchmark(str(e))
+
+    return test_scores


 def get_training_args(
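The try/except TypeError pattern above is a general way to pass an optional keyword argument to an API that may or may not accept it. A minimal sketch of the idea, using a hypothetical evaluate function in place of the Trainer:

```python
from typing import Any


def evaluate(eval_dataset: list[int], metric_key_prefix: str = "test", **kwargs: Any) -> dict[str, float]:
    """Hypothetical stand-in for trainer.evaluate; it may or may not accept extra kwargs."""
    if kwargs:
        # Simulate an evaluate() implementation that rejects unknown keyword arguments.
        raise TypeError(f"unexpected keyword arguments: {sorted(kwargs)}")
    return {f"{metric_key_prefix}_accuracy": 1.0}


def evaluate_with_optional_kwarg(dataset: list[int], original_dataset: list[int]) -> dict[str, float]:
    try:
        # Preferred call: some custom trainers also accept the original, untokenised split.
        return evaluate(eval_dataset=dataset, orig_eval_dataset=original_dataset, metric_key_prefix="test")
    except TypeError:
        # A plain evaluate() does not know orig_eval_dataset, so retry without it.
        return evaluate(eval_dataset=dataset, metric_key_prefix="test")


print(evaluate_with_optional_kwarg([1, 2, 3], [4, 5, 6]))
```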
@@ -300,6 +297,7 @@ def get_training_args(
        save_total_limit=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
+        eval_accumulation_steps=32,
        optim=OptimizerNames.ADAMW_TORCH,
        learning_rate=2e-5,
        warmup_ratio=0.01,
euroeval/languages.py
CHANGED

@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
 DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
 NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
 EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
 FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
 FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
 DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
 ET = Language(code="et", name="Estonian")
 EE = Language(code="ee", name="Ewe")
 FJ = Language(code="fj", name="Fijian")
-FI = Language(code="fi", name="Finnish")
 FY = Language(code="fy", name="Western Frisian")
 FF = Language(code="ff", name="Fulah")
 GD = Language(code="gd", name="Gaelic")
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -132,6 +132,11 @@ def extract_labels_from_generation(
        The predicted labels.
    """
    if model_output.scores is not None:
+        if first_label_token_mapping is False:
+            raise InvalidBenchmark(
+                "The model outputted logprobs, but the first label token mapping is "
+                "not provided. This means that the model should not output logprobs."
+            )
        labels = get_closest_logprobs_labels(
            generation_logprobs=model_output.scores,
            dataset_config=dataset_config,
@@ -147,7 +152,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
-    first_label_token_mapping: dict[str, str] |
+    first_label_token_mapping: dict[str, str] | t.Literal[True],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.

@@ -164,8 +169,7 @@ def get_closest_logprobs_labels(
            The configuration of the dataset.
        first_label_token_mapping:
            A mapping from labels to the first token in each label, or alternatively a
-
-            mapping is outputted then the model will always output scores).
+            `True` value indicating that the model should output logprobs.

    Returns:
        The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@ def get_closest_logprobs_labels(
        # label, as the output label
        output_label: str | None = None
        for generated_label in generated_labels:
-            # Get the candidate labels
+            # Get the candidate labels. If we have a first label token mapping, we
+            # use it to get the candidate labels. Otherwise, we check if any of the
+            # labels start with the generated label.
            if isinstance(first_label_token_mapping, dict):
                if any(
                    candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@ def get_closest_logprobs_labels(
                    )
                    return None

-            # If no candidate label is found, we
-            #
-            #
+            # If no candidate label is found, we first check if any of the labels
+            # start with the generated label. This could be the case if the labels
+            # in the first token mapping is inaccurate or incomplete, for instance
+            # if 'pos' is in the first label token mapping, but the model outputted
+            # 'posit'. If this is the case then we cannot trust the first label
+            # token mapping, and we fall back to using word edit distance.
+            # Otherwise, the generated label is just bad, and we skip to the next
+            # generated label.
            elif len(candidate_output_labels) == 0:
-
-
-
-
+                candidate_output_labels_starting_with_generated_label = [
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                ]
+                if candidate_output_labels_starting_with_generated_label:
+                    log_once(
+                        f"No candidate label found for the generated label "
+                        f"{generated_label!r}. This means that using logprobs to "
+                        "extract the labels is not reliable, and we will instead "
+                        "fall back to extracting the labels using word edit "
+                        "distance.",
+                        level=logging.DEBUG,
+                    )
+                    return None
+
+        # If we did not find any candidate label for any of the generated labels, we
+        # assume that something is wrong with the model output, and we fall back to
+        # using word edit distance to extract the labels
+        else:
+            log_once(
+                f"No candidate label found for any of the generated labels "
+                f"{generated_labels}. This means that using logprobs to extract "
+                "the labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
+                level=logging.DEBUG,
+            )
+            return None

        if output_label is not None:
            output_labels.append(output_label)
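To make the fallback above concrete: label selection via logprobs only works when the generated first token can be traced back to exactly one candidate label. When it cannot (for instance the model emits 'posit' while the mapping only knows 'pos'), the safe move is to return None and let word-edit-distance matching take over. A hedged, self-contained sketch of that decision; `pick_label_from_logprobs` is a hypothetical helper, not EuroEval's actual function:

```python
def pick_label_from_logprobs(
    top_tokens: list[str],
    first_label_token_mapping: dict[str, str],
    candidate_labels: list[str],
) -> str | None:
    """Return the matched label, or None to signal 'fall back to edit distance'."""
    for generated in top_tokens:
        # Exact hit on the first-token mapping: trust the logprobs.
        matches = [
            label
            for label, first_token in first_label_token_mapping.items()
            if generated == first_token and label in candidate_labels
        ]
        if len(matches) == 1:
            return matches[0]
        # The generated token extends a known label ('posit' vs 'pos'): the
        # mapping is too coarse to be trusted, so give up on logprobs.
        if any(label.startswith(generated) for label in candidate_labels):
            return None
    return None


print(pick_label_from_logprobs(["pos"], {"positive": "pos", "negative": "neg"}, ["positive", "negative"]))
print(pick_label_from_logprobs(["posit"], {"positive": "pos", "negative": "neg"}, ["positive", "negative"]))
```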
euroeval/tokenization_utils.py
CHANGED

@@ -169,7 +169,7 @@ def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:

    vocab: dict[str, int] = tokenizer.get_vocab()

-    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
+    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
    for candidate_bos_token in candidate_bos_tokens:
        if candidate_bos_token in vocab:
            bos_token = candidate_bos_token
@@ -200,7 +200,7 @@ def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:

    vocab: dict[str, int] = tokenizer.get_vocab()

-    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
+    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
    for candidate_eos_token in candidate_eos_tokens:
        if candidate_eos_token in vocab:
            eos_token = candidate_eos_token
@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
        for label in dataset_config.labels
    ]

-    #
-
-
+    # Tokenize some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: list[list[str]]
+    if tokenizer.chat_template is None:
+        add_prefix_space = should_prefix_space_be_added_to_labels(
            labels_to_be_generated=local_labels, tokenizer=tokenizer
        )
[removed lines 319-329 of the old file: content not captured in the extracted diff]
+        all_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+            for label in local_labels
+        ]
+    else:
+        all_tokens = [
+            tokenizer.convert_ids_to_tokens(
+                ids=tokenizer.apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=True,
+                )
+            )
+            for label in local_labels
+        ]
+
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
    ]

+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok)
+        ]
+        if not matching_tokens:
+            log_once(
+                f"No matching token found in token_list for label '{label}', so "
+                "we will not output scores.",
+                level=logging.DEBUG,
+            )
+            return False
+        first_tokens.append(matching_tokens[0])
+
    # Build a mapping from labels to the first token in each label if the first
    # tokens are distinct
    if len(first_tokens) == len(set(first_tokens)):
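A hedged sketch of the core idea behind get_first_label_token_mapping: tokenize each label, strip non-alphabetic noise, take the first sub-token, and only keep the mapping if those first tokens are pairwise distinct. The toy `tokenize` function below is a stand-in for a real Hugging Face tokenizer, and the return convention (dict, True, or False) is an assumption modelled on the diff.

```python
import re


def tokenize(text: str) -> list[str]:
    """Hypothetical tokenizer standing in for tokenizer.tokenize."""
    return re.findall(r"[A-Za-zæøåüöä]+|[^A-Za-zæøåüöä\s]+", text)


def first_label_token_mapping(labels: list[str]) -> dict[str, str] | bool:
    first_tokens: list[str] = []
    for label in labels:
        # Mirror the cleanup step in the diff: strip leading/trailing non-letters,
        # lowercase, and keep only tokens that are actual prefixes of the label.
        cleaned = [re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", tok.lower()) for tok in tokenize(label)]
        matching = [tok for tok in cleaned if tok and label.startswith(tok)]
        if not matching:
            return False  # cannot trust logprob-based label extraction at all
        first_tokens.append(matching[0])
    if len(first_tokens) == len(set(first_tokens)):
        return dict(zip(labels, first_tokens))
    return True  # assumed convention: request logprobs, but without a usable mapping


print(first_label_token_mapping(["positive", "negative", "neutral"]))
```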
euroeval/utils.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions to be used in other scripts."""

+import asyncio
 import gc
 import importlib
 import importlib.metadata
@@ -327,3 +328,43 @@ def get_package_version(package_name: str) -> str | None:
        return importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return None
+
+
+T = t.TypeVar("T", bound=object)
+
+
+def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
+    """Run a coroutine, ensuring that the event loop is always closed when we're done.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine.
+    """
+    loop = asyncio.new_event_loop()
+    try:
+        asyncio.set_event_loop(loop)
+        return loop.run_until_complete(coroutine)
+    finally:
+        loop.close()
+        asyncio.set_event_loop(None)
+
+
+async def catch_coroutine_exception(
+    coroutine: t.Coroutine[t.Any, t.Any, T],
+) -> T | Exception:
+    """Run a coroutine, catching any exceptions and returning them.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine, or the exception if it was raised.
+    """
+    try:
+        return await coroutine
+    except Exception as exc:
+        return exc
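A small usage sketch of the two new helpers, showing the intended pairing: wrap each coroutine with catch_coroutine_exception so one failure cannot poison the whole gather, then drive the event loop from synchronous code with safe_run. It assumes EuroEval 15.8.0 is installed so the helpers are importable; the `might_fail` worker coroutine is hypothetical.

```python
import asyncio

from euroeval.utils import catch_coroutine_exception, safe_run


async def might_fail(n: int) -> int:
    """Hypothetical worker coroutine."""
    if n % 3 == 0:
        raise ValueError(f"bad input: {n}")
    await asyncio.sleep(0)
    return n * n


async def gather_all(values: list[int]) -> list[object]:
    # Each wrapped coroutine resolves to either a result or the caught exception.
    return await asyncio.gather(*(catch_coroutine_exception(might_fail(v)) for v in values))


results = safe_run(gather_all([1, 2, 3, 4]))
successes = [r for r in results if not isinstance(r, Exception)]
failures = [r for r in results if isinstance(r, Exception)]
print(successes, failures)
```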
{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/benchmark_config_factory.py,sha256=RDYotoLcfNr3xU8Cw-G-Y8wLe6RSlJD1Ok9C97lWfOs,12553
+euroeval/benchmarker.py,sha256=EHoYilZ2Xx0-6_aEBlG84MsZbomJSiHNHc4wKOVVBB8,49199
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985

@@ -8,38 +8,38 @@ euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
 euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
-euroeval/finetuning.py,sha256=
+euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
-euroeval/languages.py,sha256=
+euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
-euroeval/tokenization_utils.py,sha256=
+euroeval/tokenization_utils.py,sha256=kghOIZMM3H0P9YDv0VBSNI7drzgJXlkRtMwt3Cgeev8,13907
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
-euroeval/dataset_configs/__init__.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=dd7OqBvWA75zNrsEHtC3cx3rNpNJ-1QOL2arV_CqYG0,48231
+euroeval/benchmark_modules/vllm.py,sha256=DJyla0jr-DVMPPs4RBguxq1Xn5YguvyuAnIlgIOfFaw,39394
+euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
-euroeval/dataset_configs/english.py,sha256
+euroeval/dataset_configs/english.py,sha256=-N85DiNVrZFqpahNUTfxaWy4vvdOWC8Bi0G4uAO4uDw,2326
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
-euroeval/dataset_configs/finnish.py,sha256=
+euroeval/dataset_configs/finnish.py,sha256=_8YWIlZNpO8Qi233bH7cKwm3tq3WETLfC_6mzg7LLog,2045
 euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
 euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
-euroeval/dataset_configs/italian.py,sha256=
+euroeval/dataset_configs/italian.py,sha256=KNjCvTzsEqH_EEk3At8slKqNwWWiIdbv_t5ke7n9nZI,2660
 euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada4qwE7tw0,5181
-euroeval/dataset_configs/spanish.py,sha256=
+euroeval/dataset_configs/spanish.py,sha256=NviL-FzJ5jq1bLTRvbtZBiGrAmZjxyijZNpKZFrnT-M,2527
 euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677

@@ -51,11 +51,11 @@ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5w
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
 euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
-euroeval/task_group_utils/sequence_classification.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=MCdO5h3v_LWTkrvKAeefPq7rl1H5mFed50nAL4uZq0E,13837
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.8.0.dist-info/METADATA,sha256=-GcGBuEnlAPmpT9ItDAmS0psT__jwbVoNkTYOiSeRzA,13669
+euroeval-15.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.8.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.8.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.8.0.dist-info/RECORD,,

{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/WHEEL: file without changes
{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/entry_points.txt: file without changes
{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/licenses/LICENSE: file without changes