EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +4 -2
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/hf.py +8 -4
- euroeval/benchmark_modules/litellm.py +5 -17
- euroeval/benchmark_modules/vllm.py +98 -30
- euroeval/benchmarker.py +291 -405
- euroeval/cli.py +1 -1
- euroeval/constants.py +3 -0
- euroeval/data_models.py +35 -35
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +0 -2
- euroeval/dataset_configs/dutch.py +0 -2
- euroeval/dataset_configs/english.py +0 -2
- euroeval/dataset_configs/finnish.py +0 -2
- euroeval/dataset_configs/french.py +0 -2
- euroeval/dataset_configs/german.py +0 -2
- euroeval/dataset_configs/italian.py +0 -2
- euroeval/dataset_configs/latvian.py +2 -3
- euroeval/dataset_configs/lithuanian.py +62 -0
- euroeval/dataset_configs/norwegian.py +0 -2
- euroeval/dataset_configs/polish.py +0 -2
- euroeval/dataset_configs/portuguese.py +0 -2
- euroeval/dataset_configs/spanish.py +0 -2
- euroeval/dataset_configs/swedish.py +0 -3
- euroeval/metrics/huggingface.py +1 -1
- euroeval/metrics/pipeline.py +5 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +9 -0
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +10 -0
- euroeval/prompt_templates/sentiment_classification.py +11 -0
- euroeval/tokenisation_utils.py +8 -8
- euroeval/utils.py +10 -5
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
- euroeval-16.3.0.dist-info/RECORD +71 -0
- euroeval-16.2.1.dist-info/RECORD +0 -70
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
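The release also ships a Lithuanian dataset configuration (euroeval/dataset_configs/lithuanian.py) and additions to the shared prompt templates, alongside the benchmarker rework shown below. As a quick orientation, a minimal usage sketch follows; it assumes that `Benchmarker` is importable from the package root as in earlier versions, and the model ID and the "lt" language code are placeholders rather than values taken from this diff.

from euroeval import Benchmarker

# Hypothetical model ID and language code, for illustration only.
benchmarker = Benchmarker(language="lt", progress_bar=True)
results = benchmarker.benchmark(model="example-org/example-model")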
euroeval/benchmarker.py CHANGED

@@ -6,13 +6,13 @@ import logging
 import re
 import sys
 import typing as t
-from copy import deepcopy
 from pathlib import Path
 from shutil import rmtree
 from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
+from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -33,6 +33,7 @@ from .utils import (
     get_package_version,
     internet_connection_available,
     log_once,
+    split_model_id,
 )
 
 if t.TYPE_CHECKING:
@@ -83,7 +84,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
-        gpu_memory_utilization: float = 0.
+        gpu_memory_utilization: float = 0.8,
         generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
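The constructor hunk above only changes the default; callers can still choose their own value. A minimal sketch, assuming `Benchmarker` is importable from the package root (the keyword name comes from the signature shown in this hunk):

from euroeval import Benchmarker

# The new default shown above is 0.8; lower it only if vLLM runs out of
# GPU memory during evaluation.
benchmarker = Benchmarker(gpu_memory_utilization=0.7)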
@@ -200,10 +201,10 @@ class Benchmarker:
         )
 
         self.benchmark_config_default_params = BenchmarkConfigParams(
-            progress_bar=progress_bar,
-            save_results=save_results,
             task=task,
             dataset=dataset,
+            progress_bar=progress_bar,
+            save_results=save_results,
             language=language,
             model_language=model_language,
             dataset_language=dataset_language,
@@ -212,21 +213,21 @@ class Benchmarker:
             raise_errors=raise_errors,
             cache_dir=cache_dir,
             api_key=api_key,
-
-
+            api_base=api_base,
+            api_version=api_version,
             trust_remote_code=trust_remote_code,
             clear_model_cache=clear_model_cache,
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-
-
+            requires_safetensors=requires_safetensors,
+            download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
             generative_type=generative_type,
-
+            verbose=verbose,
+            force=force,
             debug=debug,
             run_with_cli=run_with_cli,
-            requires_safetensors=requires_safetensors,
         )
 
         self.benchmark_config = build_benchmark_config(
@@ -332,8 +333,8 @@ class Benchmarker:
         raise_errors: bool | None = None,
         cache_dir: str | None = None,
         api_key: str | None = None,
-
-
+        api_base: str | None = None,
+        api_version: str | None = None,
         trust_remote_code: bool | None = None,
         clear_model_cache: bool | None = None,
         evaluate_test_split: bool | None = None,
@@ -341,6 +342,11 @@ class Benchmarker:
         num_iterations: int | None = None,
         requires_safetensors: bool | None = None,
         download_only: bool | None = None,
+        gpu_memory_utilization: float | None = None,
+        generative_type: GenerativeType | None = None,
+        force: bool | None = None,
+        verbose: bool | None = None,
+        debug: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.
 
@@ -393,13 +399,13 @@ class Benchmarker:
             api_key:
                 The API key to use for a given inference server. Defaults to the value
                 specified when initialising the benchmarker.
-
-
-
-                the benchmarker.
-
-
+            api_base:
+                The base URL for a given inference API. Only relevant if `model` refers
+                to a model on an inference API. Defaults to the value specified when
+                initialising the benchmarker.
+            api_version:
+                The version of the API to use. Defaults to the value specified when
+                initialising the benchmarker.
             trust_remote_code:
                 Whether to trust remote code when loading models. Defaults to the value
                 specified when initialising the benchmarker.
@@ -424,6 +430,27 @@ class Benchmarker:
             download_only:
                 Whether to only download the models without evaluating them. Defaults
                 to the value specified when initialising the benchmarker.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to the value specified when
+                initialising the benchmarker.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to the value specified when initialising
+                the benchmarker.
+            force:
+                Whether to force evaluations of models, even if they have been
+                benchmarked already. Defaults to the value specified when initialising
+                the benchmarker.
+            verbose:
+                Whether to output additional output. Defaults to the value specified
+                when initialising the benchmarker.
+            debug:
+                Whether to output debug information. Defaults to the value specified
+                when initialising the benchmarker.
 
         Returns:
             A list of benchmark results.
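The docstring above documents the new per-call overrides on `benchmark()`. A short sketch, assuming `benchmarker` is an existing `Benchmarker` instance and using a placeholder model ID:

# Per-call overrides now accepted by `benchmark()`, per the docstring above.
results = benchmarker.benchmark(
    model="example-org/example-model",  # placeholder model ID
    gpu_memory_utilization=0.7,
    force=True,
    verbose=True,
)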
@@ -435,28 +462,141 @@ class Benchmarker:
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Get a new updated benchmark configuration, based on any changes to the
+        # parameters
+        benchmark_config_params = BenchmarkConfigParams(
+            task=(
+                task if task is not None else self.benchmark_config_default_params.task
+            ),
+            dataset=(
+                dataset
+                if dataset is not None
+                else self.benchmark_config_default_params.dataset
+            ),
+            progress_bar=(
+                progress_bar
+                if progress_bar is not None
+                else self.benchmark_config_default_params.progress_bar
+            ),
+            save_results=(
+                save_results
+                if save_results is not None
+                else self.benchmark_config_default_params.save_results
+            ),
+            language=(
+                language
+                if language is not None
+                else self.benchmark_config_default_params.language
+            ),
+            model_language=(
+                model_language
+                if model_language is not None
+                else self.benchmark_config_default_params.model_language
+            ),
+            dataset_language=(
+                dataset_language
+                if dataset_language is not None
+                else self.benchmark_config_default_params.dataset_language
+            ),
+            device=(
+                device
+                if device is not None
+                else self.benchmark_config_default_params.device
+            ),
+            batch_size=(
+                batch_size
+                if batch_size is not None
+                else self.benchmark_config_default_params.batch_size
+            ),
+            raise_errors=(
+                raise_errors
+                if raise_errors is not None
+                else self.benchmark_config_default_params.raise_errors
+            ),
+            cache_dir=(
+                cache_dir
+                if cache_dir is not None
+                else self.benchmark_config_default_params.cache_dir
+            ),
+            api_key=(
+                api_key
+                if api_key is not None
+                else self.benchmark_config_default_params.api_key
+            ),
+            api_base=(
+                api_base
+                if api_base is not None
+                else self.benchmark_config_default_params.api_base
+            ),
+            api_version=(
+                api_version
+                if api_version is not None
+                else self.benchmark_config_default_params.api_version
+            ),
+            trust_remote_code=(
+                trust_remote_code
+                if trust_remote_code is not None
+                else self.benchmark_config_default_params.trust_remote_code
+            ),
+            clear_model_cache=(
+                clear_model_cache
+                if clear_model_cache is not None
+                else self.benchmark_config_default_params.clear_model_cache
+            ),
+            evaluate_test_split=(
+                evaluate_test_split
+                if evaluate_test_split is not None
+                else self.benchmark_config_default_params.evaluate_test_split
+            ),
+            few_shot=(
+                few_shot
+                if few_shot is not None
+                else self.benchmark_config_default_params.few_shot
+            ),
+            num_iterations=(
+                num_iterations
+                if num_iterations is not None
+                else self.benchmark_config_default_params.num_iterations
+            ),
+            requires_safetensors=(
+                requires_safetensors
+                if requires_safetensors is not None
+                else self.benchmark_config_default_params.requires_safetensors
+            ),
+            download_only=(
+                download_only
+                if download_only is not None
+                else self.benchmark_config_default_params.download_only
+            ),
+            gpu_memory_utilization=(
+                gpu_memory_utilization
+                if gpu_memory_utilization is not None
+                else self.benchmark_config_default_params.gpu_memory_utilization
+            ),
+            generative_type=(
+                generative_type
+                if generative_type is not None
+                else self.benchmark_config_default_params.generative_type
+            ),
+            force=(
+                force
+                if force is not None
+                else self.benchmark_config_default_params.force
+            ),
+            verbose=(
+                verbose
+                if verbose is not None
+                else self.benchmark_config_default_params.verbose
+            ),
+            debug=(
+                debug
+                if debug is not None
+                else self.benchmark_config_default_params.debug
+            ),
+            run_with_cli=self.benchmark_config_default_params.run_with_cli,
+        )
+        benchmark_config = build_benchmark_config(
+            benchmark_config_params=benchmark_config_params
         )
 
         adjust_logging_level(verbose=benchmark_config.verbose)
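The block above applies the same idiom to every field: use the per-call argument when it was given, otherwise fall back to the default captured at construction time. Reduced to a toy example (the names are illustrative, not part of the package):

# Toy illustration of the fallback idiom repeated for every field above.
def resolve(override: int | None, default: int) -> int:
    return override if override is not None else default

assert resolve(None, 10) == 10  # no override given, keep the default
assert resolve(3, 10) == 3      # explicit override wins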
@@ -469,46 +609,90 @@ class Benchmarker:
             dataset_names=benchmark_config.datasets
         )
 
-
-
-
-
-
-
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in tqdm(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
             try:
                 model_config = get_model_config(
                     model_id=model_id, benchmark_config=benchmark_config
                 )
+                model_configs.append(model_config)
             except InvalidModel as e:
                 logger.info(e.message)
-
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            logger.info(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets."
+            )
+            return list()
+
+        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                logger.debug(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets."
+                )
                 continue
 
             if model_config.adapter_base_model_id:
                 open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                 )
                 if not internet_connection_available():
                     raise InvalidModel(
                         "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                     )
                 elif benchmark_config.download_only:
                     log_once(
                         "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "
-                        "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                         f"{open_issue_msg}",
                         level=logging.WARNING,
                     )
 
             loaded_model: BenchmarkModule | None = None
             benchmark_params_to_revert: dict[str, t.Any] = dict()
-            for dataset_config in
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
                 for param, value in benchmark_params_to_revert.items():
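The dictionary comprehension above pairs each model with only the datasets it still needs, and the subsequent sum gives the number of pending benchmarks. A toy version with plain strings (illustrative names, not the package's types):

# Toy version of the scheduling mapping built above: keep only the
# (model, dataset) pairs that still need to be run.
models = ["model-a", "model-b"]
datasets = ["dataset-1", "dataset-2"]
already_done = {("model-a", "dataset-1")}

pending = {m: [d for d in datasets if (m, d) not in already_done] for m in models}
total_benchmarks = sum(len(ds) for ds in pending.values())  # 3 runs remaining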
@@ -536,34 +720,6 @@ class Benchmarker:
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False
 
-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
@@ -597,6 +753,22 @@ class Benchmarker:
                 else:
                     loaded_model.dataset_config = dataset_config
 
+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    logger.debug(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                 # Benchmark a single model on a single dataset
                 benchmark_output_or_err = self._benchmark_single(
                     model=loaded_model,
@@ -654,176 +826,6 @@ class Benchmarker:
         destroy_process_group()
         return current_benchmark_results
 
-    def _get_updated_benchmark_config(
-        self,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        task: str | list[str] | None | None = None,
-        dataset: str | list[str] | None | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None | None = None,
-        dataset_language: str | list[str] | None | None = None,
-        device: Device | None | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        api_base: str | None | None = None,
-        api_version: str | None | None = None,
-        debug: bool | None = None,
-        run_with_cli: bool | None = None,
-        requires_safetensors: bool | None = None,
-        download_only: bool | None = None,
-    ) -> "BenchmarkConfig":
-        """Get an updated benchmark configuration.
-
-        Args:
-            progress_bar:
-                Whether progress bars should be shown. If None, then this value will not
-                be updated.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. If None, then this value will not
-                be updated.
-            task:
-                The tasks benchmark the model(s) on. If None, then this value will not
-                be updated.
-            dataset:
-                The datasets to benchmark on. If None, then this value will not be
-                updated.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. If None, then this value will not be updated.
-            model_language:
-                The language codes of the languages to include for models. If None, then
-                this value will not be updated.
-            dataset_language:
-                The language codes of the languages to include for datasets. If None,
-                then this value will not be updated.
-            device:
-                The device to use for benchmarking. If None, then this value will not be
-                updated.
-            batch_size:
-                The batch size to use. If None, then this value will not be updated.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation. If
-                None, then this value will not be updated.
-            cache_dir:
-                Directory to store cached models. If None, then this value will not be
-                updated.
-            api_key:
-                The API key to use for a given inference server. If None, then this
-                value will not be updated.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. If None, then this value will not be updated.
-            verbose:
-                Whether to output additional output. If None, then this value will not
-                be updated.
-            trust_remote_code:
-                Whether to trust remote code when loading models. If None, then this
-                value will not be updated.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. If None,
-                then this value will not be updated.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. If None, then this
-                value will not be updated.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. If None,
-                then this value will not be updated.
-            num_iterations:
-                The number of times each model should be evaluated. If None, then this
-                value will not be updated.
-            api_base:
-                The base URL for a given inference API. If None, then this value will
-                not be updated.
-            api_version:
-                The version of the API to use. If None, then this value will not be
-                updated.
-            debug:
-                Whether to output debug information. If None, then this value will not
-                be updated.
-            run_with_cli:
-                Whether the benchmarker is being run from the command-line interface.
-                If None, then this value will not be updated.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download the models without evaluating them. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download models and datasets without performing any
-                benchmarking. If None, then this value will not be updated.
-
-        Returns:
-            The updated benchmark configuration.
-        """
-        benchmark_config_params = deepcopy(self.benchmark_config_default_params)
-
-        if progress_bar is not None:
-            benchmark_config_params.progress_bar = progress_bar
-        if save_results is not None:
-            benchmark_config_params.save_results = save_results
-        if task is not None:
-            benchmark_config_params.task = task
-            benchmark_config_params.dataset = None
-        if dataset is not None:
-            benchmark_config_params.dataset = dataset
-            benchmark_config_params.task = None
-        if language is not None:
-            benchmark_config_params.language = language
-        if model_language is not None:
-            benchmark_config_params.model_language = model_language
-        if dataset_language is not None:
-            benchmark_config_params.dataset_language = dataset_language
-        if device is not None:
-            benchmark_config_params.device = device
-        if batch_size is not None:
-            benchmark_config_params.batch_size = batch_size
-        if raise_errors is not None:
-            benchmark_config_params.raise_errors = raise_errors
-        if cache_dir is not None:
-            benchmark_config_params.cache_dir = cache_dir
-        if api_key is not None:
-            benchmark_config_params.api_key = api_key
-        if force is not None:
-            benchmark_config_params.force = force
-        if verbose is not None:
-            benchmark_config_params.verbose = verbose
-        if trust_remote_code is not None:
-            benchmark_config_params.trust_remote_code = trust_remote_code
-        if clear_model_cache is not None:
-            benchmark_config_params.clear_model_cache = clear_model_cache
-        if evaluate_test_split is not None:
-            benchmark_config_params.evaluate_test_split = evaluate_test_split
-        if few_shot is not None:
-            benchmark_config_params.few_shot = few_shot
-        if num_iterations is not None:
-            benchmark_config_params.num_iterations = num_iterations
-        if api_base is not None:
-            benchmark_config_params.api_base = api_base
-        if api_version is not None:
-            benchmark_config_params.api_version = api_version
-        if debug is not None:
-            benchmark_config_params.debug = debug
-        if run_with_cli is not None:
-            benchmark_config_params.run_with_cli = run_with_cli
-        if requires_safetensors is not None:
-            benchmark_config_params.requires_safetensors = requires_safetensors
-        if download_only is not None:
-            benchmark_config_params.download_only = download_only
-
-        return build_benchmark_config(benchmark_config_params=benchmark_config_params)
-
     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
         """Prepare the model ID(s) to be benchmarked.
 
@@ -991,164 +993,30 @@ class Benchmarker:
             raise e
         return e
 
-    def __call__(
-        self,
-        model: list[str] | str,
-        task: str | list[str] | None = None,
-        dataset: list[str] | str | None = None,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None = None,
-        dataset_language: str | list[str] | None = None,
-        device: Device | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        requires_safetensors: bool | None = None,
-    ) -> list[BenchmarkResult]:
-        """Benchmarks models on datasets.
-
-        Args:
-            model:
-                The full Hugging Face Hub path(s) to the pretrained transformer model.
-                The specific model version to use can be added after the suffix '@':
-                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
-                and defaults to the latest version if not specified.
-            task:
-                The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
-                If both `task` and `dataset` are None then all datasets will be
-                benchmarked. Defaults to None.
-            dataset:
-                The datasets to benchmark on. Mutually exclusive with `task`. If both
-                `task` and `dataset` are None then all datasets will be benchmarked.
-                Defaults to None.
-            progress_bar:
-                Whether progress bars should be shown. Defaults to the value specified
-                when initialising the benchmarker.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
-                when initialising the benchmarker.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
-                'all' if all languages should be considered. Defaults to the value
-                specified when initialising the benchmarker.
-            model_language:
-                The language codes of the languages to include for models. If specified
-                then this overrides the `language` parameter for model languages.
-                Defaults to the value specified when initialising the benchmarker.
-            dataset_language:
-                The language codes of the languages to include for datasets. If
-                specified then this overrides the `language` parameter for dataset
-                languages. Defaults to the value specified when initialising the
-                benchmarker.
-            device:
-                The device to use for benchmarking. Defaults to the value specified when
-                initialising the benchmarker.
-            batch_size:
-                The batch size to use. Defaults to the value specified when initialising
-                the benchmarker.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation.
-            cache_dir:
-                Directory to store cached models. Defaults to the value specified when
-                initialising the benchmarker.
-            api_key:
-                The API key to use for a given inference server. Defaults to the value
-                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
-            trust_remote_code:
-                Whether to trust remote code when loading models. Defaults to the value
-                specified when initialising the benchmarker.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. Defaults
-                to the value specified when initialising the benchmarker.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. Defaults to the
-                value specified when initialising the benchmarker.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. Only
-                relevant if the model is generative. Defaults to the value specified
-                when initialising the benchmarker.
-            num_iterations:
-                The number of times each model should be evaluated. This is only meant
-                to be used for power users, and scores will not be allowed on the
-                leaderboards if this is changed. Defaults to the value specified when
-                initialising the benchmarker.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. Defaults
-                to the value specified when initialising the benchmarker.
-
-        Returns:
-            A list of benchmark results.
-
-        Raises:
-            ValueError:
-                If both `task` and `dataset` are specified.
-        """
+    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
+        """Alias for `self.benchmark()`."""
         logger.warning(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
             "`benchmark` function instead. This will be removed in a future version."
         )
-        return self.benchmark(
-            model=model,
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-        )
+        return self.benchmark(*args, **kwds)
 
 
 def model_has_been_benchmarked(
-
-
-
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
    benchmark_results: list[BenchmarkResult],
 ) -> bool:
     """Checks whether a model has already been benchmarked on a dataset.
 
     Args:
-
-            The model
-
-            The dataset.
-
-
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
         benchmark_results:
             The benchmark results.
 
@@ -1156,10 +1024,28 @@ def model_has_been_benchmarked(
         Whether the model has already been evaluated on the dataset.
     """
     for record in benchmark_results:
-
-
-
-
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
             return True
     return False
 
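The least obvious clause in the predicate above is the split check: a stored record only needs to match the requested split when the dataset actually has a validation split. A self-contained toy restatement (simplified types, not the package's signature):

# Toy restatement of the "same split" rule used in the predicate above.
def same_split(record_on_val: bool, want_test: bool, splits: list[str]) -> bool:
    return record_on_val != want_test or "val" not in splits

assert same_split(True, False, ["train", "val", "test"])       # val run covers a val request
assert not same_split(False, False, ["train", "val", "test"])  # test run does not cover a val request
assert same_split(False, False, ["train", "test"])             # no val split: always counts as a match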