EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED
@@ -1,10 +1,12 @@
 """Class that benchmarks language models."""
 
+import collections.abc as c
 import contextlib
+import datetime as dt
 import json
 import logging
+import os
 import re
-import sys
 import typing as t
 from pathlib import Path
 from shutil import rmtree
@@ -12,7 +14,6 @@ from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
-from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -23,6 +24,7 @@ from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
+from .logging_utils import adjust_logging_level, get_pbar, log, log_once
 from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
@@ -32,16 +34,12 @@ from .utils import (
     enforce_reproducibility,
     get_package_version,
     internet_connection_available,
-    log_once,
     split_model_id,
 )
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
-    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
-
-
-logger = logging.getLogger("euroeval")
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task
 
 
 class Benchmarker:
@@ -65,11 +63,11 @@ class Benchmarker:
         self,
         progress_bar: bool = True,
         save_results: bool = True,
-        task: str |
-        dataset:
-        language: str |
-        model_language: str |
-        dataset_language: str |
+        task: "str | Task | c.Sequence[str | Task] | None" = None,
+        dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
+        language: str | c.Sequence[str] = "all",
+        model_language: str | c.Sequence[str] | None = None,
+        dataset_language: str | c.Sequence[str] | None = None,
         device: Device | None = None,
         batch_size: int = 32,
         raise_errors: bool = False,
@@ -179,6 +177,8 @@ class Benchmarker:
             ValueError:
                 If both `task` and `dataset` are specified, or if `download_only`
                 is True and we have no internet connection.
+            ImportError:
+                If `hf_transfer` is enabled but not installed.
         """
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
@@ -200,6 +200,10 @@ class Benchmarker:
                 "Try installing it with `pip install hf_transfer`."
             )
 
+        # If FULL_LOG has been set, then force verbose mode
+        if os.getenv("FULL_LOG", "0") == "1":
+            verbose = True
+
         self.benchmark_config_default_params = BenchmarkConfigParams(
             task=task,
             dataset=dataset,
@@ -235,13 +239,13 @@ class Benchmarker:
         )
 
         # Initialise variable storing model lists, so we only have to fetch it once
-        self._model_lists: dict[str,
+        self._model_lists: dict[str, c.Sequence[str]] | None = None
 
         self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
         adjust_logging_level(verbose=self.benchmark_config.verbose)
 
     @property
-    def benchmark_results(self) ->
+    def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
         """The benchmark results.
 
         Returns:
@@ -301,7 +305,6 @@ class Benchmarker:
         )
         del dataset
 
-        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
         model = load_model(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -320,14 +323,14 @@ class Benchmarker:
 
     def benchmark(
         self,
-        model:
-        task: str |
-        dataset:
+        model: c.Sequence[str] | str,
+        task: "str | Task | c.Sequence[str | Task] | None" = None,
+        dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
         progress_bar: bool | None = None,
         save_results: bool | None = None,
-        language: str |
-        model_language: str |
-        dataset_language: str |
+        language: str | c.Sequence[str] | None = None,
+        model_language: str | c.Sequence[str] | None = None,
+        dataset_language: str | c.Sequence[str] | None = None,
         device: Device | None = None,
         batch_size: int | None = None,
         raise_errors: bool | None = None,
@@ -347,7 +350,7 @@ class Benchmarker:
         force: bool | None = None,
         verbose: bool | None = None,
         debug: bool | None = None,
-    ) ->
+    ) -> c.Sequence[BenchmarkResult]:
         """Benchmarks models on datasets.
 
         Args:
@@ -605,13 +608,11 @@ class Benchmarker:
             clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
         model_ids = self._prepare_model_ids(model_id=model)
-        dataset_configs =
-            dataset_names=benchmark_config.datasets
-        )
+        dataset_configs = benchmark_config.datasets
 
         # Get all the model configs
         model_configs: list[ModelConfig] = list()
-        for model_id in
+        for model_id in get_pbar(
             iterable=model_ids,
             desc="Fetching model configurations",
             disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
@@ -622,50 +623,63 @@ class Benchmarker:
                 )
                 model_configs.append(model_config)
             except InvalidModel as e:
-
+                log(e.message, level=logging.ERROR)
 
         # Create a dictionary that takes each model config to the dataset configs that
-        # we need to benchmark the model on.
-        #
-
-
+        # we need to benchmark the model on. We initially include all the relevant
+        # datasets for each model.
+        model_config_to_dataset_configs: dict[
+            ModelConfig, c.Sequence[DatasetConfig]
+        ] = {
             model_config: [
                 dataset_config
                 for dataset_config in dataset_configs
-                if
-                    benchmark_config.force
-                    or not model_has_been_benchmarked(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                        benchmark_results=self.benchmark_results,
-                    )
-                )
-                and model_config.model_type in dataset_config.allowed_model_types
+                if model_config.model_type in dataset_config.allowed_model_types
             ]
             for model_config in model_configs
         }
 
+        # Initialise the current benchmark results with all the ones that we have cached
+        # on disk already (can be none), and remove those datasets from the mapping
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for (
+            model_config,
+            model_dataset_configs,
+        ) in model_config_to_dataset_configs.items():
+            new_model_dataset_configs: list[DatasetConfig] = list()
+            for dataset_config in model_dataset_configs:
+                benchmark_record = get_record(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    benchmark_results=self.benchmark_results,
+                )
+                if benchmark_record is not None and not benchmark_config.force:
+                    current_benchmark_results.append(benchmark_record)
+                else:
+                    new_model_dataset_configs.append(dataset_config)
+            model_config_to_dataset_configs[model_config] = new_model_dataset_configs
+
         total_benchmarks = sum(
             len(dataset_configs)
             for dataset_configs in model_config_to_dataset_configs.values()
         )
         if total_benchmarks == 0:
-
+            log(
                 "No benchmarks to run, as all the selected models have already been "
-                "benchmarked on all the selected datasets."
+                "benchmarked on all the selected datasets.",
+                level=logging.INFO,
             )
-            return
-
-        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+            return current_benchmark_results
 
         num_finished_benchmarks = 0
-
+        benchmark_params_to_revert: dict[str, t.Any] = dict()
         for model_config in model_configs:
             if not model_config_to_dataset_configs[model_config]:
-
+                log(
                     f"Skipping model {model_config.model_id!r} because it has "
-                    "already been benchmarked on all valid datasets."
+                    "already been benchmarked on all valid datasets.",
+                    level=logging.DEBUG,
                 )
                 continue
 
@@ -691,7 +705,6 @@ class Benchmarker:
             )
 
             loaded_model: BenchmarkModule | None = None
-            benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
@@ -704,18 +717,20 @@ class Benchmarker:
                     "val" not in dataset_config.splits
                     and not benchmark_config.evaluate_test_split
                 ):
-
+                    log(
                         "The dataset does not have a validation split, so even though "
                         "you requested evaluating the validation split (the default), "
-                        "we will evaluate on the test split."
+                        "we will evaluate on the test split.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["evaluate_test_split"] = False
                     benchmark_config.evaluate_test_split = True
                 if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
-
+                    log(
                         "The task requires zero-shot evaluation, so even though you "
                         "requested few-shot evaluation (the default), we will evaluate "
-                        "zero-shot."
+                        "zero-shot.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False
@@ -723,13 +738,7 @@ class Benchmarker:
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
-                    initial_logging(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                    )
                     if loaded_model is None:
-                        logger.info("Loading model...")
                         try:
                             loaded_model = load_model(
                                 model_config=model_config,
@@ -739,7 +748,7 @@ class Benchmarker:
                         except InvalidModel as e:
                             if benchmark_config.raise_errors:
                                 raise e
-
+                            log(e.message, level=logging.ERROR)
 
                             # Add the remaining number of benchmarks for the model to
                             # our benchmark counter, since we're skipping the rest of
@@ -759,12 +768,13 @@ class Benchmarker:
                         loaded_model.generative_type
                         not in dataset_config.allowed_generative_types
                     ):
-
+                        log(
                             f"Skipping the benchmark of model "
                             f"{model_config.model_id!r}on dataset "
                             f"{dataset_config.name!r} because the model has generative "
                             f"type {loaded_model.generative_type} and the dataset "
-                            f"only allows {dataset_config.allowed_generative_types}."
+                            f"only allows {dataset_config.allowed_generative_types}.",
+                            level=logging.DEBUG,
                         )
                         num_finished_benchmarks += 1
                         continue
@@ -775,6 +785,8 @@ class Benchmarker:
                     model_config=model_config,
                     dataset_config=dataset_config,
                     benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=total_benchmarks,
                 )
 
                 if (
@@ -784,12 +796,12 @@ class Benchmarker:
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
                     num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
 
                     # Add the remaining number of benchmarks for the model to our
                     # benchmark counter, since we're skipping the rest of them
@@ -805,15 +817,15 @@ class Benchmarker:
                         record.append_to_results(results_path=self.results_path)
 
                 num_finished_benchmarks += 1
-                logger.info(
-                    f"Finished {num_finished_benchmarks} out of "
-                    f"{total_benchmarks} benchmarks."
-                )
 
             del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
+        log(
+            f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
+        )
+
        # This avoids the following warning at the end of the benchmarking:
        # Warning: WARNING: process group has NOT been destroyed before we destruct
        # ProcessGroupNCCL. On normal program exit, the application should call
@@ -826,7 +838,7 @@ class Benchmarker:
            destroy_process_group()
        return current_benchmark_results
 
-    def _prepare_model_ids(self, model_id:
+    def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
        """Prepare the model ID(s) to be benchmarked.
 
        Args:
@@ -857,6 +869,8 @@ class Benchmarker:
        model_config: "ModelConfig",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
+        num_finished_benchmarks: int,
+        num_total_benchmarks: int,
    ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
        """Benchmark a single model on a single dataset.
 
@@ -869,25 +883,29 @@ class Benchmarker:
                The configuration of the dataset we are evaluating on.
            benchmark_config:
                The general benchmark configuration.
+            num_finished_benchmarks:
+                The number of benchmarks that have already been completed.
+            num_total_benchmarks:
+                The total number of benchmarks to be completed.
 
        Returns:
            The benchmark result, or an error if the benchmark was unsuccessful.
-        """
-        if model is None:
-            initial_logging(
-                model_config=model_config,
-                dataset_config=dataset_config,
-                benchmark_config=benchmark_config,
-            )
 
-
+        Raises:
+            RuntimeError:
+                If the MPS fallback is not enabled when required.
+            InvalidBenchmark:
+                If the benchmark was unsuccessful.
+            InvalidModel:
+                If the model is invalid.
+        """
+        for _ in range(num_attempts := 5):
            try:
                # Set random seeds to enforce reproducibility of the randomly
                # initialised weights
                rng = enforce_reproducibility()
 
                if model is None or model_config.model_type != ModelType.GENERATIVE:
-                    logger.info("Loading model...")
                    model = load_model(
                        model_config=model_config,
                        dataset_config=dataset_config,
@@ -895,6 +913,14 @@ class Benchmarker:
                    )
                assert model is not None
 
+                initial_logging(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=num_total_benchmarks,
+                )
+
                if dataset_config.task == SPEED:
                    scores = benchmark_speed(
                        model=model, benchmark_config=benchmark_config
@@ -962,14 +988,15 @@ class Benchmarker:
                        few_shot=benchmark_config.few_shot,
                        validation_split=not benchmark_config.evaluate_test_split,
                    )
-
+                log(f"Results:\n{results}", level=logging.DEBUG)
                return record
 
            except HuggingFaceHubDown:
                wait_time = 30
-
+                log(
                    f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
-                    "seconds."
+                    "seconds.",
+                    level=logging.DEBUG,
                )
                sleep(wait_time)
                continue
@@ -992,23 +1019,29 @@ class Benchmarker:
            elif benchmark_config.raise_errors:
                raise e
            return e
+        else:
+            return InvalidBenchmark(
+                f"Failed to benchmark model {model_config.model_id!r} on dataset "
+                f"{dataset_config.name!r} after {num_attempts} attempts."
+            )
 
    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
        """Alias for `self.benchmark()`."""
-
+        log(
            "Calling the `Benchmarker` class directly is deprecated. Please use the "
-            "`benchmark` function instead. This will be removed in a future version."
+            "`benchmark` function instead. This will be removed in a future version.",
+            level=logging.WARNING,
        )
        return self.benchmark(*args, **kwds)
 
 
-def
+def get_record(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
-    benchmark_results:
-) ->
-    """
+    benchmark_results: c.Sequence[BenchmarkResult],
+) -> BenchmarkResult | None:
+    """Get the benchmark record for a given model and dataset.
 
    Args:
        model_config:
@@ -1021,7 +1054,7 @@ def model_has_been_benchmarked(
            The benchmark results.
 
    Returns:
-
+        The benchmark record, or None if no such record exists.
    """
    for record in benchmark_results:
        model_id_components = split_model_id(model_id=record.model)
@@ -1046,30 +1079,8 @@ def model_has_been_benchmarked(
            and same_split
            and same_num_shots
        ):
-            return
-    return
-
-
-def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
-    """Adjust the logging level based on verbosity.
-
-    Args:
-        verbose:
-            Whether to output additional output.
-        ignore_testing:
-            Whether to ignore the testing flag.
-
-    Returns:
-        The logging level that was set.
-    """
-    if hasattr(sys, "_called_from_test") and not ignore_testing:
-        logging_level = logging.CRITICAL
-    elif verbose:
-        logging_level = logging.DEBUG
-    else:
-        logging_level = logging.INFO
-    logger.setLevel(logging_level)
-    return logging_level
+            return record
+    return None
 
 
 def clear_model_cache_fn(cache_dir: str) -> None:
@@ -1090,7 +1101,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
        rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(
+def prepare_dataset_configs(
+    dataset_names: c.Sequence[str],
+) -> c.Sequence["DatasetConfig"]:
    """Prepare the dataset configuration(s) to be benchmarked.
 
    Args:
@@ -1109,6 +1122,8 @@ def initial_logging(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
+    num_finished_benchmarks: int,
+    num_total_benchmarks: int,
) -> None:
    """Initial logging at the start of the benchmarking process.
 
@@ -1119,6 +1134,10 @@ def initial_logging(
            The configuration of the dataset we are evaluating on.
        benchmark_config:
            The general benchmark configuration.
+        num_finished_benchmarks:
+            The number of benchmarks that have already been finished.
+        num_total_benchmarks:
+            The total number of benchmarks to be run.
    """
    model_id = model_config.model_id
    if model_config.revision and model_config.revision != "main":
@@ -1135,21 +1154,25 @@ def initial_logging(
    else:
        eval_type = "Benchmarking"
 
-
-        f"{eval_type} {model_id} on the {split_type} split of "
-        f"{dataset_config.pretty_name}"
+    log_once(
+        f"\n{eval_type} {model_id} on the {split_type} split of "
+        f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+        f"{num_total_benchmarks} benchmarks)...",
+        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
    )
 
    if dataset_config.unofficial:
-
+        log_once(
            f"Note that the {dataset_config.name!r} dataset is unofficial, "
            "meaning that the resulting evaluation will not be included in the "
-            "official leaderboard."
+            "official leaderboard.",
+            level=logging.WARNING,
        )
 
    if benchmark_config.debug:
-
+        log_once(
            "Running in debug mode. This will output additional information, as "
            "well as store the model outputs in the current directory after each "
-            "batch. For this reason, evaluation will be slower."
+            "batch. For this reason, evaluation will be slower.",
+            level=logging.WARNING,
        )
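For orientation, the following is a minimal usage sketch of the reworked `Benchmarker` API shown in the diff above. It assumes EuroEval 16.5.0 is installed; the model ID, task name and language code are placeholders chosen for illustration, not recommendations. The `FULL_LOG` environment variable and the sequence-of-`BenchmarkResult` return value are the behaviours touched by this release.

    import os

    from euroeval import Benchmarker

    # New in this release: FULL_LOG=1 forces verbose mode regardless of `verbose`.
    os.environ["FULL_LOG"] = "1"

    # Placeholder selection values, shown only to illustrate the new signature.
    benchmarker = Benchmarker(progress_bar=True, save_results=True, language="da")
    results = benchmarker.benchmark(
        model="some-org/some-model",  # hypothetical model ID
        task="sentiment-classification",
    )

    # `benchmark` returns a sequence of BenchmarkResult records, including cached
    # records found via `get_record` for already-benchmarked combinations.
    for record in results:
        print(record.model)

Note that `task` and `dataset` remain mutually exclusive, and that previously benchmarked model/dataset pairs are reused from the results cache unless `force=True` is passed.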
euroeval/caching_utils.py
ADDED
@@ -0,0 +1,79 @@
+"""Caching utility functions."""
+
+import typing as t
+from functools import wraps
+
+from .constants import T
+
+
+def cache_arguments(
+    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+    """Cache specified arguments of a function.
+
+    Args:
+        arguments:
+            The list of argument names to cache. If empty, all arguments are cached.
+        disable_condition:
+            A function that checks if cache should be disabled.
+
+    Returns:
+        A decorator that caches the specified arguments of a function.
+    """
+
+    def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+        """Decorator that caches the specified arguments of a function.
+
+        Args:
+            func:
+                The function to decorate.
+
+        Returns:
+            The decorated function.
+        """
+        cache: dict[tuple, T] = dict()
+
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> T:
+            """Wrapper function that caches the specified arguments.
+
+            Args:
+                *args:
+                    The positional arguments to the function.
+                **kwargs:
+                    The keyword arguments to the function.
+
+            Returns:
+                The result of the function.
+
+            Raises:
+                ValueError:
+                    If an argument name is not found in the function parameters.
+            """
+            if not arguments:
+                key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+            else:
+                func_params = func.__code__.co_varnames
+                key_items: list[t.Any] = list()
+                for arg_name in arguments:
+                    if arg_name in kwargs:
+                        key_items.append(kwargs[arg_name])
+                    else:
+                        try:
+                            arg_index = func_params.index(arg_name)
+                            key_items.append(args[arg_index])
+                        except (ValueError, IndexError):
+                            raise ValueError(
+                                f"Argument {arg_name} not found in function "
+                                f"{func.__name__} parameters."
+                            )
+                key = tuple(key_items)
+
+            # Do not cache if the condition is met
+            if key not in cache or disable_condition():
+                cache[key] = func(*args, **kwargs)
+            return cache[key]
+
+        return wrapper
+
+    return caching_decorator