EuroEval 16.2.2__py3-none-any.whl → 16.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED
@@ -1,10 +1,11 @@
 """Class that benchmarks language models."""
 
 import contextlib
+import datetime as dt
 import json
 import logging
+import os
 import re
-import sys
 import typing as t
 from pathlib import Path
 from shutil import rmtree
@@ -22,6 +23,7 @@ from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
+from .logging_utils import adjust_logging_level, get_pbar, log, log_once
 from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
@@ -31,7 +33,7 @@ from .utils import (
     enforce_reproducibility,
     get_package_version,
     internet_connection_available,
-
+    split_model_id,
 )
 
 if t.TYPE_CHECKING:
@@ -39,9 +41,6 @@ if t.TYPE_CHECKING:
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 
-logger = logging.getLogger("euroeval")
-
-
 class Benchmarker:
     """Benchmarking all the language models.
 
@@ -82,7 +81,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
-        gpu_memory_utilization: float = 0.
+        gpu_memory_utilization: float = 0.8,
         generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
@@ -198,6 +197,10 @@ class Benchmarker:
                 "Try installing it with `pip install hf_transfer`."
             )
 
+        # If FULL_LOG has been set, then force verbose mode
+        if os.getenv("FULL_LOG", "0") == "1":
+            verbose = True
+
         self.benchmark_config_default_params = BenchmarkConfigParams(
             task=task,
             dataset=dataset,
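The new FULL_LOG switch is read from the environment when the Benchmarker is constructed, so verbose logging can be forced without changing any arguments. A minimal sketch of how this could be used, assuming only that Benchmarker is importable from the package root as in previous releases:

import os

from euroeval import Benchmarker

# Setting FULL_LOG=1 before construction forces `verbose=True` inside __init__
os.environ["FULL_LOG"] = "1"
benchmarker = Benchmarker()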
@@ -299,7 +302,6 @@
             )
             del dataset
 
-        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
         model = load_model(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -607,46 +609,90 @@
             dataset_names=benchmark_config.datasets
         )
 
-
-
-
-
-
-
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in get_pbar(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
             try:
                 model_config = get_model_config(
                     model_id=model_id, benchmark_config=benchmark_config
                 )
+                model_configs.append(model_config)
             except InvalidModel as e:
-
-
+                log(e.message, level=logging.ERROR)
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            log(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets.",
+                level=logging.INFO,
+            )
+            return list()
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        benchmark_params_to_revert: dict[str, t.Any] = dict()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                log(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets.",
+                    level=logging.DEBUG,
+                )
                 continue
 
             if model_config.adapter_base_model_id:
                 open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                 )
                 if not internet_connection_available():
                     raise InvalidModel(
                         "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                     )
                 elif benchmark_config.download_only:
                     log_once(
                         "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "
-                        "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                         f"{open_issue_msg}",
                         level=logging.WARNING,
                     )
 
             loaded_model: BenchmarkModule | None = None
-
-            for dataset_config in dataset_configs:
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
                 for param, value in benchmark_params_to_revert.items():
@@ -658,60 +704,28 @@
                     "val" not in dataset_config.splits
                     and not benchmark_config.evaluate_test_split
                 ):
-
+                    log(
                         "The dataset does not have a validation split, so even though "
                         "you requested evaluating the validation split (the default), "
-                        "we will evaluate on the test split."
+                        "we will evaluate on the test split.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["evaluate_test_split"] = False
                     benchmark_config.evaluate_test_split = True
                 if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
-
+                    log(
                         "The task requires zero-shot evaluation, so even though you "
                         "requested few-shot evaluation (the default), we will evaluate "
-                        "zero-shot."
+                        "zero-shot.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False
 
-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
-                    initial_logging(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                    )
                     if loaded_model is None:
-                        logger.info("Loading model...")
                         try:
                             loaded_model = load_model(
                                 model_config=model_config,
@@ -721,7 +735,7 @@
                         except InvalidModel as e:
                             if benchmark_config.raise_errors:
                                 raise e
-
+                            log(e.message, level=logging.ERROR)
 
                             # Add the remaining number of benchmarks for the model to
                             # our benchmark counter, since we're skipping the rest of
@@ -735,12 +749,31 @@
                     else:
                         loaded_model.dataset_config = dataset_config
 
+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    log(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}.",
+                        level=logging.DEBUG,
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                 # Benchmark a single model on a single dataset
                 benchmark_output_or_err = self._benchmark_single(
                     model=loaded_model,
                     model_config=model_config,
                     dataset_config=dataset_config,
                     benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=total_benchmarks,
                 )
 
                 if (
@@ -750,12 +783,12 @@
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
                     num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
 
                     # Add the remaining number of benchmarks for the model to our
                     # benchmark counter, since we're skipping the rest of them
@@ -771,15 +804,13 @@
                     record.append_to_results(results_path=self.results_path)
 
                 num_finished_benchmarks += 1
-                logger.info(
-                    f"Finished {num_finished_benchmarks} out of "
-                    f"{total_benchmarks} benchmarks."
-                )
 
             del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
+        log(f"Completed {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO)
+
         # This avoids the following warning at the end of the benchmarking:
         # Warning: WARNING: process group has NOT been destroyed before we destruct
         # ProcessGroupNCCL. On normal program exit, the application should call
@@ -823,6 +854,8 @@
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        num_finished_benchmarks: int,
+        num_total_benchmarks: int,
     ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
         """Benchmark a single model on a single dataset.
 
@@ -835,25 +868,29 @@
                 The configuration of the dataset we are evaluating on.
             benchmark_config:
                 The general benchmark configuration.
+            num_finished_benchmarks:
+                The number of benchmarks that have already been completed.
+            num_total_benchmarks:
+                The total number of benchmarks to be completed.
 
         Returns:
             The benchmark result, or an error if the benchmark was unsuccessful.
-        """
-        if model is None:
-            initial_logging(
-                model_config=model_config,
-                dataset_config=dataset_config,
-                benchmark_config=benchmark_config,
-            )
 
-
+        Raises:
+            RuntimeError:
+                If the MPS fallback is not enabled when required.
+            InvalidBenchmark:
+                If the benchmark was unsuccessful.
+            InvalidModel:
+                If the model is invalid.
+        """
+        for _ in range(num_attempts := 5):
             try:
                 # Set random seeds to enforce reproducibility of the randomly
                 # initialised weights
                 rng = enforce_reproducibility()
 
                 if model is None or model_config.model_type != ModelType.GENERATIVE:
-                    logger.info("Loading model...")
                     model = load_model(
                         model_config=model_config,
                         dataset_config=dataset_config,
@@ -861,6 +898,14 @@
                     )
                 assert model is not None
 
+                initial_logging(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=num_total_benchmarks,
+                )
+
                 if dataset_config.task == SPEED:
                     scores = benchmark_speed(
                         model=model, benchmark_config=benchmark_config
@@ -928,14 +973,15 @@
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                 )
-
+                log(f"Results:\n{results}", level=logging.DEBUG)
                 return record
 
             except HuggingFaceHubDown:
                 wait_time = 30
-
+                log(
                     f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
-                    "seconds."
+                    "seconds.",
+                    level=logging.DEBUG,
                 )
                 sleep(wait_time)
                 continue
@@ -958,34 +1004,37 @@
             elif benchmark_config.raise_errors:
                 raise e
             return e
+        else:
+            return InvalidBenchmark(
+                f"Failed to benchmark model {model_config.model_id!r} on dataset "
+                f"{dataset_config.name!r} after {num_attempts} attempts."
+            )
 
     def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
         """Alias for `self.benchmark()`."""
-
+        log(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
-            "`benchmark` function instead. This will be removed in a future version."
+            "`benchmark` function instead. This will be removed in a future version.",
+            level=logging.WARNING,
         )
         return self.benchmark(*args, **kwds)
 
 
 def model_has_been_benchmarked(
-
-
-
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     benchmark_results: list[BenchmarkResult],
 ) -> bool:
     """Checks whether a model has already been benchmarked on a dataset.
 
     Args:
-
-            The model
-
-            The dataset.
-
-
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
         benchmark_results:
             The benchmark results.
 
@@ -993,36 +1042,32 @@ def model_has_been_benchmarked(
         Whether the model has already been evaluated on the dataset.
     """
     for record in benchmark_results:
-
-
-
-
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
             return True
     return False
 
 
-def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
-    """Adjust the logging level based on verbosity.
-
-    Args:
-        verbose:
-            Whether to output additional output.
-        ignore_testing:
-            Whether to ignore the testing flag.
-
-    Returns:
-        The logging level that was set.
-    """
-    if hasattr(sys, "_called_from_test") and not ignore_testing:
-        logging_level = logging.CRITICAL
-    elif verbose:
-        logging_level = logging.DEBUG
-    else:
-        logging_level = logging.INFO
-    logger.setLevel(logging_level)
-    return logging_level
-
-
 def clear_model_cache_fn(cache_dir: str) -> None:
     """Clear the model cache.
 
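The deduplication check now compares the structured components of a recorded model ID against the model configuration rather than comparing raw strings. A rough probe of the helper, with the attribute names taken from the diff above (the return type of split_model_id is not otherwise shown in this diff):

from euroeval.utils import split_model_id

# Hypothetical model ID; only the .model_id/.revision/.param attributes are
# assumed, based on how the diff above uses the returned object
components = split_model_id(model_id="some-org/some-model")
print(components.model_id, components.revision, components.param)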
@@ -1060,6 +1105,8 @@ def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
+    num_finished_benchmarks: int,
+    num_total_benchmarks: int,
 ) -> None:
     """Initial logging at the start of the benchmarking process.
 
@@ -1070,6 +1117,10 @@ def initial_logging(
             The configuration of the dataset we are evaluating on.
         benchmark_config:
             The general benchmark configuration.
+        num_finished_benchmarks:
+            The number of benchmarks that have already been finished.
+        num_total_benchmarks:
+            The total number of benchmarks to be run.
     """
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
@@ -1086,21 +1137,25 @@ def initial_logging(
     else:
         eval_type = "Benchmarking"
 
-
-        f"{eval_type} {model_id} on the {split_type} split of "
-        f"{dataset_config.pretty_name}"
+    log_once(
+        f"\n{eval_type} {model_id} on the {split_type} split of "
+        f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+        f"{num_total_benchmarks} benchmarks)...",
+        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
    )
 
     if dataset_config.unofficial:
-
+        log_once(
             f"Note that the {dataset_config.name!r} dataset is unofficial, "
             "meaning that the resulting evaluation will not be included in the "
-            "official leaderboard."
+            "official leaderboard.",
+            level=logging.WARNING,
         )
 
     if benchmark_config.debug:
-
+        log_once(
             "Running in debug mode. This will output additional information, as "
             "well as store the model outputs in the current directory after each "
-            "batch. For this reason, evaluation will be slower."
+            "batch. For this reason, evaluation will be slower.",
+            level=logging.WARNING,
         )
euroeval/caching_utils.py
ADDED
@@ -0,0 +1,79 @@
+"""Caching utility functions."""
+
+import typing as t
+from functools import wraps
+
+from .constants import T
+
+
+def cache_arguments(
+    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+    """Cache specified arguments of a function.
+
+    Args:
+        arguments:
+            The list of argument names to cache. If empty, all arguments are cached.
+        disable_condition:
+            A function that checks if cache should be disabled.
+
+    Returns:
+        A decorator that caches the specified arguments of a function.
+    """
+
+    def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+        """Decorator that caches the specified arguments of a function.
+
+        Args:
+            func:
+                The function to decorate.
+
+        Returns:
+            The decorated function.
+        """
+        cache: dict[tuple, T] = dict()
+
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> T:
+            """Wrapper function that caches the specified arguments.
+
+            Args:
+                *args:
+                    The positional arguments to the function.
+                **kwargs:
+                    The keyword arguments to the function.
+
+            Returns:
+                The result of the function.
+
+            Raises:
+                ValueError:
+                    If an argument name is not found in the function parameters.
+            """
+            if not arguments:
+                key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+            else:
+                func_params = func.__code__.co_varnames
+                key_items: list[t.Any] = []
+                for arg_name in arguments:
+                    if arg_name in kwargs:
+                        key_items.append(kwargs[arg_name])
+                    else:
+                        try:
+                            arg_index = func_params.index(arg_name)
+                            key_items.append(args[arg_index])
+                        except (ValueError, IndexError):
+                            raise ValueError(
+                                f"Argument {arg_name} not found in function "
+                                f"{func.__name__} parameters."
+                            )
+                key = tuple(key_items)
+
+            # Do not cache if the condition is met
+            if key not in cache or disable_condition():
+                cache[key] = func(*args, **kwargs)
+            return cache[key]
+
+        return wrapper
+
+    return caching_decorator
euroeval/callbacks.py
CHANGED
@@ -7,6 +7,8 @@ from collections.abc import Sized
 from tqdm.auto import tqdm
 from transformers.trainer_callback import ProgressCallback
 
+from .logging_utils import get_pbar
+
 if t.TYPE_CHECKING:
     from torch.utils.data import DataLoader
     from transformers.trainer_callback import TrainerControl, TrainerState
@@ -32,11 +34,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
         """Callback actions when training begins."""
         if state.is_local_process_zero:
             desc = "Finetuning model"
-            self.training_bar =
-                total=None,
-                leave=False,
-                desc=desc,
-                disable=hasattr(sys, "_called_from_test"),
+            self.training_bar = get_pbar(
+                total=None, desc=desc, disable=hasattr(sys, "_called_from_test")
             )
             self.current_step = 0
 
@@ -67,9 +66,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
         if state.is_local_process_zero and correct_dtype:
             if self.prediction_bar is None:
                 desc = "Evaluating model"
-                self.prediction_bar =
+                self.prediction_bar = get_pbar(
                     total=len(eval_dataloader),
-                    leave=False,
                     desc=desc,
                     disable=hasattr(sys, "_called_from_test"),
                 )
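get_pbar comes from the new euroeval/logging_utils.py, whose body is not included in this diff. Judging from the call sites (iterable=, total=, desc=, disable=, and the dropped leave=False), it is presumably a thin wrapper around tqdm with shared defaults; the following is a speculative sketch, not the actual implementation:

import typing as t

from tqdm.auto import tqdm


def get_pbar(
    iterable: t.Iterable | None = None,
    total: int | None = None,
    desc: str | None = None,
    disable: bool = False,
) -> tqdm:
    """Hypothetical reconstruction of a shared progress-bar factory."""
    return tqdm(
        iterable=iterable,
        total=total,
        desc=desc,
        disable=disable,
        leave=False,  # assumed default, since the call sites above no longer pass `leave`
    )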
euroeval/cli.py
CHANGED
@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
     "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
     show_default=True,
     help="The GPU memory utilization to use for vLLM. A larger value will result in "
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "