EuroEval 15.10.1__py3-none-any.whl → 15.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +21 -25
- euroeval/benchmarker.py +1 -1
- euroeval/callbacks.py +17 -13
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +2 -40
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +1 -0
- euroeval/human_evaluation.py +13 -13
- euroeval/metrics.py +452 -0
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/METADATA +10 -10
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/RECORD +31 -30
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/licenses/LICENSE +1 -1
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/entry_points.txt +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -13,14 +13,11 @@ from pathlib import Path
 from time import sleep

 import torch
-from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -34,13 +31,7 @@ from ..constants import (
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
+from ..data_models import GenerativeModelOutput, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -94,6 +85,13 @@ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray

+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+
 logger = logging.getLogger("euroeval")


@@ -106,9 +104,9 @@ class VLLMModel(HuggingFaceEncoderModel):

     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the vLLM model.

@@ -129,8 +127,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         model, tokenizer = load_model_and_tokenizer(
             model_config=model_config, benchmark_config=benchmark_config
         )
-        self._model: LLM = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "LLM" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
@@ -230,8 +228,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: "Task", itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.

         This includes things like tokenisation.
@@ -293,7 +291,7 @@ class VLLMModel(HuggingFaceEncoderModel):

         return dataset

-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.

         Args:
@@ -524,7 +522,7 @@ class VLLMModel(HuggingFaceEncoderModel):

     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.

@@ -558,8 +556,8 @@ class VLLMModel(HuggingFaceEncoderModel):

     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.

         Args:
@@ -628,8 +626,8 @@ class VLLMModel(HuggingFaceEncoderModel):


 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig
-) -> tuple[LLM, PreTrainedTokenizer]:
+    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+) -> tuple["LLM", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.

     Args:
@@ -1017,7 +1015,6 @@ def get_custom_stop_tokens(
     """
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

-    # Create a prompt to check if the model uses the reasoning tokens
     prompt = "Hello"
     if tokenizer.chat_template is not None:
         templated_prompt = tokenizer.apply_chat_template(
@@ -1028,7 +1025,6 @@ def get_custom_stop_tokens(
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt

-    # Check that the beginning-of-reasoning token is actually used by the model
     max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
     completion = (
         model.generate(
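Most of the changes in this file, and in the modules below, follow one pattern: heavyweight imports are moved under `typing.TYPE_CHECKING` and the corresponding annotations become quoted forward references, so the symbols are only imported by type checkers rather than at runtime. A minimal sketch of that pattern, using a hypothetical `heavy_library` module rather than EuroEval's own imports:

```python
import typing as t

if t.TYPE_CHECKING:
    # Only evaluated by static type checkers (mypy, pyright), never at runtime,
    # so importing an expensive module here adds no import-time cost.
    from heavy_library import HeavyModel  # hypothetical module and class


def run(model: "HeavyModel") -> str:
    # The quoted annotation is a forward reference: type checkers resolve it,
    # but at runtime it stays a plain string and HeavyModel need not exist.
    return type(model).__name__


print(run.__annotations__["model"])  # 'HeavyModel' -- still just a string at runtime
```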
euroeval/benchmarker.py
CHANGED
@@ -767,7 +767,7 @@ class Benchmarker:

         results = log_scores(
             dataset_name=dataset_config.pretty_name,
-
+            metrics=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
euroeval/callbacks.py
CHANGED
@@ -1,12 +1,16 @@
 """Callbacks for the Hugging Face Trainer."""

 import sys
+import typing as t
 from collections.abc import Sized

-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers.trainer_callback import ProgressCallback
-
+from transformers.trainer_callback import ProgressCallback
+
+if t.TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+    from transformers.trainer_callback import TrainerControl, TrainerState
+    from transformers.training_args import TrainingArguments


 class NeverLeaveProgressCallback(ProgressCallback):
@@ -20,9 +24,9 @@ class NeverLeaveProgressCallback(ProgressCallback):

     def on_train_begin(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when training begins."""
@@ -38,9 +42,9 @@ class NeverLeaveProgressCallback(ProgressCallback):

     def on_step_end(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when a training step ends."""
@@ -50,10 +54,10 @@ class NeverLeaveProgressCallback(ProgressCallback):

     def on_prediction_step(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        eval_dataloader: DataLoader | None = None,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
+        eval_dataloader: "DataLoader | None" = None,
         **kwargs: str,
     ) -> None:
         """Callback actions when a prediction step ends."""
euroeval/data_loading.py
CHANGED
@@ -3,23 +3,28 @@
 import logging
 import sys
 import time
+import typing as t

 import requests
-from datasets import
+from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator

-from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble

+if t.TYPE_CHECKING:
+    from datasets import Dataset
+
+    from .data_models import BenchmarkConfig, DatasetConfig
+
 logger = logging.getLogger("euroeval")


 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
-) -> list[DatasetDict]:
+) -> list["DatasetDict"]:
     """Load the raw bootstrapped datasets.

     Args:
@@ -56,7 +61,7 @@ def load_data(
         dataset["test"] = dataset["test"].select(range(1))

     # Bootstrap the splits
-    bootstrapped_splits: dict[str, list[Dataset]] = dict()
+    bootstrapped_splits: dict[str, list["Dataset"]] = dict()
     for split in ["train", "val", "test"]:
         bootstrap_indices = rng.integers(
             0,
@@ -80,7 +85,7 @@ def load_data(
     return datasets


-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     """Load the raw dataset.

     Args:
euroeval/data_models.py
CHANGED
@@ -1,6 +1,5 @@
 """Data models used in EuroEval."""

-import collections.abc as c
 import json
 import pathlib
 import re
@@ -11,48 +10,11 @@ import pydantic
 import torch

 from .enums import Device, InferenceBackend, ModelType, TaskGroup
+from .metrics import Metric
 from .types import ScoreDict
 from .utils import get_package_version


-@dataclass
-class MetricConfig:
-    """Configuration for a metric.
-
-    Attributes:
-        name:
-            The name of the metric.
-        pretty_name:
-            A longer prettier name for the metric, which allows cases and spaces. Used
-            for logging.
-        huggingface_id:
-            The Hugging Face ID of the metric.
-        results_key:
-            The name of the key used to extract the metric scores from the results
-            dictionary.
-        compute_kwargs:
-            Keyword arguments to pass to the metric's compute function. Defaults to
-            an empty dictionary.
-        postprocessing_fn:
-            A function to apply to the metric scores after they are computed, taking
-            the score to the postprocessed score along with its string representation.
-            Defaults to x -> (100 * x, f"{x:.2%}").
-    """
-
-    name: str
-    pretty_name: str
-    huggingface_id: str
-    results_key: str
-    compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
-    postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
-        default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
-    )
-
-    def __hash__(self) -> int:
-        """Return a hash of the metric configuration."""
-        return hash(self.name)
-
-
 @dataclass
 class Language:
     """A benchmarkable language.
@@ -147,7 +109,7 @@ class Task:
     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[MetricConfig]
+    metrics: list[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
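The removed `MetricConfig` dataclass is superseded by a `Metric` class in the new `euroeval/metrics.py` module (listed as +452 lines in the file summary above); its exact interface is not visible in this diff. What is visible is the default post-processing being removed here, which maps a raw score in [0, 1] to a percentage plus a display string. A minimal sketch of just that convention, with a hypothetical function name:

```python
def default_postprocessing(raw_score: float) -> tuple[float, str]:
    """Mirror of the removed default: scale to a percentage and format for logging."""
    return 100 * raw_score, f"{raw_score:.2%}"


score, pretty = default_postprocessing(0.75)
print(score, pretty)  # 75.0 75.00%
```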
euroeval/dataset_configs/english.py
CHANGED

@@ -49,10 +49,10 @@ CNN_DAILYMAIL_CONFIG = DatasetConfig(
     languages=[EN],
 )

-
-    name="
-    pretty_name="the
-    huggingface_id="EuroEval/
+LIFE_IN_THE_UK_CONFIG = DatasetConfig(
+    name="life-in-the-uk",
+    pretty_name="the English knowledge dataset Life in the UK",
+    huggingface_id="EuroEval/life-in-the-uk",
     task=KNOW,
     languages=[EN],
 )
@@ -86,3 +86,12 @@ BELEBELE_CONFIG = DatasetConfig(
     languages=[EN],
     unofficial=True,
 )
+
+MMLU_CONFIG = DatasetConfig(
+    name="mmlu",
+    pretty_name="the truncated version of the English knowledge dataset MMLU",
+    huggingface_id="EuroEval/mmlu-mini",
+    task=KNOW,
+    languages=[EN],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -76,6 +76,14 @@ NRK_QUIZ_QA_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
 )

+IDIOMS_NO_CONFIG = DatasetConfig(
+    name="idioms-no",
+    pretty_name="the Norwegian knowledge dataset Idioms-no",
+    huggingface_id="EuroEval/idioms-no",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
 NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
     name="nor-common-sense-qa",
     pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
euroeval/finetuning.py
CHANGED
@@ -5,7 +5,6 @@ import sys
 import typing as t

 import torch
-from datasets import DatasetDict
 from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
@@ -15,7 +14,6 @@ from transformers.trainer_callback import (
 from transformers.trainer_utils import IntervalStrategy
 from transformers.training_args import OptimizerNames, TrainingArguments

-from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
@@ -28,14 +26,17 @@ from .utils import (
 )

 if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

 logger = logging.getLogger("euroeval")


 def finetune(
-    model: BenchmarkModule,
-    datasets: list[DatasetDict],
+    model: "BenchmarkModule",
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -155,9 +156,9 @@ def finetune(


 def finetune_single_iteration(
-    model: BenchmarkModule | None,
-    dataset: DatasetDict,
-    training_args: TrainingArguments,
+    model: "BenchmarkModule | None",
+    dataset: "DatasetDict",
+    training_args: "TrainingArguments",
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -254,7 +255,7 @@ def get_training_args(
     iteration_idx: int,
     dtype: DataType,
     batch_size: int | None = None,
-) -> TrainingArguments:
+) -> "TrainingArguments":
     """Get the training arguments for the current iteration.

     Args:
euroeval/generation.py
CHANGED
@@ -6,10 +6,8 @@ import typing as t
 from pathlib import Path

 import more_itertools as mit
-from datasets import Dataset, DatasetDict
 from tqdm.auto import tqdm

-from .benchmark_modules import BenchmarkModule
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
 from .model_cache import (
@@ -20,6 +18,9 @@ from .model_cache import (
 from .utils import clear_memory

 if t.TYPE_CHECKING:
+    from datasets import Dataset, DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import (
         BenchmarkConfig,
         DatasetConfig,
@@ -32,7 +33,7 @@ logger = logging.getLogger("euroeval")

 def generate(
     model: "BenchmarkModule",
-    datasets: list[DatasetDict],
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -100,7 +101,7 @@ def generate(


 def generate_single_iteration(
-    dataset: Dataset,
+    dataset: "Dataset",
     model: "BenchmarkModule",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
euroeval/generation_utils.py
CHANGED
euroeval/human_evaluation.py
CHANGED
@@ -3,6 +3,7 @@
 import importlib.util
 import json
 import logging
+import typing as t
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
@@ -24,13 +25,15 @@ from .task_group_utils import (
     token_classification,
 )
 from .tasks import NER
-from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
 from .utils import enforce_reproducibility

 if importlib.util.find_spec("gradio") is not None:
     import gradio as gr
     from gradio.components import HTML, Button, Dropdown, Markdown, Textbox

+if t.TYPE_CHECKING:
+    from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
+
 logger = logging.getLogger("euroeval")


@@ -86,8 +89,8 @@ class HumanEvaluator:
             }
         )

-        self.extract_labels_from_generation: ExtractLabelsFunction
-        self.compute_metrics: ComputeMetricsFunction
+        self.extract_labels_from_generation: "ExtractLabelsFunction"
+        self.compute_metrics: "ComputeMetricsFunction"

     def create_app(self) -> "gr.Blocks":
         """Create the Gradio app for human evaluation.
@@ -342,7 +345,6 @@ class HumanEvaluator:
         self.compute_metrics = partial(
             sequence_classification.compute_metrics,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = partial(
             sequence_classification.extract_labels_from_generation,
@@ -362,7 +364,6 @@ class HumanEvaluator:
             token_classification.compute_metrics,
             has_misc_tags=self.has_misc_tags,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = partial(
             token_classification.extract_labels_from_generation,
@@ -372,7 +373,6 @@ class HumanEvaluator:
         self.compute_metrics = partial(
             question_answering.compute_metrics,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = (
             question_answering.extract_labels_from_generation
@@ -641,7 +641,7 @@ class HumanEvaluator:
         # only a single iteration, so the results from the current annotation should be
         # added to the previous results.
         results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
-        results: ScoreDict = defaultdict(list)
+        results: "ScoreDict" = defaultdict(list)
         if results_path.exists():
             all_results = [
                 json.loads(line.strip())
@@ -664,15 +664,15 @@ class HumanEvaluator:

         # Aggregate scores
         total_dict: dict[str, float] = dict()
-        for
+        for metric in self.dataset_config.task.metrics:
             test_score, test_se = aggregate_scores(
                 scores=results["raw"],  # type: ignore[arg-type]
-
+                metric=metric,
             )
-            test_score, _ =
-            test_se, _ =
-            total_dict[f"test_{
-            total_dict[f"test_{
+            test_score, _ = metric.postprocessing_fn(test_score)
+            test_se, _ = metric.postprocessing_fn(test_se)
+            total_dict[f"test_{metric.name}"] = test_score
+            total_dict[f"test_{metric.name}_se"] = test_se
         results["total"] = total_dict

         benchmark_result = BenchmarkResult(