EuroEval 15.10.1__py3-none-any.whl → 15.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +21 -25
- euroeval/benchmarker.py +1 -1
- euroeval/callbacks.py +17 -13
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +2 -40
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +1 -0
- euroeval/human_evaluation.py +13 -13
- euroeval/metrics.py +452 -0
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/METADATA +10 -10
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/RECORD +31 -30
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/licenses/LICENSE +1 -1
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/entry_points.txt +0 -0
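The central change in this release is the new euroeval/metrics.py module (+452 lines), which replaces the old MetricConfig-plus-evaluate.load(...) pattern used across the task-group utilities. The module itself is not part of this excerpt, so the sketch below only captures the contract that the rewritten call sites further down appear to rely on; the protocol name Metric is an assumption, while name, pretty_name, the keyword call signature and the float | None return type are taken from the hunks.

```python
# A sketch of the metric contract inferred from the rewritten compute_metrics
# functions in this diff -- not the actual contents of euroeval/metrics.py.
import typing as t


class Metric(t.Protocol):  # hypothetical name for the inferred interface
    name: str         # key used in the results dictionary
    pretty_name: str  # used in log messages

    def __call__(
        self, predictions: t.Sequence, references: t.Sequence
    ) -> float | None:
        """Compute the metric, returning None on non-main processes."""
        ...
```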
@@ -4,19 +4,16 @@ import logging
 import re
 import typing as t
 
-import evaluate
 import Levenshtein
 import numpy as np
-from evaluate import EvaluationModule
 
-from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -26,7 +23,6 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -36,8 +32,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
            The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -51,17 +45,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -89,27 +72,20 @@ def compute_metrics(
     ]
 
     results: dict[str, float] = dict()
-    for …
-        metric = …
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=label_ids, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=label_ids)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if …
-
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
 
 def extract_labels_from_generation(
     input_batch: dict[str, list],
-    model_output: GenerativeModelOutput,
+    model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
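The rewritten loop above suggests that each metric object is called directly on the predictions and label IDs and returns None on non-main processes in distributed runs. As a usage illustration only (a hedged sketch, not code from the package): the metric object name mcc_metric is taken from the tasks.py hunks further down, and the label IDs below are made-up example data.

```python
# Hypothetical usage of the new callable metrics; mcc_metric is referenced in
# the tasks.py hunk below, and the label IDs here are made-up example data.
from euroeval import metrics as m

predictions = [0, 2, 1, 1]
label_ids = [0, 2, 1, 0]

score = m.mcc_metric(predictions=predictions, references=label_ids)
if score is not None:  # None when not on the main process in multi-GPU runs
    print(f"{m.mcc_metric.pretty_name}: {score:.3f}")
```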
@@ -3,18 +3,17 @@
 import logging
 import typing as t
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..…
+from ..metrics import HuggingFaceMetric
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -51,17 +50,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     output_is_prob = model_output_dtype in [np.float16, np.float32, np.float64]
     if output_is_prob:
@@ -70,21 +58,18 @@ def compute_metrics(
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for …
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-
+    for metric in dataset_config.task.metrics:
         # Some metrics can be computed on hardware accelerators. In this case we
         # start by setting the device to the same device as the model
-        if …
-
+        if (
+            isinstance(metric, HuggingFaceMetric)
+            and metric.compute_kwargs.get("device", None) == "auto"
+        ):
+            metric.compute_kwargs["device"] = benchmark_config.device.type
 
         while True:
             try:
-
-                score_dict: dict[str, float] | None = metric.compute(
-                    predictions=predictions, references=labels, **cfg.compute_kwargs
-                )
+                score: float | None = metric(predictions=predictions, references=labels)
                 break
             except Exception as e:
                 oom_error = [
@@ -95,11 +80,14 @@ def compute_metrics(
                 if not any(error in str(e) for error in oom_error):
                     raise InvalidBenchmark(str(e))
 
-                if …
-
+                if (
+                    isinstance(metric, HuggingFaceMetric)
+                    and metric.compute_kwargs.get("device", "cpu") != "cpu"
+                ):
+                    metric.compute_kwargs["device"] = "cpu"
                     logger.debug(
                         "Out of memory error occurred during the computation of "
-                        f"the metric {…
+                        f"the metric {metric.pretty_name}. Moving the computation to "
                         "the CPU."
                     )
                 else:
@@ -109,17 +97,14 @@ def compute_metrics(
                     if hasattr(metric, attribute):
                         logger.debug(
                             f"Deleting the {attribute!r} attribute of the metric "
-                            f"{…
+                            f"{metric.pretty_name} to free up memory."
                         )
                         delattr(metric, attribute)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if …
-
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
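The hunks above (apparently one of the task_group_utils modules, given the METRIC_ATTRIBUTES_TAKING_UP_MEMORY import) keep the accelerator handling but move it onto the metric objects: a HuggingFaceMetric declared with device="auto" gets the benchmark device substituted in, and the computation is retried on the CPU after an out-of-memory error. The helper below is a self-contained sketch of that control flow rather than the EuroEval code; the duck-typed metric argument and the model_device parameter are assumptions.

```python
# Sketch of the device-resolution and CPU-fallback flow visible above.
# `metric` is any object with a `compute_kwargs` dict and a callable interface.
def compute_with_fallback(metric, predictions, references, model_device: str):
    if metric.compute_kwargs.get("device") == "auto":
        metric.compute_kwargs["device"] = model_device
    while True:
        try:
            return metric(predictions=predictions, references=references)
        except RuntimeError as exc:
            if "out of memory" not in str(exc).lower():
                raise
            if metric.compute_kwargs.get("device", "cpu") == "cpu":
                raise  # already on the CPU, nothing left to fall back to
            metric.compute_kwargs["device"] = "cpu"
```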
@@ -6,19 +6,17 @@ import typing as t
 from copy import deepcopy
 
 import demjson3
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
-from transformers.tokenization_utils import PreTrainedTokenizer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -29,7 +27,6 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -41,8 +38,6 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
            The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -55,17 +50,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
         raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
@@ -145,11 +129,14 @@ def compute_metrics(
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels
     )
     if predictions_all_zero and labels_all_zero:
-
+        micro_f1_score: float | None = 1.0
     else:
-        metric = …
-
-
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1"
+        )
+        micro_f1_score = metric(predictions=predictions, references=list(labels))
 
     # Compute the metrics without MISC tags
     # We manually set the F1 metric to be 100% if both the labels and the models
@@ -163,21 +150,22 @@ def compute_metrics(
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels_no_misc
     )
     if predictions_no_misc_all_zero and labels_no_misc_all_zero:
-
+        micro_f1_no_misc_score: float | None = 1.0
     else:
-        metric = …
-
-
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1_no_misc"
+        )
+        micro_f1_no_misc_score = metric(
             predictions=predictions_no_misc, references=labels_no_misc
         )
 
     # Raise error if the metrics are invalid
-    if …
+    if micro_f1_score is None or micro_f1_no_misc_score is None:
         raise InvalidBenchmark("The predictions and labels are not of the same length.")
 
-    return dict(
-        micro_f1_no_misc=results_no_misc["overall_f1"], micro_f1=results["overall_f1"]
-    )
+    return dict(micro_f1_no_misc=micro_f1_no_misc_score, micro_f1=micro_f1_score)
 
 
 def extract_labels_from_generation(
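The token-classification scoring above special-cases the degenerate situation where neither the labels nor the predictions contain any entities: seqeval's micro-F1 would be undefined there, so the score is pinned to 1.0 instead of being computed. A minimal sketch of that logic, assuming only the name attribute used in the next(...) lookup above:

```python
# Minimal sketch of the all-"o" special case in the NER scoring above.
def ner_micro_f1(metrics, predictions, labels):
    def all_outside(sequences):
        return all(tag == "o" for sequence in sequences for tag in sequence)

    if all_outside(predictions) and all_outside(labels):
        return 1.0  # no entities anywhere: treat as a perfect score
    metric = next(metric for metric in metrics if metric.name == "micro_f1")
    return metric(predictions=predictions, references=list(labels))
```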
euroeval/tasks.py CHANGED
@@ -1,6 +1,7 @@
 """All benchmarks tasks used in EuroEval."""
 
-from .…
+from . import metrics as m
+from .data_models import Task
 from .enums import TaskGroup
 from .prompt_templates import (
     LA_TEMPLATES,
@@ -25,21 +26,7 @@ LA = Task(
     name="linguistic-acceptability",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
     template_dict=LA_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="mcc",
-            pretty_name="Matthew's Correlation Coefficient",
-            huggingface_id="matthews_correlation",
-            results_key="matthews_correlation",
-        ),
-        MetricConfig(
-            name="macro_f1",
-            pretty_name="Macro-average F1-score",
-            huggingface_id="f1",
-            results_key="f1",
-            compute_kwargs=dict(average="macro"),
-        ),
-    ],
+    metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
     default_max_generated_tokens=5,
     default_labels=["correct", "incorrect"],
@@ -50,20 +37,7 @@ NER = Task(
     name="named-entity-recognition",
     task_group=TaskGroup.TOKEN_CLASSIFICATION,
     template_dict=NER_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="micro_f1_no_misc",
-            pretty_name="Micro-average F1-score without MISC tags",
-            huggingface_id="seqeval",
-            results_key="overall_f1",
-        ),
-        MetricConfig(
-            name="micro_f1",
-            pretty_name="Micro-average F1-score with MISC tags",
-            huggingface_id="seqeval",
-            results_key="overall_f1",
-        ),
-    ],
+    metrics=[m.micro_f1_no_misc_metric, m.micro_f1_metric],
     default_num_few_shot_examples=8,
     default_max_generated_tokens=128,
     default_labels=[
@@ -84,22 +58,7 @@ RC = Task(
     name="reading-comprehension",
     task_group=TaskGroup.QUESTION_ANSWERING,
     template_dict=RC_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="f1",
-            pretty_name="F1-score",
-            huggingface_id="squad_v2",
-            results_key="f1",
-            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
-        ),
-        MetricConfig(
-            name="em",
-            pretty_name="Exact Match",
-            huggingface_id="squad_v2",
-            results_key="exact",
-            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
-        ),
-    ],
+    metrics=[m.f1_metric, m.em_metric],
     default_num_few_shot_examples=4,
     default_max_generated_tokens=32,
     default_labels=["start_positions", "end_positions"],
@@ -110,21 +69,7 @@ SENT = Task(
     name="sentiment-classification",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
     template_dict=SENT_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="mcc",
-            pretty_name="Matthew's Correlation Coefficient",
-            huggingface_id="matthews_correlation",
-            results_key="matthews_correlation",
-        ),
-        MetricConfig(
-            name="macro_f1",
-            pretty_name="Macro-average F1-score",
-            huggingface_id="f1",
-            results_key="f1",
-            compute_kwargs=dict(average="macro"),
-        ),
-    ],
+    metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
     default_max_generated_tokens=5,
     default_labels=["positive", "neutral", "negative"],
@@ -135,23 +80,7 @@ SUMM = Task(
     name="summarization",
     task_group=TaskGroup.TEXT_TO_TEXT,
     template_dict=SUMM_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="bertscore",
-            pretty_name="BERTScore",
-            huggingface_id="bertscore",
-            results_key="f1",
-            compute_kwargs=dict(
-                model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
-            ),
-        ),
-        MetricConfig(
-            name="rouge_l",
-            pretty_name="ROUGE-L",
-            huggingface_id="rouge",
-            results_key="rougeL",
-        ),
-    ],
+    metrics=[m.bert_score_metric, m.rouge_l_metric],
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
@@ -162,20 +91,7 @@ KNOW = Task(
     name="knowledge",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="mcc",
-            pretty_name="Matthew's Correlation Coefficient",
-            huggingface_id="matthews_correlation",
-            results_key="matthews_correlation",
-        ),
-        MetricConfig(
-            name="accuracy",
-            pretty_name="Accuracy",
-            huggingface_id="accuracy",
-            results_key="accuracy",
-        ),
-    ],
+    metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
     default_max_generated_tokens=5,
     default_labels=["a", "b", "c", "d"],
@@ -186,20 +102,7 @@ MCRC = Task(
     name="multiple-choice-reading-comprehension",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="mcc",
-            pretty_name="Matthew's Correlation Coefficient",
-            huggingface_id="matthews_correlation",
-            results_key="matthews_correlation",
-        ),
-        MetricConfig(
-            name="accuracy",
-            pretty_name="Accuracy",
-            huggingface_id="accuracy",
-            results_key="accuracy",
-        ),
-    ],
+    metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
     default_max_generated_tokens=5,
     default_labels=["a", "b", "c", "d"],
@@ -210,20 +113,7 @@ COMMON_SENSE = Task(
     name="common-sense-reasoning",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
-    metrics=[
-        MetricConfig(
-            name="mcc",
-            pretty_name="Matthew's Correlation Coefficient",
-            huggingface_id="matthews_correlation",
-            results_key="matthews_correlation",
-        ),
-        MetricConfig(
-            name="accuracy",
-            pretty_name="Accuracy",
-            huggingface_id="accuracy",
-            results_key="accuracy",
-        ),
-    ],
+    metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
     default_max_generated_tokens=5,
     default_labels=["a", "b", "c", "d"],
@@ -234,22 +124,7 @@ SPEED = Task(
     name="speed",
     task_group=TaskGroup.SPEED,
     template_dict={},
-    metrics=[
-        MetricConfig(
-            name="speed",
-            pretty_name="Tokens per second",
-            huggingface_id="",
-            results_key="speed",
-            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
-        ),
-        MetricConfig(
-            name="speed_short",
-            pretty_name="Tokens per second on short documents",
-            huggingface_id="",
-            results_key="speed",
-            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
-        ),
-    ],
+    metrics=[m.speed_metric, m.speed_short_metric],
     default_num_few_shot_examples=0,
     default_max_generated_tokens=5,
     default_labels=[],
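Each task now points at module-level metric objects instead of inline MetricConfig declarations. Since euroeval/metrics.py is not part of this excerpt, the following is only a guess at what one of those objects might look like, reusing the fields that disappeared from the old MetricConfig entries above; the HuggingFaceMetric constructor signature is an assumption (only the class name itself appears in this diff).

```python
# Guessed shape of a module-level metric object in euroeval/metrics.py,
# mirroring the removed MetricConfig fields; the constructor arguments are
# assumptions, only the class name HuggingFaceMetric appears in this diff.
from euroeval.metrics import HuggingFaceMetric

mcc_metric = HuggingFaceMetric(
    name="mcc",
    pretty_name="Matthew's Correlation Coefficient",
    huggingface_id="matthews_correlation",
    results_key="matthews_correlation",
)
```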
euroeval/types.py CHANGED
@@ -2,16 +2,17 @@
 
 import typing as t
 
-from numpy.typing import NDArray
 from transformers.trainer_utils import EvalPrediction
 
 if t.TYPE_CHECKING:
+    from numpy.typing import NDArray
+
     from .data_models import GenerativeModelOutput
 
 
 ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
-Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
+Labels: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
 
 
 class ComputeMetricsFunction(t.Protocol):
@@ -21,7 +22,8 @@ class ComputeMetricsFunction(t.Protocol):
         self,
         model_outputs_and_labels: EvalPrediction
         | tuple[
-            NDArray | list[str] | list[list[str]],
+            "NDArray | list[str] | list[list[str]]",
+            "NDArray | list[str] | list[list[str]]",
         ],
     ) -> dict[str, float]:
         """Compute the metrics.
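The types.py change defers the NumPy import to type-checking time, which in turn forces the aliases to be strings: they are evaluated lazily by type checkers and never at runtime. A small self-contained illustration of the pattern, independent of EuroEval:

```python
# Illustration of TYPE_CHECKING-only imports with string type aliases:
# numpy is never imported at runtime just to annotate these signatures.
import typing as t

if t.TYPE_CHECKING:
    from numpy.typing import NDArray

Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"


def count_predictions(predictions: "Predictions") -> int:
    return len(predictions)
```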
@@ -1,14 +1,14 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.10.1
+Version: 15.11.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
-Author-email: Dan Saattrup …
-Maintainer-email: Dan Saattrup …
+Author-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
+Maintainer-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 License: MIT License
 
-        Copyright (c) 2022-…
+        Copyright (c) 2022-2025 Dan Saattrup Smart
 
         Permission is hereby granted, free of charge, to any person obtaining a copy
         of this software and associated documentation files (the "Software"), to deal
@@ -43,6 +43,7 @@ Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.5.1
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: peft>=0.15.0
+Requires-Dist: protobuf>=2.0.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
@@ -94,8 +95,7 @@ ______________________________________________________________________
 
 ## Maintainer
 
-- Dan Saattrup …
-  … dan.nielsen@alexandra.dk)
+- Dan Saattrup Smart ([@saattrupdan](https://github.com/saattrupdan), dan.smart@alexandra.dk)
 
 
 ## Installation
@@ -268,14 +268,14 @@ contributing new datasets, your help makes this project better for everyone.
 If you want to cite the framework then feel free to use this:
 
 ```
-@article{…
+@article{smart2024encoder,
   title={Encoder vs Decoder: Comparative Analysis of Encoder and Decoder Language Models on Multilingual NLU Tasks},
-  author={…
+  author={Smart, Dan Saattrup and Enevoldsen, Kenneth and Schneider-Kamp, Peter},
   journal={arXiv preprint arXiv:2406.13469},
   year={2024}
 }
-@inproceedings{…
-  author = {…
+@inproceedings{smart2023scandeval,
+  author = {Smart, Dan Saattrup},
   booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
   month = may,
   pages = {185--201},