EuroEval 15.10.1__py3-none-any.whl → 15.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +22 -26
- euroeval/benchmarker.py +8 -1
- euroeval/callbacks.py +17 -13
- euroeval/cli.py +10 -0
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +9 -40
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +27 -8
- euroeval/human_evaluation.py +14 -13
- euroeval/languages.py +1 -2
- euroeval/metrics.py +452 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
- euroeval-15.12.0.dist-info/RECORD +63 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
- euroeval-15.10.1.dist-info/RECORD +0 -61
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/generation.py
CHANGED
@@ -6,10 +6,8 @@ import typing as t
 from pathlib import Path
 
 import more_itertools as mit
-from datasets import Dataset, DatasetDict
 from tqdm.auto import tqdm
 
-from .benchmark_modules import BenchmarkModule
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
 from .model_cache import (
@@ -20,6 +18,9 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
+    from datasets import Dataset, DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import (
         BenchmarkConfig,
         DatasetConfig,
@@ -32,7 +33,7 @@ logger = logging.getLogger("euroeval")
 
 def generate(
     model: "BenchmarkModule",
-    datasets: list[DatasetDict],
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -100,7 +101,7 @@ def generate(
 
 
 def generate_single_iteration(
-    dataset: Dataset,
+    dataset: "Dataset",
     model: "BenchmarkModule",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -199,17 +200,35 @@ def generate_single_iteration(
     all_preds.extend(extracted_labels)
 
     if "label" in non_cached_dataset.column_names:
+        non_cached_labels = non_cached_dataset["label"]
+        if not isinstance(non_cached_labels, list):
+            non_cached_labels = list(non_cached_labels)
+        cached_labels = cached_dataset["label"]
+        if not isinstance(cached_labels, list):
+            cached_labels = list(cached_labels)
         ground_truth = [
             label.lower() if isinstance(label, str) else label
-            for label in
+            for label in non_cached_labels + cached_labels
         ]
     elif "labels" in non_cached_dataset.column_names:
+        non_cached_labels = non_cached_dataset["labels"]
+        if not isinstance(non_cached_labels, list):
+            non_cached_labels = list(non_cached_labels)
+        cached_labels = cached_dataset["labels"]
+        if not isinstance(cached_labels, list):
+            cached_labels = list(cached_labels)
         ground_truth = [
             [label.lower() if isinstance(label, str) else label for label in label_list]
-            for label_list in
+            for label_list in non_cached_labels + cached_labels
         ]
     elif "target_text" in non_cached_dataset.column_names:
-
+        non_cached_labels = non_cached_dataset["target_text"]
+        if not isinstance(non_cached_labels, list):
+            non_cached_labels = list(non_cached_labels)
+        cached_labels = cached_dataset["target_text"]
+        if not isinstance(cached_labels, list):
+            cached_labels = list(cached_labels)
+        ground_truth = non_cached_labels + cached_labels
     else:
         raise ValueError(
             "The dataset must have either a 'label', 'labels', or 'target_text' column"
@@ -305,7 +324,7 @@ def debug_log(
     ):
         logger.info(
             f"Input: '{input_text}'\n"
-            f"Raw
+            f"Raw output: '{raw_output}'\n"
             f"Prediction: '{prediction}'\n"
             f"Label: '{label}'"
         )
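
The generation.py changes above move the `datasets` and `BenchmarkModule` imports behind `t.TYPE_CHECKING` and quote the corresponding annotations. Below is a minimal, self-contained sketch of that deferred-import pattern, not EuroEval's actual module; the function and its body are illustrative only.

import typing as t

if t.TYPE_CHECKING:
    # Only evaluated by static type checkers (mypy, pyright), never at runtime,
    # so importing this module no longer pulls in the heavy `datasets` package.
    from datasets import DatasetDict


def generate(datasets: list["DatasetDict"]) -> None:
    # Quoted ("forward reference") annotations keep the module importable even
    # though DatasetDict is not bound at runtime.
    for dataset_dict in datasets:
        print(type(dataset_dict))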
euroeval/human_evaluation.py
CHANGED
@@ -3,6 +3,7 @@
 import importlib.util
 import json
 import logging
+import typing as t
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
@@ -24,13 +25,15 @@ from .task_group_utils import (
     token_classification,
 )
 from .tasks import NER
-from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
 from .utils import enforce_reproducibility
 
 if importlib.util.find_spec("gradio") is not None:
     import gradio as gr
     from gradio.components import HTML, Button, Dropdown, Markdown, Textbox
 
+if t.TYPE_CHECKING:
+    from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
+
 logger = logging.getLogger("euroeval")
 
 
@@ -86,8 +89,8 @@ class HumanEvaluator:
             }
         )
 
-        self.extract_labels_from_generation: ExtractLabelsFunction
-        self.compute_metrics: ComputeMetricsFunction
+        self.extract_labels_from_generation: "ExtractLabelsFunction"
+        self.compute_metrics: "ComputeMetricsFunction"
 
     def create_app(self) -> "gr.Blocks":
         """Create the Gradio app for human evaluation.
@@ -269,6 +272,7 @@ class HumanEvaluator:
             num_iterations=iteration + 1,
             api_base=None,
             api_version=None,
+            gpu_memory_utilization=0.9,
             debug=False,
             run_with_cli=True,
             only_allow_safetensors=False,
@@ -342,7 +346,6 @@
             self.compute_metrics = partial(
                 sequence_classification.compute_metrics,
                 dataset_config=self.dataset_config,
-                benchmark_config=benchmark_config,
             )
             self.extract_labels_from_generation = partial(
                 sequence_classification.extract_labels_from_generation,
@@ -362,7 +365,6 @@
                 token_classification.compute_metrics,
                 has_misc_tags=self.has_misc_tags,
                 dataset_config=self.dataset_config,
-                benchmark_config=benchmark_config,
             )
             self.extract_labels_from_generation = partial(
                 token_classification.extract_labels_from_generation,
@@ -372,7 +374,6 @@
             self.compute_metrics = partial(
                 question_answering.compute_metrics,
                 dataset_config=self.dataset_config,
-                benchmark_config=benchmark_config,
             )
             self.extract_labels_from_generation = (
                 question_answering.extract_labels_from_generation
@@ -641,7 +642,7 @@
         # only a single iteration, so the results from the current annotation should be
         # added to the previous results.
         results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
-        results: ScoreDict = defaultdict(list)
+        results: "ScoreDict" = defaultdict(list)
         if results_path.exists():
             all_results = [
                 json.loads(line.strip())
@@ -664,15 +665,15 @@
 
         # Aggregate scores
         total_dict: dict[str, float] = dict()
-        for
+        for metric in self.dataset_config.task.metrics:
             test_score, test_se = aggregate_scores(
                 scores=results["raw"],  # type: ignore[arg-type]
-
+                metric=metric,
             )
-            test_score, _ =
-            test_se, _ =
-            total_dict[f"test_{
-            total_dict[f"test_{
+            test_score, _ = metric.postprocessing_fn(test_score)
+            test_se, _ = metric.postprocessing_fn(test_se)
+            total_dict[f"test_{metric.name}"] = test_score
+            total_dict[f"test_{metric.name}_se"] = test_se
         results["total"] = total_dict
 
         benchmark_result = BenchmarkResult(
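
The aggregation loop above now iterates over the metric objects on `self.dataset_config.task.metrics` and uses each metric's `name` and `postprocessing_fn`. Below is a hedged sketch of what that loop produces, using a toy stand-in for EuroEval's `Metric` class; the metric names and score values are made up.

class ToyMetric:
    """Stand-in for euroeval.metrics.Metric, with only the attributes the loop uses."""

    def __init__(self, name: str) -> None:
        self.name = name
        # Default mirrors the Metric base class: x -> (100 * x, f"{x:.2%}")
        self.postprocessing_fn = lambda x: (100 * x, f"{x:.2%}")


total_dict: dict[str, float] = dict()
for metric in [ToyMetric("mcc"), ToyMetric("macro_f1")]:
    test_score, test_se = 0.734, 0.012  # stand-ins for aggregate_scores(...)
    test_score, _ = metric.postprocessing_fn(test_score)
    test_se, _ = metric.postprocessing_fn(test_se)
    total_dict[f"test_{metric.name}"] = test_score
    total_dict[f"test_{metric.name}_se"] = test_se

print(total_dict)  # roughly {'test_mcc': 73.4, 'test_mcc_se': 1.2, ...}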
euroeval/languages.py
CHANGED
@@ -36,7 +36,7 @@ NN = Language(
 )
 ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
 SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
-
+PT = Language(code="pt", name="Portuguese", _and_separator="e", _or_separator="ou")
 
 AB = Language(code="ab", name="Abkhazian")
 AA = Language(code="aa", name="Afar")
@@ -152,7 +152,6 @@ PI = Language(code="pi", name="Pali")
 PS = Language(code="ps", name="Pashto")
 FA = Language(code="fa", name="Persian")
 PL = Language(code="pl", name="Polish")
-PT = Language(code="pt", name="Portuguese")
 PA = Language(code="pa", name="Punjabi")
 QU = Language(code="qu", name="Quechua")
 RO = Language(code="ro", name="Romanian")
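
The Portuguese `Language` entry now carries `_and_separator`/`_or_separator`, which presumably feed the `{labels_str}` placeholders seen in the prompt templates further down. The toy helper below illustrates that kind of label joining; it is not EuroEval's implementation, and the function name is hypothetical.

def join_with_separator(labels: list[str], separator: str) -> str:
    # Toy illustration: ["sim", "não"] with "ou" -> "sim ou não"
    if len(labels) < 2:
        return "".join(labels)
    return ", ".join(labels[:-1]) + f" {separator} " + labels[-1]


print(join_with_separator(["sim", "não"], "ou"))  # sim ou não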
euroeval/metrics.py
ADDED
@@ -0,0 +1,452 @@
+"""All the metrics used in EuroEval."""
+
+import abc
+import logging
+import typing as t
+
+import evaluate
+import litellm
+from litellm.types.utils import Choices, ModelResponse
+from pydantic import BaseModel, Field
+from tqdm.auto import tqdm
+
+from .exceptions import InvalidBenchmark
+from .utils import HiddenPrints
+
+if t.TYPE_CHECKING:
+    from evaluate import EvaluationModule
+
+logger = logging.getLogger(__name__)
+
+
+class Metric(abc.ABC):
+    """Abstract base class for all metrics."""
+
+    def __init__(
+        self,
+        name: str,
+        pretty_name: str,
+        postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
+    ) -> None:
+        """Initialise the metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+            postprocessing_fn:
+                A function to apply to the metric scores after they are computed,
+                taking the score to the postprocessed score along with its string
+                representation. Defaults to x -> (100 * x, f"{x:.2%}").
+        """
+        self.name = name
+        self.pretty_name = pretty_name
+        self.postprocessing_fn = (
+            postprocessing_fn
+            if postprocessing_fn is not None
+            else lambda x: (100 * x, f"{x:.2%}")
+        )
+
+    @abc.abstractmethod
+    def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+        """Calculate the metric score.
+
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                The ground truth references.
+
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+        """
+        ...
+
+    def __hash__(self) -> int:
+        """Return a hash of the metric configuration."""
+        return hash(self.name)
+
+
+class HuggingFaceMetric(Metric):
+    """A metric which is implemented in the `evaluate` package.
+
+    Attributes:
+        name:
+            The name of the metric in snake_case.
+        pretty_name:
+            The pretty name of the metric, used for display purposes.
+        huggingface_id:
+            The Hugging Face ID of the metric.
+        results_key:
+            The name of the key used to extract the metric scores from the results
+            dictionary.
+        compute_kwargs:
+            Keyword arguments to pass to the metric's compute function. Defaults to
+            an empty dictionary.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        pretty_name: str,
+        huggingface_id: str,
+        results_key: str,
+        compute_kwargs: dict[str, t.Any] | None = None,
+        postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
+    ) -> None:
+        """Initialise the Hugging Face metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+            huggingface_id:
+                The Hugging Face ID of the metric.
+            results_key:
+                The name of the key used to extract the metric scores from the results
+                dictionary.
+            compute_kwargs:
+                Keyword arguments to pass to the metric's compute function. Defaults to
+                an empty dictionary.
+            postprocessing_fn:
+                A function to apply to the metric scores after they are computed, taking
+                the score to the postprocessed score along with its string
+                representation. Defaults to x -> (100 * x, f"{x:.2%}").
+        """
+        super().__init__(
+            name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
+        )
+        self.huggingface_id = huggingface_id
+        self.results_key = results_key
+        self.compute_kwargs: dict[str, t.Any] = (
+            dict() if compute_kwargs is None else compute_kwargs
+        )
+        self.metric: "EvaluationModule | None" = None
+
+    def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+        """Calculate the metric score.
+
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                The ground truth references.
+
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+        """
+        if self.metric is None:
+            self.metric = evaluate.load(path=self.huggingface_id)
+
+        with HiddenPrints():
+            results = self.metric.compute(
+                predictions=predictions, references=references, **self.compute_kwargs
+            )
+
+        # The metric returns None if we are running on multi-GPU and the current
+        # process is not the main process
+        if results is None:
+            return None
+
+        score = results[self.results_key]
+        if isinstance(score, list):
+            score = sum(score) / len(score)
+
+        return score
+
+
+class LLMAsAJudgeMetric(Metric):
+    """Use an LLM to judge the quality of the predictions."""
+
+    def __init__(
+        self,
+        name: str,
+        pretty_name: str,
+        judge_id: str,
+        judge_kwargs: dict[str, t.Any],
+        user_prompt: str,
+        response_format: t.Type[BaseModel],
+        scoring_fn: t.Callable[[BaseModel], float],
+        condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
+        system_prompt: str | None = None,
+    ) -> None:
+        """Initialise the LLM as a judge metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+            judge_id:
+                The model ID of the LLM to use as a judge.
+            judge_kwargs:
+                Generation parameters for the judge model, such as temperature.
+            user_prompt:
+                The user prompt to use for the judge model. The prompt should be
+                formatted with the variables `prediction` and `condition`, to
+                include the model predictions and a description of what the prediction
+                should be judged on, respectively. If the condition is not needed,
+                it can be omitted from the prompt, but the `prediction` variable must
+                still be present.
+            response_format:
+                The response format to use for the judge model. This should be a
+                Pydantic model that defines the expected structure of the judge's
+                response.
+            scoring_fn:
+                A function that takes the judge's response and returns a score.
+            condition_formatting_fn (optional):
+                A function to format the condition string before it is included in the
+                user prompt. Defaults to a no-op function that returns the input
+                unchanged.
+            system_prompt (optional):
+                The system prompt to use for the judge model. If not provided, no system
+                prompt will be used.
+        """
+        super().__init__(name=name, pretty_name=pretty_name)
+        self.judge_id = judge_id
+        self.judge_kwargs = judge_kwargs
+        self.user_prompt = user_prompt
+        self.response_format = response_format
+        self.scoring_fn = scoring_fn
+        self.condition_formatting_fn = condition_formatting_fn
+        self.system_prompt = system_prompt
+
+    def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+        """Calculate the metric score using the judge model.
+
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                The ground truth references.
+
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+
+        Raises:
+            InvalidBenchmark:
+                If the number of predictions does not match the number of references,
+                or if the user prompt requires a condition but none is provided.
+        """
+        if not predictions or not references:
+            return None
+        elif len(predictions) != len(references):
+            raise InvalidBenchmark(
+                f"The number of predictions ({len(predictions):,}) does not match the "
+                f"number of references ({len(references):,})."
+            )
+
+        # Prepare the messages for the LLM
+        conversations: list[list[dict[str, str]]] = [
+            [
+                dict(
+                    role="user",
+                    content=self._apply_user_prompt(
+                        prediction=prediction, condition=condition
+                    ),
+                )
+            ]
+            for prediction, condition in zip(predictions, references)
+        ]
+        if self.system_prompt:
+            conversations = [
+                [dict(role="system", content=self.system_prompt), *conversation]
+                for conversation in conversations
+            ]
+
+        # Get the judge generations
+        generations = [
+            litellm.completion(
+                model=self.judge_id,
+                messages=conversation,
+                response_format=self.response_format,
+                **self.judge_kwargs,
+            )
+            for conversation in tqdm(
+                iterable=conversations,
+                desc=f"Computing {self.pretty_name} scores",
+                unit="sample",
+            )
+        ]
+
+        # Extract the outputs from the generations
+        outputs: list[BaseModel] = list()
+        for generation in generations:
+            assert isinstance(generation, ModelResponse), (
+                f"The judge model did not return a valid response: {generation!r}"
+            )
+            choice = generation.choices[0]
+            assert isinstance(choice, Choices), (
+                f"The judge model did not return a valid choice: {choice!r}"
+            )
+            json_content = choice.message.content
+            assert json_content is not None, (
+                "The judge model returned a None content in the response message."
+            )
+            output = self.response_format.model_validate_json(json_data=json_content)
+            outputs.append(output)
+
+        # Calculate the scores using the scoring function
+        scores = [self.scoring_fn(output) for output in outputs]
+        if not scores:
+            logger.warning(f"No scores were calculated for {self.pretty_name}.")
+            return None
+        return sum(scores) / len(scores)
+
+    def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
+        """Apply the user prompt to the prediction and condition.
+
+        Args:
+            prediction:
+                The model prediction.
+            condition (optional):
+                A description of what the prediction should be judged on. If not
+                provided, it will be omitted from the prompt.
+
+        Returns:
+            The formatted user prompt with the prediction and reference.
+
+        Raises:
+            InvalidBenchmark:
+                If the user prompt requires a reference but none is provided.
+        """
+        condition_required = "{condition}" in self.user_prompt
+        if condition_required and condition is None:
+            raise InvalidBenchmark(
+                f"The user prompt for the {self.pretty_name!r} metric requires a "
+                "condition, but none was provided."
+            )
+        if condition is not None:
+            return self.user_prompt.format(
+                prediction=prediction, condition=self.condition_formatting_fn(condition)
+            )
+        return self.user_prompt.format(prediction=prediction)
+
+
+class SpeedMetric(Metric):
+    """Speed metric."""
+
+    def __init__(self, name: str, pretty_name: str) -> None:
+        """Initialise the speed metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+        """
+        super().__init__(
+            name=name,
+            pretty_name=pretty_name,
+            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
+        )
+
+    def __call__(self, _: t.Sequence, __: t.Sequence) -> float | None:
+        """Not used with the speed metric, but required for consistency."""
+        raise NotImplementedError
+
+
+mcc_metric = HuggingFaceMetric(
+    name="mcc",
+    pretty_name="Matthew's Correlation Coefficient",
+    huggingface_id="matthews_correlation",
+    results_key="matthews_correlation",
+)
+
+macro_f1_metric = HuggingFaceMetric(
+    name="macro_f1",
+    pretty_name="Macro-average F1-score",
+    huggingface_id="f1",
+    results_key="f1",
+    compute_kwargs=dict(average="macro"),
+)
+
+micro_f1_metric = HuggingFaceMetric(
+    name="micro_f1",
+    pretty_name="Micro-average F1-score with MISC tags",
+    huggingface_id="seqeval",
+    results_key="overall_f1",
+)
+
+micro_f1_no_misc_metric = HuggingFaceMetric(
+    name="micro_f1_no_misc",
+    pretty_name="Micro-average F1-score without MISC tags",
+    huggingface_id="seqeval",
+    results_key="overall_f1",
+)
+
+f1_metric = HuggingFaceMetric(
+    name="f1",
+    pretty_name="F1-score",
+    huggingface_id="squad_v2",
+    results_key="f1",
+    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+)
+
+em_metric = HuggingFaceMetric(
+    name="em",
+    pretty_name="Exact Match",
+    huggingface_id="squad_v2",
+    results_key="exact",
+    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+)
+
+bert_score_metric = HuggingFaceMetric(
+    name="bertscore",
+    pretty_name="BERTScore",
+    huggingface_id="bertscore",
+    results_key="f1",
+    compute_kwargs=dict(
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
+    ),
+)
+
+rouge_l_metric = HuggingFaceMetric(
+    name="rouge_l", pretty_name="ROUGE-L", huggingface_id="rouge", results_key="rougeL"
+)
+
+accuracy_metric = HuggingFaceMetric(
+    name="accuracy",
+    pretty_name="Accuracy",
+    huggingface_id="accuracy",
+    results_key="accuracy",
+)
+
+
+class Fluency(BaseModel):
+    """Response format for the fluency metric.
+
+    Attributes:
+        fluency:
+            The fluency rating, an integer between 1 and 5.
+    """
+
+    fluency: t.Annotated[int, Field(ge=1, le=5)]
+
+
+# Example LLM-as-a-judge metric, to measure the fluency of the LLM output
+fluency_metric = LLMAsAJudgeMetric(
+    name="fluency",
+    pretty_name="Fluency",
+    judge_id="gpt-4o-mini",
+    judge_kwargs=dict(temperature=0.0),
+    user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
+    "with the following definitions:\n"
+    "- 1: Very poor fluency, many grammatical errors\n"
+    "- 2: Poor fluency, several grammatical errors\n"
+    "- 3: Average fluency, a few grammatical errors\n"
+    "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
+    "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
+    "Text: {prediction!r}\n\n"
+    "Output your rating as a JSON object with a single key 'fluency'.",
+    response_format=Fluency,
+    scoring_fn=lambda output: (output.fluency - 1) / 4.0,
+)
+
+speed_metric = SpeedMetric(name="speed", pretty_name="Tokens per second")
+
+speed_short_metric = SpeedMetric(
+    name="speed_short", pretty_name="Tokens per second on short documents"
+)
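
The new euroeval/metrics.py module exposes its metric instances at module level, so a plausible usage sketch is the following. It assumes euroeval 15.12.0 is installed together with the `evaluate` package and the dependencies of the underlying Hugging Face `accuracy` metric; this snippet is an illustration, not taken from EuroEval's documentation.

from euroeval.metrics import accuracy_metric

predictions = [0, 1, 1, 0]
references = [0, 1, 0, 0]

# Calling a Metric lazily loads the underlying evaluate module
# (here evaluate.load("accuracy")) and returns the raw score.
raw_score = accuracy_metric(predictions=predictions, references=references)

# postprocessing_fn converts the raw score into a display value and string;
# the default is x -> (100 * x, f"{x:.2%}").
if raw_score is not None:
    value, text = accuracy_metric.postprocessing_fn(raw_score)
    print(value, text)  # 75.0 75.00%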
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Linguistic Acceptability task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 LA_TEMPLATES = {
     DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
         default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
         "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
+        default_prompt_prefix="Seguem-se abaixo textos e se são "
+        "gramaticalmente correctos",
+        default_prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
+        default_instruction_prompt="Texto: {text}\n\nDetermina se o texto é "
+        "gramaticalmente correcto ou não. Responde com {labels_str}, e nada mais.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
         default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "