EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/metrics/__init__.py
@@ -0,0 +1,6 @@
+ """All the metrics used in EuroEval."""
+
+ from .huggingface import *  # noqa: F403
+ from .llm_as_a_judge import *  # noqa: F403
+ from .pipeline import *  # noqa: F403
+ from .speed import *  # noqa: F403
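With these wildcard re-exports in place, the concrete metric objects defined in the submodules should be importable directly from euroeval.metrics. A minimal sketch, assuming the submodules define no restrictive __all__ so their public names are re-exported:

from euroeval.metrics import fluency_metric, mcc_metric  # re-exported metric instances

# Each metric carries its snake_case name and its display name.
print(mcc_metric.name, mcc_metric.pretty_name)
print(fluency_metric.name, fluency_metric.pretty_name)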
euroeval/metrics/base.py
@@ -0,0 +1,76 @@
+ """The abstract base class for all metrics."""
+
+ import abc
+ import collections.abc as c
+ import logging
+ import typing as t
+
+ if t.TYPE_CHECKING:
+     from datasets.arrow_dataset import Dataset
+
+     from ..data_models import BenchmarkConfig, DatasetConfig
+
+ logger: logging.Logger = logging.getLogger("euroeval")
+
+
+ class Metric(abc.ABC):
+     """Abstract base class for all metrics."""
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
+     ) -> None:
+         """Initialise the metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             postprocessing_fn:
+                 A function to apply to the metric scores after they are computed,
+                 taking the score to the postprocessed score along with its string
+                 representation. Defaults to x -> (100 * x, f"{x:.2%}").
+         """
+         self.name = name
+         self.pretty_name = pretty_name
+         self.postprocessing_fn = (
+             postprocessing_fn
+             if postprocessing_fn is not None
+             else lambda x: (100 * x, f"{x:.2%}")
+         )
+
+     @abc.abstractmethod
+     def __call__(
+         self,
+         predictions: c.Sequence,
+         references: c.Sequence,
+         dataset: "Dataset",
+         dataset_config: "DatasetConfig",
+         benchmark_config: "BenchmarkConfig",
+     ) -> float | None:
+         """Calculate the metric score.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 The ground truth references.
+             dataset:
+                 The dataset used for evaluation. This is only used in case any
+                 additional metadata is used to compute the metrics.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+         """
+         ...
+
+     def __hash__(self) -> int:
+         """Return a hash of the metric configuration."""
+         return hash(self.name)
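The new base class fixes the interface that every metric in the package follows: subclasses implement __call__ and inherit the default postprocessing_fn, which maps a raw score x to (100 * x, f"{x:.2%}"). Below is a minimal sketch of a downstream subclass, assuming only the import paths added in this release; the class name and scoring logic are illustrative and not part of EuroEval.

import collections.abc as c
import typing as t

from euroeval.metrics.base import Metric

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from euroeval.data_models import BenchmarkConfig, DatasetConfig


class ExactMatchRate(Metric):
    """Toy metric: fraction of predictions that equal their reference."""

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        if not predictions:
            return None
        matches = sum(p == r for p, r in zip(predictions, references))
        return matches / len(predictions)


metric = ExactMatchRate(name="exact_match_rate", pretty_name="Exact match rate")
print(metric.postprocessing_fn(0.875))  # default postprocessing -> (87.5, '87.50%')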
euroeval/metrics/huggingface.py
@@ -0,0 +1,192 @@
+ """All the Hugging Face metrics used in EuroEval."""
+
+ import collections.abc as c
+ import logging
+ import typing as t
+
+ import evaluate
+ import numpy as np
+
+ from ..utils import HiddenPrints
+ from .base import Metric
+
+ if t.TYPE_CHECKING:
+     from datasets.arrow_dataset import Dataset
+     from evaluate import EvaluationModule
+
+     from ..data_models import BenchmarkConfig, DatasetConfig
+
+ logger: logging.Logger = logging.getLogger("euroeval")
+
+
+ class HuggingFaceMetric(Metric):
+     """A metric which is implemented in the `evaluate` package.
+
+     Attributes:
+         name:
+             The name of the metric in snake_case.
+         pretty_name:
+             The pretty name of the metric, used for display purposes.
+         huggingface_id:
+             The Hugging Face ID of the metric.
+         results_key:
+             The name of the key used to extract the metric scores from the results
+             dictionary.
+         compute_kwargs:
+             Keyword arguments to pass to the metric's compute function. Defaults to
+             an empty dictionary.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         huggingface_id: str,
+         results_key: str,
+         compute_kwargs: dict[str, t.Any] | None = None,
+         postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
+     ) -> None:
+         """Initialise the Hugging Face metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             huggingface_id:
+                 The Hugging Face ID of the metric.
+             results_key:
+                 The name of the key used to extract the metric scores from the results
+                 dictionary.
+             compute_kwargs:
+                 Keyword arguments to pass to the metric's compute function. Defaults to
+                 an empty dictionary.
+             postprocessing_fn:
+                 A function to apply to the metric scores after they are computed, taking
+                 the score to the postprocessed score along with its string
+                 representation. Defaults to x -> (100 * x, f"{x:.2%}").
+         """
+         super().__init__(
+             name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
+         )
+         self.huggingface_id = huggingface_id
+         self.results_key = results_key
+         self.compute_kwargs: dict[str, t.Any] = (
+             dict() if compute_kwargs is None else compute_kwargs
+         )
+         self.metric: "EvaluationModule | None" = None
+
+     def __call__(
+         self,
+         predictions: c.Sequence,
+         references: c.Sequence,
+         dataset: "Dataset",
+         dataset_config: "DatasetConfig",
+         benchmark_config: "BenchmarkConfig",
+     ) -> float | None:
+         """Calculate the metric score.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 The ground truth references.
+             dataset:
+                 The dataset used for evaluation. This is only used in case any
+                 additional metadata is used to compute the metrics.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+         """
+         if self.metric is None:
+             self.metric = evaluate.load(path=self.huggingface_id)
+
+         with HiddenPrints():
+             results = self.metric.compute(
+                 predictions=predictions, references=references, **self.compute_kwargs
+             )
+
+         # The metric returns None if we are running on multi-GPU and the current
+         # process is not the main process
+         if results is None:
+             return None
+
+         # Convert the results to a float score
+         score = results[self.results_key]
+         if isinstance(score, list):
+             score = sum(score) / len(score)
+         if isinstance(score, np.floating):
+             score = float(score)
+
+         return score
+
+
+ mcc_metric = HuggingFaceMetric(
+     name="mcc",
+     pretty_name="Matthew's Correlation Coefficient",
+     huggingface_id="matthews_correlation",
+     results_key="matthews_correlation",
+ )
+
+ macro_f1_metric = HuggingFaceMetric(
+     name="macro_f1",
+     pretty_name="Macro-average F1-score",
+     huggingface_id="f1",
+     results_key="f1",
+     compute_kwargs=dict(average="macro"),
+ )
+
+ micro_f1_metric = HuggingFaceMetric(
+     name="micro_f1",
+     pretty_name="Micro-average F1-score with MISC tags",
+     huggingface_id="seqeval",
+     results_key="overall_f1",
+ )
+
+ micro_f1_no_misc_metric = HuggingFaceMetric(
+     name="micro_f1_no_misc",
+     pretty_name="Micro-average F1-score without MISC tags",
+     huggingface_id="seqeval",
+     results_key="overall_f1",
+ )
+
+ f1_metric = HuggingFaceMetric(
+     name="f1",
+     pretty_name="F1-score",
+     huggingface_id="squad_v2",
+     results_key="f1",
+     postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+ )
+
+ em_metric = HuggingFaceMetric(
+     name="em",
+     pretty_name="Exact Match",
+     huggingface_id="squad_v2",
+     results_key="exact",
+     postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+ )
+
+ bert_score_metric = HuggingFaceMetric(
+     name="bertscore",
+     pretty_name="BERTScore",
+     huggingface_id="bertscore",
+     results_key="f1",
+     compute_kwargs=dict(
+         model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
+     ),
+ )
+
+ rouge_l_metric = HuggingFaceMetric(
+     name="rouge_l", pretty_name="ROUGE-L", huggingface_id="rouge", results_key="rougeL"
+ )
+
+ accuracy_metric = HuggingFaceMetric(
+     name="accuracy",
+     pretty_name="Accuracy",
+     huggingface_id="accuracy",
+     results_key="accuracy",
+ )
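The Hugging Face metrics load their evaluate module lazily on the first call, and the __call__ shown above only reads the predictions, references and compute_kwargs. A minimal sketch of invoking one of the new metric instances outside a benchmark run; passing None for the unused dataset and config arguments is purely illustrative and not how EuroEval calls it internally:

from euroeval.metrics.huggingface import mcc_metric

predictions = [0, 1, 1, 0, 1]
references = [0, 1, 0, 0, 1]

# The shown __call__ only uses predictions, references and compute_kwargs,
# so the remaining arguments are stubbed with None for this sketch.
score = mcc_metric(
    predictions=predictions,
    references=references,
    dataset=None,  # type: ignore[arg-type]
    dataset_config=None,  # type: ignore[arg-type]
    benchmark_config=None,  # type: ignore[arg-type]
)
if score is not None:
    value, pretty = mcc_metric.postprocessing_fn(score)
    print(f"MCC: {pretty}")  # prints the postprocessed score as a percentage string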
euroeval/metrics/llm_as_a_judge.py
@@ -0,0 +1,257 @@
+ """Metrics based on LLM-as-a-judge."""
+
+ import collections.abc as c
+ import logging
+ import typing as t
+ from pathlib import Path
+
+ from pydantic import BaseModel, Field
+
+ from ..exceptions import InvalidBenchmark
+ from ..model_cache import ModelCache
+ from ..utils import extract_json_dict_from_string
+ from .base import Metric
+
+ if t.TYPE_CHECKING:
+     from datasets.arrow_dataset import Dataset
+
+     from ..data_models import BenchmarkConfig, DatasetConfig
+
+ logger: logging.Logger = logging.getLogger("euroeval")
+
+
+ class LLMAsAJudgeMetric(Metric):
+     """Use an LLM to judge the quality of the predictions."""
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         judge_id: str,
+         judge_kwargs: dict[str, t.Any],
+         user_prompt: str,
+         response_format: t.Type[BaseModel],
+         scoring_fn: t.Callable[[BaseModel | None], float],
+         condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
+         system_prompt: str | None = None,
+     ) -> None:
+         """Initialise the LLM as a judge metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             judge_id:
+                 The model ID of the LLM to use as a judge.
+             judge_kwargs:
+                 Generation parameters for the judge model, such as temperature.
+             user_prompt:
+                 The user prompt to use for the judge model. The prompt should be
+                 formatted with the variables `prediction` and `condition`, to
+                 include the model predictions and a description of what the prediction
+                 should be judged on, respectively. If the condition is not needed,
+                 it can be omitted from the prompt, but the `prediction` variable must
+                 still be present.
+             response_format:
+                 The response format to use for the judge model. This should be a
+                 Pydantic model that defines the expected structure of the judge's
+                 response.
+             scoring_fn:
+                 A function that takes the judge's response and returns a score.
+             condition_formatting_fn (optional):
+                 A function to format the condition string before it is included in the
+                 user prompt. Defaults to a no-op function that returns the input
+                 unchanged.
+             system_prompt (optional):
+                 The system prompt to use for the judge model. If not provided, no system
+                 prompt will be used.
+         """
+         super().__init__(name=name, pretty_name=pretty_name)
+         self.judge_id = judge_id
+         self.judge_kwargs = judge_kwargs
+         self.user_prompt = user_prompt
+         self.response_format = response_format
+         self.scoring_fn = scoring_fn
+         self.condition_formatting_fn = condition_formatting_fn
+         self.system_prompt = system_prompt
+
+         # Add response format to the generation kwargs
+         self.judge_kwargs["response_format"] = self.response_format
+
+     def __call__(
+         self,
+         predictions: c.Sequence,
+         references: c.Sequence,
+         dataset: "Dataset",
+         dataset_config: "DatasetConfig",
+         benchmark_config: "BenchmarkConfig",
+     ) -> float | None:
+         """Calculate the metric score using the judge model.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 The ground truth references.
+             dataset:
+                 The dataset used for evaluation. This is only used in case any
+                 additional metadata is used to compute the metrics.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+
+         Raises:
+             InvalidBenchmark:
+                 If the number of predictions does not match the number of references,
+                 or if the user prompt requires a condition but none is provided.
+         """
+         # Importing here to avoid circular imports
+         from ..benchmark_modules import LiteLLMModel
+
+         if not predictions or not references:
+             return None
+         elif len(predictions) != len(references):
+             raise InvalidBenchmark(
+                 f"The number of predictions ({len(predictions):,}) does not match the "
+                 f"number of references ({len(references):,})."
+             )
+
+         # Load the judge model
+         judge_model_config = LiteLLMModel.get_model_config(
+             model_id=self.judge_id, benchmark_config=benchmark_config
+         )
+         self.judge = LiteLLMModel(
+             model_config=judge_model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+             log_metadata=False,
+             **self.judge_kwargs,
+         )
+
+         # Create a cache for the judge model
+         judge_cache = ModelCache(
+             model_cache_dir=Path(judge_model_config.model_cache_dir),
+             cache_name=f"{dataset_config.name}-model-outputs.json",
+             max_generated_tokens=dataset_config.max_generated_tokens,
+         )
+         judge_cache.load()
+
+         # Prepare the messages for the LLM
+         conversations = [
+             [
+                 dict(
+                     role="user",
+                     content=self._apply_user_prompt(
+                         prediction=prediction, condition=condition
+                     ),
+                 )
+             ]
+             for prediction, condition in zip(predictions, references)
+         ]
+         if self.system_prompt:
+             conversations = [
+                 [dict(role="system", content=self.system_prompt), *conversation]
+                 for conversation in conversations
+             ]
+
+         # Get the non-cached conversations and generate the completions for them
+         non_cached_conversations = [
+             (idx, conversation)
+             for idx, conversation in enumerate(conversations)
+             if conversation not in judge_cache
+         ]
+         if non_cached_conversations:
+             model_inputs = dict(messages=[c for _, c in non_cached_conversations])
+             non_cached_outputs = self.judge.generate(inputs=model_inputs)
+
+             # Store the non-cached outputs in the cache
+             judge_cache.add_to_cache(
+                 model_inputs=model_inputs, model_output=non_cached_outputs
+             )
+             judge_cache.save()
+
+         # Load all the outputs from the cache, in the original order, and parse them
+         raw_outputs = [judge_cache[conversation] for conversation in conversations]
+         json_dicts = [
+             extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
+         ]
+         outputs = [
+             self.response_format.model_validate(obj=json_dict)
+             if json_dict is not None
+             else None
+             for json_dict in json_dicts
+         ]
+
+         # Calculate the scores using the scoring function
+         scores = [self.scoring_fn(output) for output in outputs]
+         if not scores:
+             logger.warning(f"No scores were calculated for {self.pretty_name}.")
+             return None
+         return sum(scores) / len(scores)
+
+     def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
+         """Apply the user prompt to the prediction and condition.
+
+         Args:
+             prediction:
+                 The model prediction.
+             condition (optional):
+                 A description of what the prediction should be judged on. If not
+                 provided, it will be omitted from the prompt.
+
+         Returns:
+             The formatted user prompt with the prediction and reference.
+
+         Raises:
+             InvalidBenchmark:
+                 If the user prompt requires a reference but none is provided.
+         """
+         condition_required = "{condition}" in self.user_prompt
+         if condition_required and condition is None:
+             raise InvalidBenchmark(
+                 f"The user prompt for the {self.pretty_name!r} metric requires a "
+                 "condition, but none was provided."
+             )
+         if condition is not None:
+             return self.user_prompt.format(
+                 prediction=prediction, condition=self.condition_formatting_fn(condition)
+             )
+         return self.user_prompt.format(prediction=prediction)
+
+
+ ### Fluency metric ###
+
+
+ class Fluency(BaseModel):
+     """Response format for the fluency metric.
+
+     Attributes:
+         fluency:
+             The fluency rating, an integer between 1 and 5.
+     """
+
+     fluency: t.Annotated[int, Field(ge=1, le=5)]
+
+
+ fluency_metric = LLMAsAJudgeMetric(
+     name="fluency",
+     pretty_name="Fluency",
+     judge_id="gpt-5-2025-08-07",
+     judge_kwargs=dict(temperature=1.0),
+     user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
+     "with the following definitions:\n"
+     "- 1: Very poor fluency, many grammatical errors\n"
+     "- 2: Poor fluency, several grammatical errors\n"
+     "- 3: Average fluency, a few grammatical errors\n"
+     "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
+     "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
+     "Text: {prediction!r}\n\n"
+     "Output your rating as a JSON object with a single key 'fluency'.",
+     response_format=Fluency,
+     scoring_fn=lambda output: (output.fluency - 1) / 4.0 if output is not None else 0.0,
+ )
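New judge-based metrics follow the same pattern as fluency_metric: a Pydantic response format, a user prompt with a {prediction} placeholder, and a scoring function that maps the parsed response to a value in [0, 1]. A hypothetical sketch of an additional metric in that style; the metric name, prompt and judge settings below are illustrative only and not part of this release:

import typing as t

from pydantic import BaseModel, Field

from euroeval.metrics.llm_as_a_judge import LLMAsAJudgeMetric


class Coherence(BaseModel):
    """Response format for a hypothetical coherence metric."""

    coherence: t.Annotated[int, Field(ge=1, le=5)]


coherence_metric = LLMAsAJudgeMetric(
    name="coherence",
    pretty_name="Coherence",
    judge_id="gpt-5-2025-08-07",  # assumed judge model, mirroring fluency_metric
    judge_kwargs=dict(temperature=1.0),
    user_prompt=(
        "Rate the coherence of the following text on a scale from 1 to 5, "
        "where 1 is incoherent and 5 is fully coherent.\n\n"
        "Text: {prediction!r}\n\n"
        "Output your rating as a JSON object with a single key 'coherence'."
    ),
    response_format=Coherence,
    # Map the 1-5 rating to a score in [0, 1], scoring missing responses as 0.
    scoring_fn=lambda output: (output.coherence - 1) / 4.0
    if output is not None
    else 0.0,
)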