EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/metrics/llm_as_a_judge.py
@@ -0,0 +1,260 @@
+"""Metrics based on LLM-as-a-judge."""
+
+import collections.abc as c
+import logging
+import typing as t
+from pathlib import Path
+
+from pydantic import BaseModel, Field
+
+from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
+from ..utils import extract_json_dict_from_string
+from .base import Metric
+
+if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
+
+    from ..data_models import BenchmarkConfig, DatasetConfig
+
+
+class LLMAsAJudgeMetric(Metric):
+    """Use an LLM to judge the quality of the predictions."""
+
+    def __init__(
+        self,
+        name: str,
+        pretty_name: str,
+        judge_id: str,
+        judge_kwargs: dict[str, t.Any],
+        user_prompt: str,
+        response_format: t.Type[BaseModel],
+        scoring_fn: t.Callable[[BaseModel | None], float],
+        condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
+        system_prompt: str | None = None,
+    ) -> None:
+        """Initialise the LLM as a judge metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+            judge_id:
+                The model ID of the LLM to use as a judge.
+            judge_kwargs:
+                Generation parameters for the judge model, such as temperature.
+            user_prompt:
+                The user prompt to use for the judge model. The prompt should be
+                formatted with the variables `prediction` and `condition`, to
+                include the model predictions and a description of what the prediction
+                should be judged on, respectively. If the condition is not needed,
+                it can be omitted from the prompt, but the `prediction` variable must
+                still be present.
+            response_format:
+                The response format to use for the judge model. This should be a
+                Pydantic model that defines the expected structure of the judge's
+                response.
+            scoring_fn:
+                A function that takes the judge's response and returns a score.
+            condition_formatting_fn (optional):
+                A function to format the condition string before it is included in the
+                user prompt. Defaults to a no-op function that returns the input
+                unchanged.
+            system_prompt (optional):
+                The system prompt to use for the judge model. If not provided, no system
+                prompt will be used.
+        """
+        super().__init__(name=name, pretty_name=pretty_name)
+        self.judge_id = judge_id
+        self.judge_kwargs = judge_kwargs
+        self.user_prompt = user_prompt
+        self.response_format = response_format
+        self.scoring_fn = scoring_fn
+        self.condition_formatting_fn = condition_formatting_fn
+        self.system_prompt = system_prompt
+
+        # Add response format to the generation kwargs
+        self.judge_kwargs["response_format"] = self.response_format
+
+    def __call__(
+        self,
+        predictions: c.Sequence,
+        references: c.Sequence,
+        dataset: "Dataset",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> float | None:
+        """Calculate the metric score using the judge model.
+
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                The ground truth references.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+            dataset_config:
+                The dataset configuration.
+            benchmark_config:
+                The benchmark configuration.
+
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+
+        Raises:
+            InvalidBenchmark:
+                If the number of predictions does not match the number of references,
+                or if the user prompt requires a condition but none is provided.
+        """
+        # Importing here to avoid circular imports
+        from ..benchmark_modules import LiteLLMModel
+        from ..model_cache import ModelCache
+
+        if not predictions or not references:
+            return None
+        elif len(predictions) != len(references):
+            raise InvalidBenchmark(
+                f"The number of predictions ({len(predictions):,}) does not match the "
+                f"number of references ({len(references):,})."
+            )
+
+        # Load the judge model
+        judge_model_config = LiteLLMModel.get_model_config(
+            model_id=self.judge_id, benchmark_config=benchmark_config
+        )
+        self.judge = LiteLLMModel(
+            model_config=judge_model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=False,
+            **self.judge_kwargs,
+        )
+
+        # Create a cache for the judge model
+        judge_cache = ModelCache(
+            model_cache_dir=Path(judge_model_config.model_cache_dir),
+            cache_name=f"{dataset_config.name}-model-outputs.json",
+            max_generated_tokens=dataset_config.max_generated_tokens,
+            progress_bar=benchmark_config.progress_bar,
+        )
+        judge_cache.load()
+
+        # Prepare the messages for the LLM
+        conversations = [
+            [
+                dict(
+                    role="user",
+                    content=self._apply_user_prompt(
+                        prediction=prediction, condition=condition
+                    ),
+                )
+            ]
+            for prediction, condition in zip(predictions, references)
+        ]
+        if self.system_prompt:
+            conversations = [
+                [dict(role="system", content=self.system_prompt), *conversation]
+                for conversation in conversations
+            ]
+
+        # Get the non-cached conversations and generate the completions for them
+        non_cached_conversations = [
+            (idx, conversation)
+            for idx, conversation in enumerate(conversations)
+            if conversation not in judge_cache
+        ]
+        if non_cached_conversations:
+            model_inputs = dict(messages=[c for _, c in non_cached_conversations])
+            non_cached_outputs = self.judge.generate(inputs=model_inputs)
+
+            # Store the non-cached outputs in the cache
+            judge_cache.add_to_cache(
+                model_inputs=model_inputs, model_output=non_cached_outputs
+            )
+            judge_cache.save()
+
+        # Load all the outputs from the cache, in the original order, and parse them
+        raw_outputs = [judge_cache[conversation] for conversation in conversations]
+        json_dicts = [
+            extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
+        ]
+        outputs = [
+            self.response_format.model_validate(obj=json_dict)
+            if json_dict is not None
+            else None
+            for json_dict in json_dicts
+        ]
+
+        # Calculate the scores using the scoring function
+        scores = [self.scoring_fn(output) for output in outputs]
+        if not scores:
+            log(
+                f"No scores were calculated for {self.pretty_name}.",
+                level=logging.WARNING,
+            )
+            return None
+        return sum(scores) / len(scores)
+
+    def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
+        """Apply the user prompt to the prediction and condition.
+
+        Args:
+            prediction:
+                The model prediction.
+            condition (optional):
+                A description of what the prediction should be judged on. If not
+                provided, it will be omitted from the prompt.
+
+        Returns:
+            The formatted user prompt with the prediction and condition.
+
+        Raises:
+            InvalidBenchmark:
+                If the user prompt requires a condition but none is provided.
+        """
+        condition_required = "{condition}" in self.user_prompt
+        if condition_required and condition is None:
+            raise InvalidBenchmark(
+                f"The user prompt for the {self.pretty_name!r} metric requires a "
+                "condition, but none was provided."
+            )
+        if condition is not None:
+            return self.user_prompt.format(
+                prediction=prediction, condition=self.condition_formatting_fn(condition)
+            )
+        return self.user_prompt.format(prediction=prediction)
+
+
+### Fluency metric ###
+
+
+class Fluency(BaseModel):
+    """Response format for the fluency metric.
+
+    Attributes:
+        fluency:
+            The fluency rating, an integer between 1 and 5.
+    """
+
+    fluency: t.Annotated[int, Field(ge=1, le=5)]
+
+
+fluency_metric = LLMAsAJudgeMetric(
+    name="fluency",
+    pretty_name="Fluency",
+    judge_id="gpt-5-2025-08-07",
+    judge_kwargs=dict(temperature=1.0),
+    user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
+    "with the following definitions:\n"
+    "- 1: Very poor fluency, many grammatical errors\n"
+    "- 2: Poor fluency, several grammatical errors\n"
+    "- 3: Average fluency, a few grammatical errors\n"
+    "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
+    "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
+    "Text: {prediction!r}\n\n"
+    "Output your rating as a JSON object with a single key 'fluency'.",
+    response_format=Fluency,
+    scoring_fn=lambda output: (output.fluency - 1) / 4.0 if output is not None else 0.0,
)
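
For orientation, a custom judge metric can be defined by following the same pattern as `fluency_metric` above: a Pydantic response format plus an `LLMAsAJudgeMetric` instance. The sketch below is illustrative only and not part of the package; the `Conciseness` model, the judge ID, and the prompt are placeholder assumptions.

import typing as t

from pydantic import BaseModel, Field

from euroeval.metrics.llm_as_a_judge import LLMAsAJudgeMetric


class Conciseness(BaseModel):
    """Hypothetical response format: a conciseness rating between 1 and 5."""

    conciseness: t.Annotated[int, Field(ge=1, le=5)]


# Hypothetical metric mirroring `fluency_metric`; the judge ID, generation kwargs and
# prompt are placeholders, not values shipped with EuroEval.
conciseness_metric = LLMAsAJudgeMetric(
    name="conciseness",
    pretty_name="Conciseness",
    judge_id="gpt-4o-2024-08-06",
    judge_kwargs=dict(temperature=0.0),
    user_prompt=(
        "Rate how concise the following text is on a scale from 1 (very verbose) "
        "to 5 (very concise).\n\n"
        "Text: {prediction!r}\n\n"
        "Output your rating as a JSON object with a single key 'conciseness'."
    ),
    response_format=Conciseness,
    scoring_fn=lambda output: (output.conciseness - 1) / 4.0
    if output is not None
    else 0.0,
)

At evaluation time the scoring still goes through `LLMAsAJudgeMetric.__call__`, which receives the predictions, references, dataset and configs from the benchmark runner.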
euroeval/metrics/pipeline.py
@@ -0,0 +1,289 @@
+"""Metrics based on a scikit-learn Pipeline."""
+
+import collections.abc as c
+import logging
+import typing as t
+from pathlib import Path
+
+import cloudpickle
+import huggingface_hub as hf_hub
+import numpy as np
+from scipy.special import expit as sigmoid
+
+from ..exceptions import InvalidBenchmark
+from ..logging_utils import log, no_terminal_output
+from ..utils import unscramble
+from .base import Metric
+
+if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
+    from sklearn.pipeline import Pipeline
+
+    from ..data_models import BenchmarkConfig, DatasetConfig
+
+
+T = t.TypeVar("T", bound=int | float | str | bool)
+
+
+class PreprocessingFunction(t.Protocol):
+    """A protocol for a preprocessing function."""
+
+    def __call__(
+        self, predictions: c.Sequence[int], dataset: "Dataset"
+    ) -> c.Sequence[int]:
+        """Preprocess the model predictions before they are passed to the pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+
+        Returns:
+            The preprocessed model predictions.
+        """
+        ...
+
+
+class PipelineMetric(Metric):
+    """Load a scikit-learn pipeline and use it to get scores from the predictions."""
+
+    def __init__(
+        self,
+        name: str,
+        pretty_name: str,
+        pipeline_repo: str,
+        pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
+        pipeline_file_name: str = "pipeline.pkl",
+        preprocessing_fn: PreprocessingFunction | None = None,
+        postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
+    ) -> None:
+        """Initialise the pipeline transform metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+            pipeline_repo:
+                The Hugging Face repository ID of the scikit-learn pipeline to load.
+            pipeline_scoring_function:
+                The function to use for scoring the predictions with the pipeline.
+                Takes the pipeline and a 1D sequence of predictions and returns a float.
+            pipeline_file_name (optional):
+                The name of the file to download from the Hugging Face repository.
+                Defaults to "pipeline.pkl".
+            preprocessing_fn (optional):
+                A function to apply to the predictions before they are passed to the
+                pipeline. This is useful for preprocessing the predictions to match
+                the expected input format of the pipeline. Defaults to a no-op function
+                that returns the input unchanged.
+            postprocessing_fn (optional):
+                A function to apply to the metric scores after they are computed,
+                taking the score to the postprocessed score along with its string
+                representation. Defaults to x -> (100 * x, f"{x:.2%}").
+        """
+        super().__init__(
+            name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
+        )
+        self.pipeline_repo = pipeline_repo
+        self.pipeline_file_name = pipeline_file_name
+        self.pipeline_scoring_function = pipeline_scoring_function
+        self.pipeline: "Pipeline | None" = None
+        self.preprocessing_fn = preprocessing_fn
+
+    def __call__(
+        self,
+        predictions: c.Sequence,
+        references: c.Sequence,
+        dataset: "Dataset",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> float | None:
+        """Calculate the metric score using the scikit-learn pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                Not used, but required for consistency with the Metric interface.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+            dataset_config:
+                The dataset configuration.
+            benchmark_config:
+                The benchmark configuration.
+
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+        """
+        if self.pipeline is None:
+            self.pipeline = self._download_pipeline(
+                cache_dir=benchmark_config.cache_dir
+            )
+        if self.preprocessing_fn is not None:
+            predictions = self.preprocessing_fn(
+                predictions=predictions, dataset=dataset
+            )
+        return self.pipeline_scoring_function(self.pipeline, predictions)
+
+    def _download_pipeline(self, cache_dir: str) -> "Pipeline":
+        """Download the scikit-learn pipeline from the Hugging Face Hub.
+
+        Args:
+            cache_dir:
+                The directory to use for caching the downloaded pipeline.
+
+        Returns:
+            The downloaded scikit-learn pipeline.
+
+        Raises:
+            InvalidBenchmark:
+                If the loading of the pipeline fails for any reason.
+        """
+        log(f"Loading pipeline from {self.pipeline_repo}...", level=logging.DEBUG)
+        with no_terminal_output():
+            folder_path = hf_hub.HfApi(
+                token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_")
+            ).snapshot_download(
+                repo_id=self.pipeline_repo, repo_type="model", cache_dir=cache_dir
+            )
+        model_path = Path(folder_path, self.pipeline_file_name)
+        try:
+            with model_path.open(mode="rb") as f:
+                pipeline = cloudpickle.load(f)
+        except Exception as e:
+            raise InvalidBenchmark(
+                f"Failed to load pipeline from {self.pipeline_repo!r}: {e}"
+            ) from e
+        log(f"Successfully loaded pipeline: {pipeline}", level=logging.DEBUG)
+        return pipeline
+
+
+### European Values Metric ###
+
+
+def european_values_preprocessing_fn(
+    predictions: c.Sequence[int], dataset: "Dataset"
+) -> c.Sequence[int]:
+    """Preprocess the model predictions for the European Values metric.
+
+    Args:
+        predictions:
+            The model predictions, a sequence of integers representing the predicted
+            choices for each question.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
+
+    Returns:
+        The preprocessed model predictions, a sequence of integers representing the
+        final predicted choices for each question after any necessary aggregation and
+        mapping.
+
+    Raises:
+        AssertionError:
+            If the number of predictions is not a multiple of 53, which is required
+            for the European Values metric.
+    """
+    num_questions = 53
+    num_phrasings_per_question = 5
+
+    # Convert the predictions to integers
+    integer_predictions = []
+    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+        idx_to_choice = {
+            int(idx): int(choice)
+            for idx, choice in idx_to_choice.items()
+            if choice is not None
+        }
+        if prediction not in idx_to_choice:
+            raise InvalidBenchmark(
+                f"The prediction {prediction} is not a valid index for the "
+                f"question with choices {idx_to_choice}."
+            )
+        integer_prediction = idx_to_choice[prediction]
+        integer_predictions.append(integer_prediction)
+
+    assert len(predictions) % num_questions == 0, (
+        f"The number of predictions ({len(predictions)}) is not a multiple of "
+        f"{num_questions}, which is required for the European Values metric."
+    )
+
+    # When we are using the situational version of the dataset, there are 5 phrasings
+    # for each question, so we need to aggregate the predictions by question, which we
+    # do using majority voting.
+    using_situational = len(predictions) == num_questions * num_phrasings_per_question
+    if using_situational:
+        # Reshape the predictions to a 2D array with `num_questions` rows (one for
+        # each question) and `num_phrasings_per_question` columns (one for each
+        # phrasing). The five phrasings for each question appear right after each
+        # other, e.g., (0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, ...)
+        # Shape: (num_questions, num_phrasings_per_question)
+        arr = np.array(
+            [
+                integer_predictions[i : i + num_phrasings_per_question]
+                for i in range(0, len(predictions), num_phrasings_per_question)
+            ]
+        )
+
+        # Double check that we reshaped the predictions correctly
+        for idx, pred in enumerate(integer_predictions):
+            assert arr[idx // 5, idx % 5] == pred, (
+                f"Reshaped predictions do not match the original predictions at index "
+                f"{idx}: {arr[idx // 5, idx % 5]} != {pred}."
+            )
+
+        # Use majority voting to get the final prediction for each question
+        # Shape: (53,)
+        arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
+
+        # Convert the array to a list
+        integer_predictions = arr.tolist()
+
+    # Some of the questions are categorical and we're only interested in whether the
+    # model chooses a specific choice or not. This mapping takes the question index
+    # to the choice value that we're interested in.
+    question_choices = {
+        0: 1,
+        1: 5,
+        3: 3,
+        6: 1,
+        15: 4,
+        20: 2,
+        47: 8,
+        48: 7,
+        49: 4,
+        51: 4,
+        52: 4,
+    }
+
+    # Map the predictions to the choices we're interested in
+    integer_predictions = list(integer_predictions)
+    for question_idx, choice in question_choices.items():
+        integer_predictions[question_idx] = (
+            1 if integer_predictions[question_idx] == choice else 0
+        )
+
+    return integer_predictions
+
+
+def european_values_scoring_function(
+    pipeline: "Pipeline", predictions: c.Sequence[int]
+) -> float:
+    """Scoring function for the European Values metric."""
+    normalised_predictions = pipeline[0].transform([predictions])
+    log_likelihoods = pipeline[1].transform(normalised_predictions)[0]
+    score = sigmoid(pipeline[2].alpha_ * (log_likelihoods - pipeline[2].center_))
+    return score.item()
+
+
+european_values_metric = PipelineMetric(
+    name="european_values",
+    pretty_name="European Values",
+    pipeline_repo="EuroEval/european-values-pipeline",
+    pipeline_scoring_function=european_values_scoring_function,
+    preprocessing_fn=european_values_preprocessing_fn,
+)
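
For orientation, a new pipeline-backed metric would be wired up the same way as `european_values_metric` above. The sketch below is not part of the package: the repository ID and file name are placeholders, and it assumes the downloaded pipeline's final estimator exposes `predict_proba` for a binary classifier, which will not hold for every pipeline.

import collections.abc as c
import typing as t

from euroeval.metrics.pipeline import PipelineMetric

if t.TYPE_CHECKING:
    from sklearn.pipeline import Pipeline


def positive_probability_scoring_function(
    pipeline: "Pipeline", predictions: c.Sequence[int]
) -> float:
    """Hypothetical scoring function: positive-class probability for one sample."""
    # The whole prediction vector is treated as a single sample, mirroring how the
    # European Values scoring function passes `[predictions]` to the pipeline.
    # Assumes the pipeline's final estimator implements `predict_proba`.
    probabilities = pipeline.predict_proba([predictions])
    return float(probabilities[0, 1])


# Hypothetical metric; the repo ID and file name below do not refer to real artefacts.
example_pipeline_metric = PipelineMetric(
    name="example_pipeline_metric",
    pretty_name="Example Pipeline Metric",
    pipeline_repo="my-org/my-sklearn-pipeline",
    pipeline_scoring_function=positive_probability_scoring_function,
    pipeline_file_name="pipeline.pkl",
)

As with the European Values metric, the pipeline is downloaded lazily on the first call and any preprocessing of predictions is handled via `preprocessing_fn`.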
euroeval/metrics/speed.py
@@ -0,0 +1,48 @@
+"""Inference speed metric."""
+
+import collections.abc as c
+import typing as t
+
+from .base import Metric
+
+if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
+
+    from ..data_models import BenchmarkConfig, DatasetConfig
+
+
+class SpeedMetric(Metric):
+    """Speed metric."""
+
+    def __init__(self, name: str, pretty_name: str) -> None:
+        """Initialise the speed metric.
+
+        Args:
+            name:
+                The name of the metric in snake_case.
+            pretty_name:
+                The pretty name of the metric, used for display purposes.
+        """
+        super().__init__(
+            name=name,
+            pretty_name=pretty_name,
+            postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
+        )
+
+    def __call__(
+        self,
+        predictions: c.Sequence,
+        references: c.Sequence,
+        dataset: "Dataset",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> float | None:
+        """Not used with the speed metric, but required for consistency."""
+        raise NotImplementedError
+
+
+speed_metric = SpeedMetric(name="speed", pretty_name="Tokens per second")
+
+speed_short_metric = SpeedMetric(
+    name="speed_short", pretty_name="Tokens per second on short documents"
+)
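
The `postprocessing_fn` passed above only controls how the raw tokens-per-second value is rendered in the results. A standalone illustration of that formatting (not EuroEval code):

def postprocess(raw_score: float) -> tuple[float, str]:
    # Same formatting as the lambda handed to the Metric base class by SpeedMetric.
    return raw_score, f"{raw_score:,.0f}"


print(postprocess(1234.56))   # (1234.56, '1,235')
print(postprocess(987654.3))  # (987654.3, '987,654')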