EuroEval 15.10.1-py3-none-any.whl → 15.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. euroeval/__init__.py +7 -0
  2. euroeval/benchmark_config_factory.py +7 -0
  3. euroeval/benchmark_modules/base.py +29 -29
  4. euroeval/benchmark_modules/fresh.py +31 -19
  5. euroeval/benchmark_modules/hf.py +27 -23
  6. euroeval/benchmark_modules/litellm.py +50 -30
  7. euroeval/benchmark_modules/vllm.py +22 -26
  8. euroeval/benchmarker.py +8 -1
  9. euroeval/callbacks.py +17 -13
  10. euroeval/cli.py +10 -0
  11. euroeval/data_loading.py +10 -5
  12. euroeval/data_models.py +9 -40
  13. euroeval/dataset_configs/__init__.py +1 -0
  14. euroeval/dataset_configs/english.py +13 -4
  15. euroeval/dataset_configs/norwegian.py +8 -0
  16. euroeval/dataset_configs/portuguese.py +74 -0
  17. euroeval/dataset_configs/spanish.py +4 -3
  18. euroeval/finetuning.py +9 -8
  19. euroeval/generation.py +27 -8
  20. euroeval/human_evaluation.py +14 -13
  21. euroeval/languages.py +1 -2
  22. euroeval/metrics.py +452 -0
  23. euroeval/prompt_templates/linguistic_acceptability.py +9 -1
  24. euroeval/prompt_templates/multiple_choice.py +9 -1
  25. euroeval/prompt_templates/named_entity_recognition.py +20 -1
  26. euroeval/prompt_templates/sentiment_classification.py +11 -1
  27. euroeval/prompt_templates/summarization.py +8 -1
  28. euroeval/scores.py +14 -19
  29. euroeval/speed_benchmark.py +6 -7
  30. euroeval/task_group_utils/multiple_choice_classification.py +6 -4
  31. euroeval/task_group_utils/question_answering.py +5 -28
  32. euroeval/task_group_utils/sequence_classification.py +6 -30
  33. euroeval/task_group_utils/text_to_text.py +19 -34
  34. euroeval/task_group_utils/token_classification.py +18 -30
  35. euroeval/tasks.py +11 -136
  36. euroeval/types.py +6 -4
  37. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
  38. euroeval-15.12.0.dist-info/RECORD +63 -0
  39. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
  40. euroeval-15.10.1.dist-info/RECORD +0 -61
  41. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
  42. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/generation.py CHANGED
@@ -6,10 +6,8 @@ import typing as t
  from pathlib import Path
 
  import more_itertools as mit
- from datasets import Dataset, DatasetDict
  from tqdm.auto import tqdm
 
- from .benchmark_modules import BenchmarkModule
  from .enums import BatchingPreference, TaskGroup
  from .exceptions import InvalidBenchmark
  from .model_cache import (
@@ -20,6 +18,9 @@ from .model_cache import (
  from .utils import clear_memory
 
  if t.TYPE_CHECKING:
+     from datasets import Dataset, DatasetDict
+
+     from .benchmark_modules import BenchmarkModule
      from .data_models import (
          BenchmarkConfig,
          DatasetConfig,
@@ -32,7 +33,7 @@ logger = logging.getLogger("euroeval")
 
  def generate(
      model: "BenchmarkModule",
-     datasets: list[DatasetDict],
+     datasets: list["DatasetDict"],
      model_config: "ModelConfig",
      dataset_config: "DatasetConfig",
      benchmark_config: "BenchmarkConfig",
@@ -100,7 +101,7 @@
 
 
  def generate_single_iteration(
-     dataset: Dataset,
+     dataset: "Dataset",
      model: "BenchmarkModule",
      dataset_config: "DatasetConfig",
      benchmark_config: "BenchmarkConfig",
@@ -199,17 +200,35 @@ def generate_single_iteration(
      all_preds.extend(extracted_labels)
 
      if "label" in non_cached_dataset.column_names:
+         non_cached_labels = non_cached_dataset["label"]
+         if not isinstance(non_cached_labels, list):
+             non_cached_labels = list(non_cached_labels)
+         cached_labels = cached_dataset["label"]
+         if not isinstance(cached_labels, list):
+             cached_labels = list(cached_labels)
          ground_truth = [
              label.lower() if isinstance(label, str) else label
-             for label in non_cached_dataset["label"] + cached_dataset["label"]
+             for label in non_cached_labels + cached_labels
          ]
      elif "labels" in non_cached_dataset.column_names:
+         non_cached_labels = non_cached_dataset["labels"]
+         if not isinstance(non_cached_labels, list):
+             non_cached_labels = list(non_cached_labels)
+         cached_labels = cached_dataset["labels"]
+         if not isinstance(cached_labels, list):
+             cached_labels = list(cached_labels)
          ground_truth = [
              [label.lower() if isinstance(label, str) else label for label in label_list]
-             for label_list in non_cached_dataset["labels"] + cached_dataset["labels"]
+             for label_list in non_cached_labels + cached_labels
          ]
      elif "target_text" in non_cached_dataset.column_names:
-         ground_truth = non_cached_dataset["target_text"] + cached_dataset["target_text"]
+         non_cached_labels = non_cached_dataset["target_text"]
+         if not isinstance(non_cached_labels, list):
+             non_cached_labels = list(non_cached_labels)
+         cached_labels = cached_dataset["target_text"]
+         if not isinstance(cached_labels, list):
+             cached_labels = list(cached_labels)
+         ground_truth = non_cached_labels + cached_labels
      else:
          raise ValueError(
              "The dataset must have either a 'label', 'labels', or 'target_text' column"
@@ -305,7 +324,7 @@ def debug_log(
      ):
          logger.info(
              f"Input: '{input_text}'\n"
-             f"Raw outout: '{raw_output}'\n"
+             f"Raw output: '{raw_output}'\n"
              f"Prediction: '{prediction}'\n"
              f"Label: '{label}'"
          )
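
Note on the generation.py change above: the new isinstance guards presumably exist because indexing a Dataset column is not guaranteed to return a plain Python list on every version of the datasets library (recent releases can return a lazy column object), so both the cached and non-cached label columns are coerced before being concatenated. Below is a minimal, self-contained sketch of the same pattern; the toy labels are purely illustrative.

from datasets import Dataset

non_cached = Dataset.from_dict({"label": ["Positive", "NEGATIVE"]})
cached = Dataset.from_dict({"label": ["neutral"]})

# Column access may not return a plain list, so coerce both sides before
# concatenating them, mirroring the guards added in generate_single_iteration.
non_cached_labels = non_cached["label"]
if not isinstance(non_cached_labels, list):
    non_cached_labels = list(non_cached_labels)
cached_labels = cached["label"]
if not isinstance(cached_labels, list):
    cached_labels = list(cached_labels)

ground_truth = [
    label.lower() if isinstance(label, str) else label
    for label in non_cached_labels + cached_labels
]
print(ground_truth)  # ['positive', 'negative', 'neutral']
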
euroeval/human_evaluation.py CHANGED
@@ -3,6 +3,7 @@
  import importlib.util
  import json
  import logging
+ import typing as t
  from collections import defaultdict
  from functools import partial
  from pathlib import Path
@@ -24,13 +25,15 @@ from .task_group_utils import (
      token_classification,
  )
  from .tasks import NER
- from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
  from .utils import enforce_reproducibility
 
  if importlib.util.find_spec("gradio") is not None:
      import gradio as gr
      from gradio.components import HTML, Button, Dropdown, Markdown, Textbox
 
+ if t.TYPE_CHECKING:
+     from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
+
  logger = logging.getLogger("euroeval")
 
 
@@ -86,8 +89,8 @@ class HumanEvaluator:
              }
          )
 
-         self.extract_labels_from_generation: ExtractLabelsFunction
-         self.compute_metrics: ComputeMetricsFunction
+         self.extract_labels_from_generation: "ExtractLabelsFunction"
+         self.compute_metrics: "ComputeMetricsFunction"
 
      def create_app(self) -> "gr.Blocks":
          """Create the Gradio app for human evaluation.
@@ -269,6 +272,7 @@ class HumanEvaluator:
              num_iterations=iteration + 1,
              api_base=None,
              api_version=None,
+             gpu_memory_utilization=0.9,
              debug=False,
              run_with_cli=True,
              only_allow_safetensors=False,
@@ -342,7 +346,6 @@ class HumanEvaluator:
              self.compute_metrics = partial(
                  sequence_classification.compute_metrics,
                  dataset_config=self.dataset_config,
-                 benchmark_config=benchmark_config,
              )
              self.extract_labels_from_generation = partial(
                  sequence_classification.extract_labels_from_generation,
@@ -362,7 +365,6 @@ class HumanEvaluator:
                  token_classification.compute_metrics,
                  has_misc_tags=self.has_misc_tags,
                  dataset_config=self.dataset_config,
-                 benchmark_config=benchmark_config,
              )
              self.extract_labels_from_generation = partial(
                  token_classification.extract_labels_from_generation,
@@ -372,7 +374,6 @@ class HumanEvaluator:
              self.compute_metrics = partial(
                  question_answering.compute_metrics,
                  dataset_config=self.dataset_config,
-                 benchmark_config=benchmark_config,
              )
              self.extract_labels_from_generation = (
                  question_answering.extract_labels_from_generation
@@ -641,7 +642,7 @@ class HumanEvaluator:
          # only a single iteration, so the results from the current annotation should be
          # added to the previous results.
          results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
-         results: ScoreDict = defaultdict(list)
+         results: "ScoreDict" = defaultdict(list)
          if results_path.exists():
              all_results = [
                  json.loads(line.strip())
@@ -664,15 +665,15 @@
 
          # Aggregate scores
          total_dict: dict[str, float] = dict()
-         for metric_cfg in self.dataset_config.task.metrics:
+         for metric in self.dataset_config.task.metrics:
              test_score, test_se = aggregate_scores(
                  scores=results["raw"],  # type: ignore[arg-type]
-                 metric_config=metric_cfg,
+                 metric=metric,
              )
-             test_score, _ = metric_cfg.postprocessing_fn(test_score)
-             test_se, _ = metric_cfg.postprocessing_fn(test_se)
-             total_dict[f"test_{metric_cfg.name}"] = test_score
-             total_dict[f"test_{metric_cfg.name}_se"] = test_se
+             test_score, _ = metric.postprocessing_fn(test_score)
+             test_se, _ = metric.postprocessing_fn(test_se)
+             total_dict[f"test_{metric.name}"] = test_score
+             total_dict[f"test_{metric.name}_se"] = test_se
          results["total"] = total_dict
 
          benchmark_result = BenchmarkResult(
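
Note on the human_evaluation.py change above: score aggregation now receives Metric objects (defined in euroeval/metrics.py below) instead of the old metric configs, and each metric carries its own postprocessing_fn. The sketch below mirrors how the loop builds the total score dictionary, using a hypothetical stand-in metric object and made-up raw scores rather than EuroEval internals.

class DummyMetric:
    """Hypothetical stand-in exposing the interface the loop above relies on."""

    name = "macro_f1"

    @staticmethod
    def postprocessing_fn(raw: float) -> tuple[float, str]:
        # Same default as the new Metric base class: scale to percent
        return 100 * raw, f"{raw:.2%}"


metric = DummyMetric()
test_score, test_se = 0.8123, 0.0142  # pretend output of aggregate_scores()
total_dict: dict[str, float] = {}
test_score, _ = metric.postprocessing_fn(test_score)
test_se, _ = metric.postprocessing_fn(test_se)
total_dict[f"test_{metric.name}"] = test_score
total_dict[f"test_{metric.name}_se"] = test_se
print(total_dict)  # approximately {'test_macro_f1': 81.23, 'test_macro_f1_se': 1.42}
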
euroeval/languages.py CHANGED
@@ -36,7 +36,7 @@ NN = Language(
  )
  ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
  SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
-
+ PT = Language(code="pt", name="Portuguese", _and_separator="e", _or_separator="ou")
 
  AB = Language(code="ab", name="Abkhazian")
  AA = Language(code="aa", name="Afar")
@@ -152,7 +152,6 @@ PI = Language(code="pi", name="Pali")
  PS = Language(code="ps", name="Pashto")
  FA = Language(code="fa", name="Persian")
  PL = Language(code="pl", name="Polish")
- PT = Language(code="pt", name="Portuguese")
  PA = Language(code="pa", name="Punjabi")
  QU = Language(code="qu", name="Quechua")
  RO = Language(code="ro", name="Romanian")
euroeval/metrics.py ADDED
@@ -0,0 +1,452 @@
+ """All the metrics used in EuroEval."""
+
+ import abc
+ import logging
+ import typing as t
+
+ import evaluate
+ import litellm
+ from litellm.types.utils import Choices, ModelResponse
+ from pydantic import BaseModel, Field
+ from tqdm.auto import tqdm
+
+ from .exceptions import InvalidBenchmark
+ from .utils import HiddenPrints
+
+ if t.TYPE_CHECKING:
+     from evaluate import EvaluationModule
+
+ logger = logging.getLogger(__name__)
+
+
+ class Metric(abc.ABC):
+     """Abstract base class for all metrics."""
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
+     ) -> None:
+         """Initialise the metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             postprocessing_fn:
+                 A function to apply to the metric scores after they are computed,
+                 taking the score to the postprocessed score along with its string
+                 representation. Defaults to x -> (100 * x, f"{x:.2%}").
+         """
+         self.name = name
+         self.pretty_name = pretty_name
+         self.postprocessing_fn = (
+             postprocessing_fn
+             if postprocessing_fn is not None
+             else lambda x: (100 * x, f"{x:.2%}")
+         )
+
+     @abc.abstractmethod
+     def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+         """Calculate the metric score.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 The ground truth references.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+         """
+         ...
+
+     def __hash__(self) -> int:
+         """Return a hash of the metric configuration."""
+         return hash(self.name)
+
+
+ class HuggingFaceMetric(Metric):
+     """A metric which is implemented in the `evaluate` package.
+
+     Attributes:
+         name:
+             The name of the metric in snake_case.
+         pretty_name:
+             The pretty name of the metric, used for display purposes.
+         huggingface_id:
+             The Hugging Face ID of the metric.
+         results_key:
+             The name of the key used to extract the metric scores from the results
+             dictionary.
+         compute_kwargs:
+             Keyword arguments to pass to the metric's compute function. Defaults to
+             an empty dictionary.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         huggingface_id: str,
+         results_key: str,
+         compute_kwargs: dict[str, t.Any] | None = None,
+         postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
+     ) -> None:
+         """Initialise the Hugging Face metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             huggingface_id:
+                 The Hugging Face ID of the metric.
+             results_key:
+                 The name of the key used to extract the metric scores from the results
+                 dictionary.
+             compute_kwargs:
+                 Keyword arguments to pass to the metric's compute function. Defaults to
+                 an empty dictionary.
+             postprocessing_fn:
+                 A function to apply to the metric scores after they are computed, taking
+                 the score to the postprocessed score along with its string
+                 representation. Defaults to x -> (100 * x, f"{x:.2%}").
+         """
+         super().__init__(
+             name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
+         )
+         self.huggingface_id = huggingface_id
+         self.results_key = results_key
+         self.compute_kwargs: dict[str, t.Any] = (
+             dict() if compute_kwargs is None else compute_kwargs
+         )
+         self.metric: "EvaluationModule | None" = None
+
+     def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+         """Calculate the metric score.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 The ground truth references.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+         """
+         if self.metric is None:
+             self.metric = evaluate.load(path=self.huggingface_id)
+
+         with HiddenPrints():
+             results = self.metric.compute(
+                 predictions=predictions, references=references, **self.compute_kwargs
+             )
+
+         # The metric returns None if we are running on multi-GPU and the current
+         # process is not the main process
+         if results is None:
+             return None
+
+         score = results[self.results_key]
+         if isinstance(score, list):
+             score = sum(score) / len(score)
+
+         return score
+
+
+ class LLMAsAJudgeMetric(Metric):
+     """Use an LLM to judge the quality of the predictions."""
+
+     def __init__(
+         self,
+         name: str,
+         pretty_name: str,
+         judge_id: str,
+         judge_kwargs: dict[str, t.Any],
+         user_prompt: str,
+         response_format: t.Type[BaseModel],
+         scoring_fn: t.Callable[[BaseModel], float],
+         condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
+         system_prompt: str | None = None,
+     ) -> None:
+         """Initialise the LLM as a judge metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+             judge_id:
+                 The model ID of the LLM to use as a judge.
+             judge_kwargs:
+                 Generation parameters for the judge model, such as temperature.
+             user_prompt:
+                 The user prompt to use for the judge model. The prompt should be
+                 formatted with the variables `prediction` and `condition`, to
+                 include the model predictions and a description of what the prediction
+                 should be judged on, respectively. If the condition is not needed,
+                 it can be omitted from the prompt, but the `prediction` variable must
+                 still be present.
+             response_format:
+                 The response format to use for the judge model. This should be a
+                 Pydantic model that defines the expected structure of the judge's
+                 response.
+             scoring_fn:
+                 A function that takes the judge's response and returns a score.
+             condition_formatting_fn (optional):
+                 A function to format the condition string before it is included in the
+                 user prompt. Defaults to a no-op function that returns the input
+                 unchanged.
+             system_prompt (optional):
+                 The system prompt to use for the judge model. If not provided, no system
+                 prompt will be used.
+         """
+         super().__init__(name=name, pretty_name=pretty_name)
+         self.judge_id = judge_id
+         self.judge_kwargs = judge_kwargs
+         self.user_prompt = user_prompt
+         self.response_format = response_format
+         self.scoring_fn = scoring_fn
+         self.condition_formatting_fn = condition_formatting_fn
+         self.system_prompt = system_prompt
+
+     def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+         """Calculate the metric score using the judge model.
+
+         Args:
+             predictions:
+                 The model predictions.
+             references:
+                 The ground truth references.
+
+         Returns:
+             The calculated metric score, or None if the score should be ignored.
+
+         Raises:
+             InvalidBenchmark:
+                 If the number of predictions does not match the number of references,
+                 or if the user prompt requires a condition but none is provided.
+         """
+         if not predictions or not references:
+             return None
+         elif len(predictions) != len(references):
+             raise InvalidBenchmark(
+                 f"The number of predictions ({len(predictions):,}) does not match the "
+                 f"number of references ({len(references):,})."
+             )
+
+         # Prepare the messages for the LLM
+         conversations: list[list[dict[str, str]]] = [
+             [
+                 dict(
+                     role="user",
+                     content=self._apply_user_prompt(
+                         prediction=prediction, condition=condition
+                     ),
+                 )
+             ]
+             for prediction, condition in zip(predictions, references)
+         ]
+         if self.system_prompt:
+             conversations = [
+                 [dict(role="system", content=self.system_prompt), *conversation]
+                 for conversation in conversations
+             ]
+
+         # Get the judge generations
+         generations = [
+             litellm.completion(
+                 model=self.judge_id,
+                 messages=conversation,
+                 response_format=self.response_format,
+                 **self.judge_kwargs,
+             )
+             for conversation in tqdm(
+                 iterable=conversations,
+                 desc=f"Computing {self.pretty_name} scores",
+                 unit="sample",
+             )
+         ]
+
+         # Extract the outputs from the generations
+         outputs: list[BaseModel] = list()
+         for generation in generations:
+             assert isinstance(generation, ModelResponse), (
+                 f"The judge model did not return a valid response: {generation!r}"
+             )
+             choice = generation.choices[0]
+             assert isinstance(choice, Choices), (
+                 f"The judge model did not return a valid choice: {choice!r}"
+             )
+             json_content = choice.message.content
+             assert json_content is not None, (
+                 "The judge model returned a None content in the response message."
+             )
+             output = self.response_format.model_validate_json(json_data=json_content)
+             outputs.append(output)
+
+         # Calculate the scores using the scoring function
+         scores = [self.scoring_fn(output) for output in outputs]
+         if not scores:
+             logger.warning(f"No scores were calculated for {self.pretty_name}.")
+             return None
+         return sum(scores) / len(scores)
+
+     def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
+         """Apply the user prompt to the prediction and condition.
+
+         Args:
+             prediction:
+                 The model prediction.
+             condition (optional):
+                 A description of what the prediction should be judged on. If not
+                 provided, it will be omitted from the prompt.
+
+         Returns:
+             The formatted user prompt with the prediction and reference.
+
+         Raises:
+             InvalidBenchmark:
+                 If the user prompt requires a reference but none is provided.
+         """
+         condition_required = "{condition}" in self.user_prompt
+         if condition_required and condition is None:
+             raise InvalidBenchmark(
+                 f"The user prompt for the {self.pretty_name!r} metric requires a "
+                 "condition, but none was provided."
+             )
+         if condition is not None:
+             return self.user_prompt.format(
+                 prediction=prediction, condition=self.condition_formatting_fn(condition)
+             )
+         return self.user_prompt.format(prediction=prediction)
+
+
+ class SpeedMetric(Metric):
+     """Speed metric."""
+
+     def __init__(self, name: str, pretty_name: str) -> None:
+         """Initialise the speed metric.
+
+         Args:
+             name:
+                 The name of the metric in snake_case.
+             pretty_name:
+                 The pretty name of the metric, used for display purposes.
+         """
+         super().__init__(
+             name=name,
+             pretty_name=pretty_name,
+             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
+         )
+
+     def __call__(self, _: t.Sequence, __: t.Sequence) -> float | None:
+         """Not used with the speed metric, but required for consistency."""
+         raise NotImplementedError
+
+
+ mcc_metric = HuggingFaceMetric(
+     name="mcc",
+     pretty_name="Matthew's Correlation Coefficient",
+     huggingface_id="matthews_correlation",
+     results_key="matthews_correlation",
+ )
+
+ macro_f1_metric = HuggingFaceMetric(
+     name="macro_f1",
+     pretty_name="Macro-average F1-score",
+     huggingface_id="f1",
+     results_key="f1",
+     compute_kwargs=dict(average="macro"),
+ )
+
+ micro_f1_metric = HuggingFaceMetric(
+     name="micro_f1",
+     pretty_name="Micro-average F1-score with MISC tags",
+     huggingface_id="seqeval",
+     results_key="overall_f1",
+ )
+
+ micro_f1_no_misc_metric = HuggingFaceMetric(
+     name="micro_f1_no_misc",
+     pretty_name="Micro-average F1-score without MISC tags",
+     huggingface_id="seqeval",
+     results_key="overall_f1",
+ )
+
+ f1_metric = HuggingFaceMetric(
+     name="f1",
+     pretty_name="F1-score",
+     huggingface_id="squad_v2",
+     results_key="f1",
+     postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+ )
+
+ em_metric = HuggingFaceMetric(
+     name="em",
+     pretty_name="Exact Match",
+     huggingface_id="squad_v2",
+     results_key="exact",
+     postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+ )
+
+ bert_score_metric = HuggingFaceMetric(
+     name="bertscore",
+     pretty_name="BERTScore",
+     huggingface_id="bertscore",
+     results_key="f1",
+     compute_kwargs=dict(
+         model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
+     ),
+ )
+
+ rouge_l_metric = HuggingFaceMetric(
+     name="rouge_l", pretty_name="ROUGE-L", huggingface_id="rouge", results_key="rougeL"
+ )
+
+ accuracy_metric = HuggingFaceMetric(
+     name="accuracy",
+     pretty_name="Accuracy",
+     huggingface_id="accuracy",
+     results_key="accuracy",
+ )
+
+
+ class Fluency(BaseModel):
+     """Response format for the fluency metric.
+
+     Attributes:
+         fluency:
+             The fluency rating, an integer between 1 and 5.
+     """
+
+     fluency: t.Annotated[int, Field(ge=1, le=5)]
+
+
+ # Example LLM-as-a-judge metric, to measure the fluency of the LLM output
+ fluency_metric = LLMAsAJudgeMetric(
+     name="fluency",
+     pretty_name="Fluency",
+     judge_id="gpt-4o-mini",
+     judge_kwargs=dict(temperature=0.0),
+     user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
+     "with the following definitions:\n"
+     "- 1: Very poor fluency, many grammatical errors\n"
+     "- 2: Poor fluency, several grammatical errors\n"
+     "- 3: Average fluency, a few grammatical errors\n"
+     "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
+     "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
+     "Text: {prediction!r}\n\n"
+     "Output your rating as a JSON object with a single key 'fluency'.",
+     response_format=Fluency,
+     scoring_fn=lambda output: (output.fluency - 1) / 4.0,
+ )
+
+ speed_metric = SpeedMetric(name="speed", pretty_name="Tokens per second")
+
+ speed_short_metric = SpeedMetric(
+     name="speed_short", pretty_name="Tokens per second on short documents"
+ )
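
Since euroeval/metrics.py replaces the old metric configuration objects with callable Metric instances, a short usage sketch may be helpful. It is inferred purely from the class definitions above, not from EuroEval documentation; running it requires the evaluate package and network access to fetch the squad_v2 metric script, and the inputs are illustrative only.

from euroeval.metrics import HuggingFaceMetric

exact_match = HuggingFaceMetric(
    name="em",
    pretty_name="Exact Match",
    huggingface_id="squad_v2",
    results_key="exact",
    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
)

# The squad_v2 metric expects structured predictions and references
predictions = [
    {"id": "1", "prediction_text": "Copenhagen", "no_answer_probability": 0.0}
]
references = [{"id": "1", "answers": {"text": ["Copenhagen"], "answer_start": [0]}}]

raw = exact_match(predictions, references)  # __call__ lazily loads and runs the metric
if raw is not None:
    score, display = exact_match.postprocessing_fn(raw)
    print(score, display)  # 100.0 '100.00%'
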
euroeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Linguistic Acceptability task."""
 
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
  LA_TEMPLATES = {
      DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
          default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
          "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
      ),
+     PT: PromptConfig(
+         default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
+         default_prompt_prefix="Seguem-se abaixo textos e se são "
+         "gramaticalmente correctos",
+         default_prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
+         default_instruction_prompt="Texto: {text}\n\nDetermina se o texto é "
+         "gramaticalmente correcto ou não. Responde com {labels_str}, e nada mais.",
+     ),
      FI: PromptConfig(
          default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
          default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "