EuroEval 15.10.1__py3-none-any.whl → 15.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. euroeval/__init__.py +7 -0
  2. euroeval/benchmark_config_factory.py +7 -0
  3. euroeval/benchmark_modules/base.py +29 -29
  4. euroeval/benchmark_modules/fresh.py +31 -19
  5. euroeval/benchmark_modules/hf.py +27 -23
  6. euroeval/benchmark_modules/litellm.py +50 -30
  7. euroeval/benchmark_modules/vllm.py +22 -26
  8. euroeval/benchmarker.py +8 -1
  9. euroeval/callbacks.py +17 -13
  10. euroeval/cli.py +10 -0
  11. euroeval/data_loading.py +10 -5
  12. euroeval/data_models.py +9 -40
  13. euroeval/dataset_configs/__init__.py +1 -0
  14. euroeval/dataset_configs/english.py +13 -4
  15. euroeval/dataset_configs/norwegian.py +8 -0
  16. euroeval/dataset_configs/portuguese.py +74 -0
  17. euroeval/dataset_configs/spanish.py +4 -3
  18. euroeval/finetuning.py +9 -8
  19. euroeval/generation.py +27 -8
  20. euroeval/human_evaluation.py +14 -13
  21. euroeval/languages.py +1 -2
  22. euroeval/metrics.py +452 -0
  23. euroeval/prompt_templates/linguistic_acceptability.py +9 -1
  24. euroeval/prompt_templates/multiple_choice.py +9 -1
  25. euroeval/prompt_templates/named_entity_recognition.py +20 -1
  26. euroeval/prompt_templates/sentiment_classification.py +11 -1
  27. euroeval/prompt_templates/summarization.py +8 -1
  28. euroeval/scores.py +14 -19
  29. euroeval/speed_benchmark.py +6 -7
  30. euroeval/task_group_utils/multiple_choice_classification.py +6 -4
  31. euroeval/task_group_utils/question_answering.py +5 -28
  32. euroeval/task_group_utils/sequence_classification.py +6 -30
  33. euroeval/task_group_utils/text_to_text.py +19 -34
  34. euroeval/task_group_utils/token_classification.py +18 -30
  35. euroeval/tasks.py +11 -136
  36. euroeval/types.py +6 -4
  37. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
  38. euroeval-15.12.0.dist-info/RECORD +63 -0
  39. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
  40. euroeval-15.10.1.dist-info/RECORD +0 -61
  41. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
  42. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="As seguintes são perguntas de escolha múltipla "
+        "(com respostas).",
+        default_prompt_template="Pergunta: {text}\nResposta: {label}",
+        default_instruction_prompt="Pergunta: {text}\n\nResponde à pergunta "
+        "acima usando só {labels_str}, e nada mais.",
+        default_prompt_label_mapping="auto",
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
         default_prompt_template="Kysymys: {text}\nVastaus: {label}"
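
Note (not part of the diff): the new PT entry reuses the same PromptConfig fields as the existing languages. As a rough illustration of how a few-shot prompt could be assembled from these fields, here is a minimal, hypothetical sketch; only the field values come from the hunk above, and the render_few_shot_prompt helper is not part of EuroEval.

    def render_few_shot_prompt(
        prefix: str, template: str, examples: list[dict], new_text: str
    ) -> str:
        # Render each few-shot example with its label filled in.
        shots = [template.format(**example) for example in examples]
        # Render the final example with an empty label for the model to complete.
        final = template.format(text=new_text, label="").rstrip()
        return "\n\n".join([prefix, *shots, final])

    prompt = render_few_shot_prompt(
        prefix="As seguintes são perguntas de escolha múltipla (com respostas).",
        template="Pergunta: {text}\nResposta: {label}",
        examples=[{"text": "2 + 2?\na. 3\nb. 4", "label": "b"}],
        new_text="3 + 3?\na. 5\nb. 6",
    )
    print(prompt)
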
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 NER_TEMPLATES = {
     DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "pessoa",
+            "i-per": "pessoa",
+            "b-loc": "local",
+            "i-loc": "local",
+            "b-org": "organização",
+            "i-org": "organização",
+            "b-misc": "diverso",
+            "i-misc": "diverso",
+        },
+        default_prompt_prefix="Seguem-se frases e dicionários JSON com as entidades "
+        "mencionadas presentes na frase indicada.",
+        default_prompt_template="Frase: {text}\nEntidades mencionadas: {label}",
+        default_instruction_prompt="Frase: {text}\n\nIdentifica as entidades "
+        "mencionadas na frase. Deves devolver um dicionário JSON com as chaves "
+        "{labels_str}. Os valores devem ser listas contendo as entidades "
+        "mencionadas desse tipo, tal como ocorrem na frase.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "henkilö",
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 SENT_TEMPLATES = {
     DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Abaixo encontras documentos e os seus "
+        "sentimentos correspondentes, que podem ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassifica o "
+        "sentimento do documento. Responde apenas com {labels_str}.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positiivinen", neutral="neutrali", negative="negatiivinen"
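
Note (not part of the diff): both the NER and the sentiment templates above feed their default_prompt_label_mapping values into the {labels_str} placeholder, with the B-/I- variants of each NER tag collapsing onto a single name. A short, hypothetical sketch of how such a string could be built; the build_labels_str helper is not from the codebase.

    def build_labels_str(mapping: dict[str, str], conjunction: str = "ou") -> str:
        # Deduplicate the mapped names while preserving their order.
        names = [f"'{name}'" for name in dict.fromkeys(mapping.values())]
        return ", ".join(names[:-1]) + f" {conjunction} " + names[-1]

    pt_sentiment = dict(positive="positivo", neutral="neutro", negative="negativo")
    print(build_labels_str(pt_sentiment))  # 'positivo', 'neutro' ou 'negativo'
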
euroeval/prompt_templates/summarization.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES = {
@@ -36,6 +36,13 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="Abaixo encontras documentos com resumos associados.",
+        default_prompt_template="Documento: {text}\nResumo: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscreve um resumo do "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
         "tiivistelmiä.",
euroeval/scores.py CHANGED
@@ -7,7 +7,7 @@ import warnings
 import numpy as np
 
 if t.TYPE_CHECKING:
-    from .data_models import MetricConfig
+    from .metrics import Metric
     from .types import ScoreDict
 
 logger = logging.getLogger("euroeval")
@@ -15,7 +15,7 @@ logger = logging.getLogger("euroeval")
 
 def log_scores(
     dataset_name: str,
-    metric_configs: list["MetricConfig"],
+    metrics: list["Metric"],
     scores: list[dict[str, float]],
     model_id: str,
     model_revision: str,
@@ -25,7 +25,7 @@ def log_scores(
     Args:
         dataset_name:
             Name of the dataset.
-        metric_configs:
+        metrics:
             List of metrics to log.
         scores:
             The scores that are to be logged. This is a list of dictionaries full of
@@ -46,19 +46,19 @@ def log_scores(
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
 
     total_dict: dict[str, float] = dict()
-    for metric_cfg in metric_configs:
-        test_score, test_se = aggregate_scores(scores=scores, metric_config=metric_cfg)
-        test_score, test_score_str = metric_cfg.postprocessing_fn(test_score)
-        test_se, test_se_str = metric_cfg.postprocessing_fn(test_se)
-        total_dict[f"test_{metric_cfg.name}"] = test_score
-        total_dict[f"test_{metric_cfg.name}_se"] = test_se
-        logger.info(f"{metric_cfg.pretty_name}: {test_score_str} ± {test_se_str}")
+    for metric in metrics:
+        test_score, test_se = aggregate_scores(scores=scores, metric=metric)
+        test_score, test_score_str = metric.postprocessing_fn(test_score)
+        test_se, test_se_str = metric.postprocessing_fn(test_se)
+        total_dict[f"test_{metric.name}"] = test_score
+        total_dict[f"test_{metric.name}_se"] = test_se
+        logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
 
     return dict(raw=scores, total=total_dict)
 
 
 def aggregate_scores(
-    scores: list[dict[str, float]], metric_config: "MetricConfig"
+    scores: list[dict[str, float]], metric: "Metric"
 ) -> tuple[float, float]:
     """Helper function to compute the mean with confidence intervals.
 
@@ -66,9 +66,8 @@ def aggregate_scores(
         scores:
            Dictionary with the names of the metrics as keys, of the form
            "<split>_<metric_name>", such as "val_f1", and values the metric values.
-        metric_config:
-            The configuration of the metric, which is used to collect the correct
-            metric from `scores`.
+        metric:
+            The metric, which is used to collect the correct metric from `scores`.
 
     Returns:
         A pair of floats, containing the score and the radius of its 95% confidence
@@ -78,11 +77,7 @@ def aggregate_scores(
         warnings.simplefilter("ignore")
 
         test_scores = [
-            (
-                dct[metric_config.name]
-                if metric_config.name in dct
-                else dct[f"test_{metric_config.name}"]
-            )
+            dct[metric.name] if metric.name in dct else dct[f"test_{metric.name}"]
             for dct in scores
         ]
         test_score = np.mean(test_scores).item()
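
Note (not part of the diff): aggregate_scores returns the mean score together with the radius of a 95% confidence interval, but the interval computation itself falls outside this hunk. A minimal sketch under the usual normal-approximation assumption (1.96 times the standard error of the mean); the helper name is hypothetical.

    import numpy as np

    def mean_with_ci_radius(test_scores: list[float]) -> tuple[float, float]:
        # Mean of the per-iteration scores.
        mean = float(np.mean(test_scores))
        # Standard error of the mean, scaled to a 95% confidence radius.
        se = float(np.std(test_scores, ddof=1) / np.sqrt(len(test_scores)))
        return mean, 1.96 * se

    print(mean_with_ci_radius([0.71, 0.74, 0.69, 0.73]))
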
euroeval/speed_benchmark.py CHANGED
@@ -1,21 +1,20 @@
 """Benchmarking model inference speed."""
 
 import logging
+import typing as t
 
 import pyinfer
 from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
-from .benchmark_modules import (
-    BenchmarkModule,
-    HuggingFaceEncoderModel,
-    LiteLLMModel,
-    VLLMModel,
-)
-from .data_models import BenchmarkConfig
+from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
 from .utils import clear_memory
 
+if t.TYPE_CHECKING:
+    from .benchmark_modules import BenchmarkModule
+    from .data_models import BenchmarkConfig
+
 logger = logging.getLogger("euroeval")
 
 
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -7,14 +7,15 @@ import typing as t
 from collections import defaultdict
 
 import numpy as np
-from datasets import Dataset
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
 
 if t.TYPE_CHECKING:
+    from datasets import Dataset
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import BatchEncoding
+
     from ..types import Labels, Predictions
 
 logger = logging.getLogger("euroeval")
@@ -147,7 +148,8 @@ def postprocess_predictions_and_labels(
 
     Args:
         predictions:
-            The model predictions, of shape (num_examples, 2).
+            The model predictions, of shape (num_examples, 2), corresponding to the
+            False/True probabilities for each example.
         dataset:
             The dataset containing the examples.
 
euroeval/task_group_utils/question_answering.py CHANGED
@@ -5,13 +5,10 @@
 import typing as t
 from collections import defaultdict
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..tokenization_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
@@ -26,6 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 logger = logging.getLogger("euroeval")
@@ -151,7 +149,6 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -161,8 +158,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -178,17 +173,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -196,20 +180,13 @@ def compute_metrics(
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for cfg in dataset_config.task.metrics:
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=labels, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=labels)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if score_dict is not None:
-            scores = score_dict[cfg.results_key]
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -4,19 +4,16 @@
 import re
 import typing as t
 
-import evaluate
 import Levenshtein
 import numpy as np
-from evaluate import EvaluationModule
 
-from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -26,7 +23,6 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -36,8 +32,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -51,17 +45,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -89,27 +72,20 @@ def compute_metrics(
     ]
 
     results: dict[str, float] = dict()
-    for cfg in dataset_config.task.metrics:
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=label_ids, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=label_ids)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if score_dict is not None:
-            scores = score_dict[cfg.results_key]
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
 
 def extract_labels_from_generation(
     input_batch: dict[str, list],
-    model_output: GenerativeModelOutput,
+    model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
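
Note (not part of the diff): across the task_group_utils modules, the inline evaluate.load(...) calls and results_key handling are replaced by calling the task's metric objects directly. The Metric class itself lives in the new euroeval/metrics.py, which is not shown in this diff view; the sketch below only illustrates the calling convention these hunks rely on, using a hypothetical accuracy metric, and is not the actual implementation.

    import typing as t

    class Metric(t.Protocol):
        # Assumed interface: a metric has a name and is called directly with
        # predictions and references, returning a single float, or None when
        # the current process should not report a score (e.g. non-main ranks).
        name: str

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence
        ) -> float | None: ...

    class AccuracyMetric:
        # Hypothetical stand-in metric used only for this illustration.
        name = "accuracy"

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence
        ) -> float | None:
            return sum(p == r for p, r in zip(predictions, references)) / len(references)

    results: dict[str, float] = {}
    for metric in [AccuracyMetric()]:
        score = metric(predictions=[1, 0, 1], references=[1, 1, 1])
        if score is not None:
            results[metric.name] = score
    print(results)
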
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -3,18 +3,17 @@
 import logging
 import typing as t
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values
+from ..metrics import HuggingFaceMetric
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -51,17 +50,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     output_is_prob = model_output_dtype in [np.float16, np.float32, np.float64]
     if output_is_prob:
@@ -70,21 +58,18 @@ def compute_metrics(
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for cfg in dataset_config.task.metrics:
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-
+    for metric in dataset_config.task.metrics:
         # Some metrics can be computed on hardware accelerators. In this case we
         # start by setting the device to the same device as the model
-        if cfg.compute_kwargs.get("device", None) == "auto":
-            cfg.compute_kwargs["device"] = benchmark_config.device.type
+        if (
+            isinstance(metric, HuggingFaceMetric)
+            and metric.compute_kwargs.get("device", None) == "auto"
+        ):
+            metric.compute_kwargs["device"] = benchmark_config.device.type
 
         while True:
             try:
-                with HiddenPrints():
-                    score_dict: dict[str, float] | None = metric.compute(
-                        predictions=predictions, references=labels, **cfg.compute_kwargs
-                    )
+                score: float | None = metric(predictions=predictions, references=labels)
                 break
             except Exception as e:
                 oom_error = [
@@ -95,11 +80,14 @@ def compute_metrics(
                 if not any(error in str(e) for error in oom_error):
                     raise InvalidBenchmark(str(e))
 
-                if cfg.compute_kwargs.get("device", "cpu") != "cpu":
-                    cfg.compute_kwargs["device"] = "cpu"
+                if (
+                    isinstance(metric, HuggingFaceMetric)
+                    and metric.compute_kwargs.get("device", "cpu") != "cpu"
+                ):
+                    metric.compute_kwargs["device"] = "cpu"
                     logger.debug(
                         "Out of memory error occurred during the computation of "
-                        f"the metric {cfg.pretty_name}. Moving the computation to "
+                        f"the metric {metric.pretty_name}. Moving the computation to "
                         "the CPU."
                     )
                 else:
@@ -109,17 +97,14 @@ def compute_metrics(
                         if hasattr(metric, attribute):
                             logger.debug(
                                 f"Deleting the {attribute!r} attribute of the metric "
-                                f"{cfg.pretty_name} to free up memory."
+                                f"{metric.pretty_name} to free up memory."
                             )
                             delattr(metric, attribute)
 
         # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
-        if score_dict is not None:
-            scores = score_dict[cfg.results_key]
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
euroeval/task_group_utils/token_classification.py CHANGED
@@ -6,19 +6,17 @@
 from copy import deepcopy
 
 import demjson3
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
-from transformers.tokenization_utils import PreTrainedTokenizer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -29,7 +27,6 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -41,8 +38,6 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -55,17 +50,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
         raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
@@ -145,11 +129,14 @@ def compute_metrics(
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels
     )
     if predictions_all_zero and labels_all_zero:
-        results = dict(overall_f1=1.0)
+        micro_f1_score: float | None = 1.0
     else:
-        metric = metrics["micro_f1"]
-        assert isinstance(metric, EvaluationModule)
-        results = metric.compute(predictions=predictions, references=labels)
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1"
+        )
+        micro_f1_score = metric(predictions=predictions, references=list(labels))
 
     # Compute the metrics without MISC tags
     # We manually set the F1 metric to be 100% if both the labels and the models
@@ -163,21 +150,22 @@ def compute_metrics(
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels_no_misc
     )
     if predictions_no_misc_all_zero and labels_no_misc_all_zero:
-        results_no_misc = dict(overall_f1=1.0)
+        micro_f1_no_misc_score: float | None = 1.0
     else:
-        metric = metrics["micro_f1_no_misc"]
-        assert isinstance(metric, EvaluationModule)
-        results_no_misc = metric.compute(
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1_no_misc"
+        )
+        micro_f1_no_misc_score = metric(
             predictions=predictions_no_misc, references=labels_no_misc
         )
 
     # Raise error if the metrics are invalid
-    if results is None or results_no_misc is None:
+    if micro_f1_score is None or micro_f1_no_misc_score is None:
         raise InvalidBenchmark("The predictions and labels are not of the same length.")
 
-    return dict(
-        micro_f1_no_misc=results_no_misc["overall_f1"], micro_f1=results["overall_f1"]
-    )
+    return dict(micro_f1_no_misc=micro_f1_no_misc_score, micro_f1=micro_f1_score)
 
 
 def extract_labels_from_generation
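
Note (not part of the diff): the new euroeval/metrics.py (452 added lines, not included in this diff view) is what makes the direct metric calls above work. Judging only from the removed inline code and the HuggingFaceMetric references in text_to_text.py, a rough sketch of what such a wrapper might look like is given below; the constructor arguments and behaviour are assumptions, not the actual implementation.

    import typing as t

    import evaluate

    class HuggingFaceMetric:
        """Assumed wrapper around an `evaluate` module, mirroring the removed inline logic."""

        def __init__(
            self,
            name: str,
            pretty_name: str,
            huggingface_id: str,
            results_key: str,
            compute_kwargs: dict[str, t.Any] | None = None,
        ) -> None:
            self.name = name
            self.pretty_name = pretty_name
            self.huggingface_id = huggingface_id
            self.results_key = results_key
            self.compute_kwargs = compute_kwargs or {}
            self._module: "evaluate.EvaluationModule | None" = None

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence
        ) -> float | None:
            # Load the underlying Hugging Face metric lazily on first use.
            if self._module is None:
                self._module = evaluate.load(path=self.huggingface_id)
            score_dict = self._module.compute(
                predictions=predictions, references=references, **self.compute_kwargs
            )
            # None is returned on non-main processes in multi-GPU runs.
            if score_dict is None:
                return None
            score = score_dict[self.results_key]
            # Some metrics return one value per example; average those.
            if isinstance(score, list):
                score = sum(score) / len(score)
            return float(score)

    # Hypothetical usage, mirroring how the task metrics are called in the hunks above:
    # micro_f1 = HuggingFaceMetric("micro_f1", "Micro-average F1", "seqeval", "overall_f1")
    # score = micro_f1(predictions=[["B-PER", "O"]], references=[["B-PER", "O"]])
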