EuroEval 15.10.1__py3-none-any.whl → 15.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

```diff
@@ -13,14 +13,11 @@ from pathlib import Path
 from time import sleep
 
 import torch
-from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -34,13 +31,7 @@ from ..constants import (
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
+from ..data_models import GenerativeModelOutput, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -94,6 +85,13 @@ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray
 
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+
 logger = logging.getLogger("euroeval")
 
 
@@ -106,9 +104,9 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the vLLM model.
 
@@ -129,8 +127,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         model, tokenizer = load_model_and_tokenizer(
             model_config=model_config, benchmark_config=benchmark_config
         )
-        self._model: LLM = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "LLM" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
@@ -230,8 +228,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: "Task", itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.
 
         This includes things like tokenisation.
@@ -293,7 +291,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         return dataset
 
-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.
 
         Args:
@@ -524,7 +522,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
    ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.
 
@@ -558,8 +556,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.
 
         Args:
@@ -628,8 +626,8 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig
-) -> "tuple[LLM, PreTrainedTokenizer]":
+    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+) -> tuple["LLM", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.
 
     Args:
@@ -1017,7 +1015,6 @@ def get_custom_stop_tokens(
     """
     candidate_stop_tokens = CUSTOM_STOP_TOKENS
 
-    # Create a prompt to check if the model uses the reasoning tokens
    prompt = "Hello"
    if tokenizer.chat_template is not None:
        templated_prompt = tokenizer.apply_chat_template(
@@ -1028,7 +1025,6 @@ def get_custom_stop_tokens(
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    # Check that the beginning-of-reasoning token is actually used by the model
     max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
     completion = (
         model.generate(
```
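The dominant change in this release is moving imports that are only needed for type annotations under an `if t.TYPE_CHECKING:` guard and quoting the corresponding annotations, so heavy dependencies such as `datasets` and `transformers` are no longer imported at module load time purely for typing. A minimal sketch of the pattern; the function itself is illustrative and not taken from EuroEval:

```python
import typing as t

if t.TYPE_CHECKING:
    # Seen only by static type checkers; skipped entirely at runtime.
    from datasets import DatasetDict
    from transformers.tokenization_utils import PreTrainedTokenizer


def tokenise_splits(
    dataset: "DatasetDict", tokenizer: "PreTrainedTokenizer"
) -> "DatasetDict":
    # The quoted annotations stay plain strings at runtime, so the guarded
    # imports above never need to be resolved when this function is called.
    return dataset.map(lambda example: tokenizer(example["text"], truncation=True))
```

Without the quotes (or a module-level `from __future__ import annotations`), the annotations would be evaluated when the function is defined and raise `NameError`, because the guarded imports never run at runtime.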
euroeval/benchmarker.py CHANGED
@@ -767,7 +767,7 @@ class Benchmarker:
767
767
 
768
768
  results = log_scores(
769
769
  dataset_name=dataset_config.pretty_name,
770
- metric_configs=dataset_config.task.metrics,
770
+ metrics=dataset_config.task.metrics,
771
771
  scores=scores,
772
772
  model_id=model_config.model_id,
773
773
  model_revision=model_config.revision,
euroeval/callbacks.py CHANGED
```diff
@@ -1,12 +1,16 @@
 """Callbacks for the Hugging Face Trainer."""
 
 import sys
+import typing as t
 from collections.abc import Sized
 
-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState
-from transformers.training_args import TrainingArguments
+from transformers.trainer_callback import ProgressCallback
+
+if t.TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+    from transformers.trainer_callback import TrainerControl, TrainerState
+    from transformers.training_args import TrainingArguments
 
 
 class NeverLeaveProgressCallback(ProgressCallback):
@@ -20,9 +24,9 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_train_begin(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when training begins."""
@@ -38,9 +42,9 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_step_end(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when a training step ends."""
@@ -50,10 +54,10 @@ class NeverLeaveProgressCallback(ProgressCallback):
 
     def on_prediction_step(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        eval_dataloader: DataLoader | None = None,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
+        eval_dataloader: "DataLoader | None" = None,
         **kwargs: str,
     ) -> None:
         """Callback actions when a prediction step ends."""
```
euroeval/data_loading.py CHANGED
```diff
@@ -3,23 +3,28 @@
 import logging
 import sys
 import time
+import typing as t
 
 import requests
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
-from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble
 
+if t.TYPE_CHECKING:
+    from datasets import Dataset
+
+    from .data_models import BenchmarkConfig, DatasetConfig
+
 logger = logging.getLogger("euroeval")
 
 
 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
-) -> list[DatasetDict]:
+) -> list["DatasetDict"]:
     """Load the raw bootstrapped datasets.
 
     Args:
@@ -56,7 +61,7 @@ def load_data(
             dataset["test"] = dataset["test"].select(range(1))
 
         # Bootstrap the splits
-        bootstrapped_splits: dict[str, list[Dataset]] = dict()
+        bootstrapped_splits: dict[str, list["Dataset"]] = dict()
         for split in ["train", "val", "test"]:
             bootstrap_indices = rng.integers(
                 0,
@@ -80,7 +85,7 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
```
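The bootstrapping logic itself is unchanged: row indices are drawn with replacement via `rng.integers` and used to build resampled splits. A small self-contained illustration of that pattern, using a toy `Dataset` rather than anything loaded by EuroEval:

```python
import numpy as np
from datasets import Dataset

# Illustrative bootstrap of a single split: sample row indices with
# replacement and materialise the resampled view with Dataset.select.
rng = np.random.default_rng(4242)
split = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 1, 0, 1]})
bootstrap_indices = rng.integers(0, len(split), size=len(split))
bootstrapped = split.select(bootstrap_indices)
print(bootstrapped["text"])
```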
euroeval/data_models.py CHANGED
```diff
@@ -1,6 +1,5 @@
 """Data models used in EuroEval."""
 
-import collections.abc as c
 import json
 import pathlib
 import re
@@ -11,48 +10,11 @@ import pydantic
 import torch
 
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
+from .metrics import Metric
 from .types import ScoreDict
 from .utils import get_package_version
 
 
-@dataclass
-class MetricConfig:
-    """Configuration for a metric.
-
-    Attributes:
-        name:
-            The name of the metric.
-        pretty_name:
-            A longer prettier name for the metric, which allows cases and spaces. Used
-            for logging.
-        huggingface_id:
-            The Hugging Face ID of the metric.
-        results_key:
-            The name of the key used to extract the metric scores from the results
-            dictionary.
-        compute_kwargs:
-            Keyword arguments to pass to the metric's compute function. Defaults to
-            an empty dictionary.
-        postprocessing_fn:
-            A function to apply to the metric scores after they are computed, taking
-            the score to the postprocessed score along with its string representation.
-            Defaults to x -> (100 * x, f"{x:.2%}").
-    """
-
-    name: str
-    pretty_name: str
-    huggingface_id: str
-    results_key: str
-    compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
-    postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
-        default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
-    )
-
-    def __hash__(self) -> int:
-        """Return a hash of the metric configuration."""
-        return hash(self.name)
-
-
 @dataclass
 class Language:
     """A benchmarkable language.
@@ -147,7 +109,7 @@ class Task:
     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[MetricConfig]
+    metrics: list[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
@@ -49,10 +49,10 @@ CNN_DAILYMAIL_CONFIG = DatasetConfig(
     languages=[EN],
 )
 
-MMLU_CONFIG = DatasetConfig(
-    name="mmlu",
-    pretty_name="the truncated version of the English knowledge dataset MMLU",
-    huggingface_id="EuroEval/mmlu-mini",
+LIFE_IN_THE_UK_CONFIG = DatasetConfig(
+    name="life-in-the-uk",
+    pretty_name="the English knowledge dataset Life in the UK",
+    huggingface_id="EuroEval/life-in-the-uk",
     task=KNOW,
     languages=[EN],
 )
@@ -86,3 +86,12 @@ BELEBELE_CONFIG = DatasetConfig(
     languages=[EN],
     unofficial=True,
 )
+
+MMLU_CONFIG = DatasetConfig(
+    name="mmlu",
+    pretty_name="the truncated version of the English knowledge dataset MMLU",
+    huggingface_id="EuroEval/mmlu-mini",
+    task=KNOW,
+    languages=[EN],
+    unofficial=True,
+)
@@ -76,6 +76,14 @@ NRK_QUIZ_QA_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
 )
 
+IDIOMS_NO_CONFIG = DatasetConfig(
+    name="idioms-no",
+    pretty_name="the Norwegian knowledge dataset Idioms-no",
+    huggingface_id="EuroEval/idioms-no",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
 NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
     name="nor-common-sense-qa",
     pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
```
euroeval/finetuning.py CHANGED
```diff
@@ -5,7 +5,6 @@ import sys
 import typing as t
 
 import torch
-from datasets import DatasetDict
 from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
@@ -15,7 +14,6 @@ from transformers.trainer_callback import (
 from transformers.trainer_utils import IntervalStrategy
 from transformers.training_args import OptimizerNames, TrainingArguments
 
-from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
@@ -28,14 +26,17 @@ from .utils import (
 )
 
 if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 logger = logging.getLogger("euroeval")
 
 
 def finetune(
-    model: BenchmarkModule,
-    datasets: list[DatasetDict],
+    model: "BenchmarkModule",
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -155,9 +156,9 @@ def finetune(
 
 
 def finetune_single_iteration(
-    model: BenchmarkModule | None,
-    dataset: DatasetDict,
-    training_args: TrainingArguments,
+    model: "BenchmarkModule | None",
+    dataset: "DatasetDict",
+    training_args: "TrainingArguments",
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -254,7 +255,7 @@ def get_training_args(
     iteration_idx: int,
     dtype: DataType,
     batch_size: int | None = None,
-) -> TrainingArguments:
+) -> "TrainingArguments":
     """Get the training arguments for the current iteration.
 
     Args:
```
euroeval/generation.py CHANGED
```diff
@@ -6,10 +6,8 @@ import typing as t
 from pathlib import Path
 
 import more_itertools as mit
-from datasets import Dataset, DatasetDict
 from tqdm.auto import tqdm
 
-from .benchmark_modules import BenchmarkModule
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
 from .model_cache import (
@@ -20,6 +18,9 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
+    from datasets import Dataset, DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import (
         BenchmarkConfig,
         DatasetConfig,
@@ -32,7 +33,7 @@ logger = logging.getLogger("euroeval")
 
 def generate(
     model: "BenchmarkModule",
-    datasets: list[DatasetDict],
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -100,7 +101,7 @@ def generate(
 
 
 def generate_single_iteration(
-    dataset: Dataset,
+    dataset: "Dataset",
     model: "BenchmarkModule",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -323,6 +323,7 @@ def apply_prompt(
             tokenize=False,
             add_generation_prompt=True,
             chat_template=chat_template,
+            enable_thinking=True,
         )
         for messages in messages_list
     ]
```
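The behavioural change in the last hunk is passing `enable_thinking=True` through `apply_chat_template`. Extra keyword arguments to `apply_chat_template` are forwarded to the Jinja chat template, so templates that read an `enable_thinking` variable (Qwen3-style templates, for example) can switch their reasoning mode accordingly, while templates that ignore it are unaffected. A standalone illustration, with the model ID chosen purely as an example:

```python
from transformers import AutoTokenizer

# Example model whose chat template reads the `enable_thinking` variable.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Norway?"}],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # forwarded to the template as a Jinja variable
)
print(prompt)
```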
```diff
@@ -3,6 +3,7 @@
 import importlib.util
 import json
 import logging
+import typing as t
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
@@ -24,13 +25,15 @@ from .task_group_utils import (
     token_classification,
 )
 from .tasks import NER
-from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
 from .utils import enforce_reproducibility
 
 if importlib.util.find_spec("gradio") is not None:
     import gradio as gr
     from gradio.components import HTML, Button, Dropdown, Markdown, Textbox
 
+if t.TYPE_CHECKING:
+    from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
+
 logger = logging.getLogger("euroeval")
 
 
@@ -86,8 +89,8 @@ class HumanEvaluator:
             }
         )
 
-        self.extract_labels_from_generation: ExtractLabelsFunction
-        self.compute_metrics: ComputeMetricsFunction
+        self.extract_labels_from_generation: "ExtractLabelsFunction"
+        self.compute_metrics: "ComputeMetricsFunction"
 
     def create_app(self) -> "gr.Blocks":
         """Create the Gradio app for human evaluation.
@@ -342,7 +345,6 @@ class HumanEvaluator:
         self.compute_metrics = partial(
             sequence_classification.compute_metrics,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = partial(
             sequence_classification.extract_labels_from_generation,
@@ -362,7 +364,6 @@ class HumanEvaluator:
             token_classification.compute_metrics,
             has_misc_tags=self.has_misc_tags,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = partial(
             token_classification.extract_labels_from_generation,
@@ -372,7 +373,6 @@ class HumanEvaluator:
         self.compute_metrics = partial(
             question_answering.compute_metrics,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = (
             question_answering.extract_labels_from_generation
@@ -641,7 +641,7 @@ class HumanEvaluator:
         # only a single iteration, so the results from the current annotation should be
         # added to the previous results.
         results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
-        results: ScoreDict = defaultdict(list)
+        results: "ScoreDict" = defaultdict(list)
         if results_path.exists():
             all_results = [
                 json.loads(line.strip())
@@ -664,15 +664,15 @@ class HumanEvaluator:
 
         # Aggregate scores
         total_dict: dict[str, float] = dict()
-        for metric_cfg in self.dataset_config.task.metrics:
+        for metric in self.dataset_config.task.metrics:
             test_score, test_se = aggregate_scores(
                 scores=results["raw"],  # type: ignore[arg-type]
-                metric_config=metric_cfg,
+                metric=metric,
             )
-            test_score, _ = metric_cfg.postprocessing_fn(test_score)
-            test_se, _ = metric_cfg.postprocessing_fn(test_se)
-            total_dict[f"test_{metric_cfg.name}"] = test_score
-            total_dict[f"test_{metric_cfg.name}_se"] = test_se
+            test_score, _ = metric.postprocessing_fn(test_score)
+            test_se, _ = metric.postprocessing_fn(test_se)
+            total_dict[f"test_{metric.name}"] = test_score
+            total_dict[f"test_{metric.name}_se"] = test_se
         results["total"] = total_dict
 
         benchmark_result = BenchmarkResult(
```
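Taken together with the data_models.py hunks, these changes replace the removed `MetricConfig` dataclass with `Metric` objects from the new `euroeval.metrics` module; as used in the aggregation loop above, a metric still exposes at least a `name` and a `postprocessing_fn`. The sketch below is a hypothetical stand-in that mirrors the deleted `MetricConfig` fields the loop relies on, not the actual `euroeval.metrics.Metric` definition:

```python
import typing as t
from dataclasses import dataclass, field


@dataclass
class SketchMetric:
    """Hypothetical stand-in for euroeval.metrics.Metric, not the real class."""

    name: str
    postprocessing_fn: t.Callable[[float], tuple[float, str]] = field(
        default_factory=lambda: lambda raw: (100 * raw, f"{raw:.2%}")
    )


accuracy = SketchMetric(name="accuracy")
score, formatted = accuracy.postprocessing_fn(0.75)
print(f"test_{accuracy.name}", score, formatted)  # test_accuracy 75.0 75.00%
```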