ScandEval 16.10.0__py3-none-any.whl → 16.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,6 +74,14 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
     languages=[DUTCH],
 )
 
+DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+    name="duidelijke-taal",
+    pretty_name="Duidelijke Taal",
+    source="EuroEval/duidelijke-taal",
+    task=SIMPL,
+    languages=[DUTCH],
+)
+
 VALEU_NL_CONFIG = DatasetConfig(
     name="valeu-nl",
     pretty_name="VaLEU-nl",
@@ -161,12 +169,3 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     _labels=["a", "b"],
     unofficial=True,
 )
-
-DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
-    name="duidelijke-taal",
-    pretty_name="Duidelijke Taal",
-    source="EuroEval/duidelijke-taal",
-    task=SIMPL,
-    languages=[DUTCH],
-    unofficial=True,
-)
@@ -27,7 +27,7 @@ SCALA_NN_CONFIG = DatasetConfig(
     pretty_name="ScaLA-nn",
     source="EuroEval/scala-nn",
     task=LA,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 
 NORNE_NB_CONFIG = DatasetConfig(
@@ -43,7 +43,7 @@ NORNE_NN_CONFIG = DatasetConfig(
     pretty_name="NorNE-nn",
     source="EuroEval/norne-nn-mini",
     task=NER,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 
 NORQUAD_CONFIG = DatasetConfig(
@@ -197,7 +197,7 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     pretty_name="MultiWikiQA-nn",
     source="EuroEval/multi-wiki-qa-nn-mini",
     task=RC,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
     unofficial=True,
 )
 
@@ -87,7 +87,7 @@ def log(message: str, level: int, colour: str | None = None) -> None:
 
 
 @cache_arguments("message")
-def log_once(message: str, level: int = logging.INFO, prefix: str = "") -> None:
+def log_once(message: str, level: int, prefix: str = "") -> None:
     """Log a message once.
 
     This is ensured by caching the "message" argument and only logging it the first time
@@ -1,6 +1,7 @@
 """All the Hugging Face metrics used in EuroEval."""
 
 import collections.abc as c
+import os
 import typing as t
 from pathlib import Path
 
@@ -130,7 +131,7 @@ class HuggingFaceMetric(Metric):
                 "__call__ method."
             )
 
-        with no_terminal_output(disable=benchmark_config.verbose):
+        with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
            results = self.metric.compute(
                predictions=predictions, references=references, **self.compute_kwargs
            )
@@ -196,7 +197,7 @@ class SourceBasedMetric(HuggingFaceMetric):
                 f"instead."
             )
 
-        with no_terminal_output(disable=benchmark_config.verbose):
+        with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
            results = self.metric.compute(
                sources=sources,
                predictions=predictions,
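
For context on the two changes above: terminal output from the underlying metric computation is now suppressed unless the `FULL_LOG` environment variable is set to `"1"`, instead of being tied to the `verbose` benchmark setting. A minimal sketch of that gating pattern, with a hypothetical stand-in for EuroEval's `no_terminal_output` context manager:

```python
import contextlib
import io
import os


# Hypothetical stand-in for the `no_terminal_output` helper: swallow stdout
# unless `disable=True` is passed.
@contextlib.contextmanager
def no_terminal_output(disable: bool = False):
    if disable:
        yield
    else:
        with contextlib.redirect_stdout(io.StringIO()):
            yield


# Mirrors the new call sites above: output is only shown when FULL_LOG=1 is set.
with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
    print("only visible when the FULL_LOG environment variable is set to 1")
```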
@@ -5,7 +5,7 @@ import logging
 import typing as t
 from pathlib import Path
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 
 from ..exceptions import InvalidBenchmark
 from ..logging_utils import log
@@ -17,6 +17,8 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
+    from ..types import BatchScoringFunction, ScoringFunction
+
 
 class LLMAsAJudgeMetric(Metric):
     """Use an LLM to judge the quality of the predictions."""
@@ -29,7 +31,8 @@ class LLMAsAJudgeMetric(Metric):
         judge_kwargs: dict[str, t.Any],
         user_prompt: str,
         response_format: t.Type[BaseModel],
-        scoring_fn: t.Callable[[BaseModel | None], float],
+        scoring_fn: ScoringFunction | None = None,
+        batch_scoring_fn: BatchScoringFunction | None = None,
         condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
         system_prompt: str | None = None,
     ) -> None:
@@ -57,6 +60,8 @@ class LLMAsAJudgeMetric(Metric):
                 response.
             scoring_fn:
                 A function that takes the judge's response and returns a score.
+            batch_scoring_fn:
+                A function that takes all judge responses and returns a score.
             condition_formatting_fn (optional):
                 A function to format the condition string before it is included in the
                 user prompt. Defaults to a no-op function that returns the input
@@ -70,7 +75,9 @@ class LLMAsAJudgeMetric(Metric):
         self.judge_kwargs = judge_kwargs
         self.user_prompt = user_prompt
         self.response_format = response_format
-        self.scoring_fn = scoring_fn
+        self.batch_scoring_fn = self._get_batch_scoring_fn(
+            scoring_fn=scoring_fn, batch_scoring_fn=batch_scoring_fn
+        )
         self.condition_formatting_fn = condition_formatting_fn
         self.system_prompt = system_prompt
 
@@ -181,22 +188,36 @@ class LLMAsAJudgeMetric(Metric):
         json_dicts = [
             extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
         ]
-        outputs = [
-            self.response_format.model_validate(obj=json_dict)
-            if json_dict is not None
-            else None
-            for json_dict in json_dicts
-        ]
+        outputs_raw: list[BaseModel | None] = []
+        for json_dict in json_dicts:
+            if json_dict is None:
+                outputs_raw.append(None)
+                continue
+            try:
+                outputs_raw.append(self.response_format.model_validate(obj=json_dict))
+            except ValidationError:
+                outputs_raw.append(None)
+
+        num_none: int = sum(output is None for output in outputs_raw)
+        if num_none:
+            log(
+                f"Could not parse/validate {num_none:,} of {len(outputs_raw):,} judge "
+                f"outputs for metric {self.pretty_name!r}. These will be ignored.",
+                level=logging.DEBUG,
+            )
 
-        # Calculate the scores using the scoring function
-        scores = [self.scoring_fn(output) for output in outputs]
-        if not scores:
+        outputs: list[BaseModel] = [
+            output for output in outputs_raw if output is not None
+        ]
+        if not outputs:
             log(
-                f"No scores were calculated for {self.pretty_name}.",
+                f"No valid judge outputs were produced for metric "
+                f"{self.pretty_name!r}.",
                 level=logging.WARNING,
             )
             return None
-        return sum(scores) / len(scores)
+
+        return self.batch_scoring_fn(outputs=outputs, dataset=dataset)
 
     def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
         """Apply the user prompt to the prediction and condition.
@@ -227,6 +248,49 @@ class LLMAsAJudgeMetric(Metric):
             )
         return self.user_prompt.format(prediction=prediction)
 
+    def _get_batch_scoring_fn(
+        self,
+        scoring_fn: ScoringFunction | None,
+        batch_scoring_fn: BatchScoringFunction | None,
+    ) -> BatchScoringFunction:
+        """Get the batch scoring function.
+
+        Args:
+            scoring_fn:
+                The scoring function to use.
+            batch_scoring_fn:
+                The batch scoring function to use.
+
+        Returns:
+            The batch scoring function.
+
+        Raises:
+            InvalidBenchmark:
+                If both or neither of the scoring functions are provided.
+        """
+        if scoring_fn is not None and batch_scoring_fn is not None:
+            raise InvalidBenchmark(
+                "Both `scoring_fn` and `batch_scoring_fn` are provided. Please "
+                "provide only one of them."
+            )
+        if scoring_fn is not None:
+            scoring_fn_nonnull = scoring_fn
+
+            def batch_fn(
+                outputs: list[BaseModel], dataset: "Dataset | None" = None
+            ) -> float:
+                return sum(scoring_fn_nonnull(output) for output in outputs) / len(
+                    outputs
+                )
+
+            return batch_fn
+        if batch_scoring_fn is not None:
+            return batch_scoring_fn
+        raise InvalidBenchmark(
+            "Neither `scoring_fn` nor `batch_scoring_fn` are provided. Please "
+            "provide one of them."
+        )
+
 
 ### Fluency metric ###
 
@@ -257,5 +321,5 @@ fluency_metric = LLMAsAJudgeMetric(
     "Text: {prediction!r}\n\n"
     "Output your rating as a JSON object with a single key 'fluency'.",
     response_format=Fluency,
-    scoring_fn=lambda output: (output.fluency - 1) / 4.0 if output is not None else 0.0,
+    scoring_fn=lambda output: (output.fluency - 1) / 4.0,
)
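
To illustrate the new scoring hooks above: `LLMAsAJudgeMetric` now accepts either a per-output `scoring_fn` or a batch-level `batch_scoring_fn` (exactly one of the two), and when only a per-output function is given, the wrapper built by `_get_batch_scoring_fn` simply averages it over the valid judge outputs. A rough, self-contained sketch of the two shapes, with a `Fluency` model redefined here purely for illustration:

```python
from pydantic import BaseModel


class Fluency(BaseModel):
    """Illustrative stand-in for the judge's response format."""

    fluency: int


def score_single(output: BaseModel) -> float:
    # Per-output scoring, matching the ScoringFunction protocol: map a 1-5
    # fluency rating onto the range [0, 1].
    assert isinstance(output, Fluency)
    return (output.fluency - 1) / 4.0


def score_batch(outputs: list[BaseModel], dataset: object | None = None) -> float:
    # Batch-level scoring, matching the BatchScoringFunction protocol (the real
    # protocol types `dataset` as a Hugging Face Dataset). Averaging per-output
    # scores reproduces what the generated wrapper does; a custom batch function
    # could instead look at the whole batch, or the dataset, at once.
    return sum(score_single(output) for output in outputs) / len(outputs)
```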
@@ -1,5 +1,6 @@
 """Functions related to the loading of models."""
 
+import logging
 import typing as t
 
 from .benchmark_modules import (
@@ -35,7 +36,7 @@ def load_model(
     Returns:
         The model.
     """
-    log_once(f"\nLoading the model {model_config.model_id}...")
+    log_once(f"\nLoading the model {model_config.model_id}...", level=logging.INFO)
 
     # The order matters; the first model type that matches will be used. For this
     # reason, they have been ordered in terms of the most common model types.
@@ -180,6 +180,17 @@ def extract_labels_from_generation(
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)
 
+        # If the prediction starts with one of the candidate labels (case-insensitive)
+        # then use that one
+        prefix_candidate_labels = [
+            candidate_label
+            for candidate_label in sample_candidate_labels[idx]
+            if predicted_label.lower().startswith(candidate_label.lower())
+        ]
+        if prefix_candidate_labels:
+            new_predicted_labels.append(prefix_candidate_labels[0])
+            continue
+
         # We set the word edit distance weights such that we heavily penalise insertions
         # and substitutions, so that we don't just insert the correct label, but that we
         # want the model to have included the correct label in its output.
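
The block added above short-circuits the label extraction: if the generated text starts with one of the sample's candidate labels (ignoring case), that label is used directly and the edit-distance matching further down is skipped. A small, hypothetical illustration of the check (the output text and labels here are made up):

```python
# Hypothetical model output and candidate labels, just to show the prefix check.
predicted_label = "Positive, because the review praises the battery life."
candidate_labels = ["positive", "negative", "neutral"]

prefix_candidate_labels = [
    candidate_label
    for candidate_label in candidate_labels
    if predicted_label.lower().startswith(candidate_label.lower())
]
print(prefix_candidate_labels[0] if prefix_candidate_labels else "no prefix match")
# -> positive
```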
@@ -235,9 +246,7 @@ def extract_labels_from_generation(
            f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
            "of the samples. This likely means that the model were completely "
            "off in these cases. Since this task does not allow invalid model "
-            "outputs, we have to abort the evaluation. Please re-run the "
-            "evaluation with the `--debug` flag (or `debug=True` if you're using "
-            "the `Benchmarker` API) to see the precise model outputs."
+            "outputs, we have to abort the evaluation."
        )
 
    return new_predicted_labels
scandeval/types.py CHANGED
@@ -13,9 +13,11 @@ except ImportError:
         MistralCommonBackend as MistralCommonTokenizer,
     )
 
+
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from numpy.typing import NDArray
+    from pydantic import BaseModel
 
     from .data_models import BenchmarkConfig, GenerativeModelOutput
 
@@ -73,6 +75,43 @@ class ExtractLabelsFunction(t.Protocol):
         ...
 
 
+class ScoringFunction(t.Protocol):
+    """A function used to compute a score from a single model output."""
+
+    def __call__(self, output: "BaseModel") -> float:
+        """Compute a score from a model output.
+
+        Args:
+            output:
+                A model output (Pydantic model) from the judge.
+
+        Returns:
+            A float score computed from the output.
+        """
+        ...
+
+
+class BatchScoringFunction(t.Protocol):
+    """A function used to compute batch scores from model outputs."""
+
+    def __call__(
+        self, outputs: list["BaseModel"], dataset: "Dataset | None" = None
+    ) -> float:
+        """Compute a batch score from model outputs.
+
+        Args:
+            outputs:
+                List of model outputs (Pydantic models) from the judge.
+            dataset:
+                Optional dataset used for evaluation. Can be used for additional
+                context when computing the score.
+
+        Returns:
+            A float score computed from the batch of outputs.
+        """
+        ...
+
+
 def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
     """Check if an object is a list of integers.
 
scandeval/utils.py CHANGED
@@ -21,6 +21,7 @@ import huggingface_hub as hf_hub
 import numpy as np
 import torch
 from huggingface_hub.errors import LocalTokenNotFoundError
+from requests.exceptions import RequestException
 
 from .caching_utils import cache_arguments
 from .constants import T
@@ -44,10 +45,25 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     Returns:
         The path to the cache directory.
     """
-    # to avoid nesting due to models name containing '/'
-    _model_id = model_id.replace("/", "--")
-    cache_dir_path = Path(cache_dir) / "model_cache" / _model_id
-    return str(cache_dir_path)
+    # If the model ID is a path, we just use that as the cache dir
+    if Path(model_id).is_dir():
+        log_once(
+            f"Since the model {model_id!r} is a local model, we will use the model "
+            "directory directly as the model cache directory.",
+            level=logging.DEBUG,
+        )
+        return model_id
+
+    # Otherwise, we create a cache dir based on the model ID
+    model_cache_dir = Path(
+        cache_dir, "model_cache", model_id.replace("/", "--")
+    ).as_posix()
+    log_once(
+        f"Using the model cache directory {model_cache_dir!r} for the model "
+        f"{model_id!r}.",
+        level=logging.DEBUG,
+    )
+    return model_cache_dir
 
 
 def resolve_model_path(download_dir: str) -> str:
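
In short, `create_model_cache_dir` now distinguishes local model directories from Hub model IDs. A condensed sketch of the decision (the paths below are hypothetical, and the real function also logs its choice via `log_once`):

```python
from pathlib import Path


def sketch_model_cache_dir(cache_dir: str, model_id: str) -> str:
    # A local model directory is used directly as its own cache directory.
    if Path(model_id).is_dir():
        return model_id
    # A Hub model ID gets a dedicated sub-directory, with '/' replaced by '--'
    # to avoid nested directories.
    return Path(cache_dir, "model_cache", model_id.replace("/", "--")).as_posix()


print(sketch_model_cache_dir("/tmp/cache", "org/model"))
# -> /tmp/cache/model_cache/org--model
```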
@@ -65,8 +81,10 @@ def resolve_model_path(download_dir: str) -> str:
         If the model path is not valid, or if required files are missing.
     """
     model_path = Path(download_dir)
+
     # Get the 'path safe' version of the model id, which is the last dir in the path
     model_id_path = model_path.name
+
     # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
     model_path = model_path / f"models--{model_id_path}" / "snapshots"
     if not model_path.exists():
@@ -423,6 +441,13 @@ def get_hf_token(api_key: str | None) -> str | bool:
            level=logging.DEBUG,
        )
        return False
+    except RequestException:
+        log_once(
+            "No Hugging Face API key was set and the connection to Hugging Face "
+            "failed, so no token will be used.",
+            level=logging.DEBUG,
+        )
+        return False
 
 
 def extract_multiple_choice_labels(
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ScandEval
-Version: 16.10.0
+Version: 16.11.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -8,7 +8,7 @@ Author-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 Maintainer-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 License: MIT License
 
-Copyright (c) 2022-2025 Dan Saattrup Smart
+Copyright (c) 2022-2026 Dan Saattrup Smart
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -123,16 +123,17 @@ The easiest way to benchmark pretrained models is via the command line interface
 having installed the package, you can benchmark your favorite model like so:
 
 ```bash
-euroeval --model <model-id>
+euroeval --model <model-id-or-path>
 ```
 
-Here `model` is the HuggingFace model ID, which can be found on the [HuggingFace
-Hub](https://huggingface.co/models). By default this will benchmark the model on all
-the tasks available. If you want to benchmark on a particular task, then use the
-`--task` argument:
+Here `model` is either the HuggingFace model ID, which can be found on the [HuggingFace
+Hub](https://huggingface.co/models), or a local path to a model directory (containing
+the model files as well as the `config.json` file). By default this will benchmark the
+model on all the tasks available. If you want to benchmark on a particular task, then
+use the `--task` argument:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification
+euroeval --model <model-id-or-path> --task sentiment-classification
 ```
 
 
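As a usage illustration of the local-path support described in the rewritten README paragraph above (the directory below is hypothetical), the Python API accepts a path in place of a Hub model ID:

```python
>>> from euroeval import Benchmarker
>>> benchmarker = Benchmarker()
>>> # A local directory containing the model files and its config.json
>>> benchmarker.benchmark(model="/path/to/local-model")
```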
@@ -140,20 +141,20 @@ by setting the `--language` argument. Here we thus benchmark the model on the Da
 sentiment classification task:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification --language da
+euroeval --model <model-id-or-path> --task sentiment-classification --language da
 ```
 
 Multiple models, datasets and/or languages can be specified by just attaching multiple
 arguments. Here is an example with two models:
 
 ```bash
-euroeval --model <model-id1> --model <model-id2>
+euroeval --model <model-id-or-path-1> --model <model-id-or-path-2>
 ```
 
 The specific model version/revision to use can also be added after the suffix '@':
 
 ```bash
-euroeval --model <model-id>@<commit>
+euroeval --model <model-id-or-path>@<commit>
 ```
 
 This can be a branch name, a tag name, or a commit id. It defaults to 'main' for latest.
@@ -173,7 +174,7 @@ model:
 ```python
 >>> from euroeval import Benchmarker
 >>> benchmarker = Benchmarker()
->>> benchmarker.benchmark(model="<model-id>")
+>>> benchmarker.benchmark(model="<model-id-or-path>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
@@ -181,7 +182,7 @@ To benchmark on a specific task and/or language, you simply specify the `task` o
 ```python
 >>> benchmarker.benchmark(
-...     model="<model-id>",
+...     model="<model-id-or-path>",
 ...     task="sentiment-classification",
 ...     language="da",
 ... )
 
@@ -225,7 +226,7 @@ docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
 ```
 
 Here `<euroeval-arguments>` consists of the arguments added to the `euroeval` CLI
-argument. This could for instance be `--model <model-id> --task
+argument. This could for instance be `--model <model-id-or-path> --task
 sentiment-classification`.
 
 ## Benchmarking custom inference APIs
@@ -291,14 +292,14 @@ script. For example to download the model you want and all of the Danish sentime
 classification datasets:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification --language da --download-only
+euroeval --model <model-id-or-path> --task sentiment-classification --language da --download-only
 ```
 
 Or from a script:
 
 ```python
 >>> benchmarker.benchmark(
-...     model="<model-id>",
+...     model="<model-id-or-path>",
 ...     task="sentiment-classification",
 ...     language="da",
 ...     download_only=True,
@@ -346,7 +347,7 @@ MY_CONFIG = DatasetConfig(
 You can then benchmark your custom dataset by simply running
 
 ```bash
-euroeval --dataset my-dataset --model <model-id>
+euroeval --dataset my-dataset --model <model-id-or-path>
 ```
 
 You can also run the benchmark from a Python script, by simply providing your custom
@@ -356,7 +357,7 @@ dataset configuration directly into the `benchmark` method:
 from euroeval import Benchmarker
 
 benchmarker = Benchmarker()
-benchmarker.benchmark(model="<model-id>", dataset=MY_CONFIG)
+benchmarker.benchmark(model="<model-id-or-path>", dataset=MY_CONFIG)
 ```
 
 We have included three convenience tasks to make it easier to set up custom datasets:
@@ -436,7 +437,7 @@ MY_SQL_DATASET = DatasetConfig(
 Again, with this you can benchmark your custom dataset by simply running
 
 ```bash
-euroeval --dataset my-sql-dataset --model <model-id>
+euroeval --dataset my-sql-dataset --model <model-id-or-path>
 ```
 
 ## Reproducing the evaluation datasets
@@ -592,6 +593,13 @@ A huge thank you to all the contributors who have helped make this project a suc
       alt="Contributor avatar for tvosch"
     />
   </a>
+  <a href="https://github.com/Touzen">
+    <img
+      src="https://avatars.githubusercontent.com/u/1416265"
+      width=50
+      alt="Contributor avatar for Touzen"
+    />
+  </a>
 
 ### Contribute to EuroEval
 
@@ -1,34 +1,34 @@
 scandeval/__init__.py,sha256=w4oYw-lbj5ZZ4pv-bHrgZNJ6dlu-WcAWg2e--_UMmeE,4244
 scandeval/benchmark_config_factory.py,sha256=2stmcqKwx0G9pAiA0atunqDchJ9eoezp1Wh3vB41zV4,8745
-scandeval/benchmarker.py,sha256=ARH1ATYAunKNRgIQTDvGqMN_M-ygG0SIQw-hfTOuC6U,53556
+scandeval/benchmarker.py,sha256=Enf3IGYPl2q8j4ViXi5M8_ZaftpCAemTi0Z9HGMv7wc,53841
 scandeval/caching_utils.py,sha256=lLUbkpDdJZy4xodIpwIz5d-WNKGuszbr_d9dyiJ5kZc,2591
 scandeval/callbacks.py,sha256=l8f6Zr8EoHfVFsI1ZnMUK0Y8uZB00Nvaz_I6XDn6avE,2515
 scandeval/cli.py,sha256=zvPGomSdrcjxc4uhmh8SkB4s2d7U9JYhxBJ34vznqUI,9411
 scandeval/constants.py,sha256=wF7fQwaX8yZIypq_eh5RcaQFEhABR7dJxQaAX82b4P8,3766
 scandeval/data_loading.py,sha256=8ryYEmj6di1f9QefGfNajxObQ9iapIGuAsL8m9KzDyI,7050
-scandeval/data_models.py,sha256=vRGKrYr1YFBcH4ngOHrESicbTaIcz-joKz58JN5YMFE,30548
+scandeval/data_models.py,sha256=btAafgRktlRhcOXDIFNp4y0RiR2n5-C_rRmgZCyxmCE,30562
 scandeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 scandeval/exceptions.py,sha256=4-N2OIo5PJ2aciLjagNAVhdHPxpq2QxywbBqJ8lkKj0,5780
 scandeval/finetuning.py,sha256=dTjchPHLFRD65ZrEmtj5TfMTPZ6PODn77t372fgTNwE,11983
 scandeval/generation.py,sha256=ccE-S0jxkM99XziIdeaBbk8yRGv4YBkzZkoabhFCSKA,13382
 scandeval/generation_utils.py,sha256=A6YCiiMrMEUHq5BcVEjsouIKMPGt0sCfPzsJY1GVyk0,20092
 scandeval/languages.py,sha256=gUSosFbvf1eEQHjVsKhXdJ4jiGXC-9lMkOL8AsBG33Q,37295
-scandeval/logging_utils.py,sha256=Pd6DyHTPHCUsjtriomJboiTB35UdXvzxwnNpGTuec-g,9522
+scandeval/logging_utils.py,sha256=Qnni11ngHrjCf_fgkk6lp6gs-tGSgUS3d5zRR83y6ec,9507
 scandeval/model_cache.py,sha256=sjMYW0klnHt2yAFLavDTsp_InxPeSOuVEFo-Rh_31UM,10219
 scandeval/model_config.py,sha256=fxHfgpw-9vj3hwke28DguVGvG9TU06nkTXT0V6KAMpQ,2761
-scandeval/model_loading.py,sha256=bE51L4-AaVgo9h10UsKH_47CB4tOJGU988HxotQ5sYE,2342
+scandeval/model_loading.py,sha256=DsX7et18Epcv8kHATZgwPJnwH17GHmh3JCzrSoI3GAE,2377
 scandeval/scores.py,sha256=9a1XtppFbp8GJFc9JdThGxqBY0YUE7-92oyrlxScjNk,3281
 scandeval/speed_benchmark.py,sha256=VUOvauc9tuAegThNT2g1a-Z1l7DEmKq57dHI4t16o5A,4068
 scandeval/tasks.py,sha256=mgE6Vx_1WD9-aY-yeBxc_09Uyz-tqk69xISMWVYcrsY,5980
 scandeval/tokenisation_utils.py,sha256=Sa8V91J4NDFBF-qbConPsQvUkW_02cJp0gySz_Q3NDo,21191
-scandeval/types.py,sha256=-VNeeDEvlNwfemszpvuGb3Dr9Gu3Eqc6XRmR11HLRi4,3293
-scandeval/utils.py,sha256=BIAP9TWmY_xv6tuCUgmnYifoeodxlz8N2Q0We3frgLU,18389
+scandeval/types.py,sha256=CHQjLzqKYDXPCyZas7rKg6wD1pNiYuaOFMWimrj5H64,4374
+scandeval/utils.py,sha256=E3HQ-8cecJh6NMHF7Ji2YBx6x4tiVKeESglkBeQ0CKg,19167
 scandeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 scandeval/benchmark_modules/base.py,sha256=5YAsCMILKTRXFx_ylGQ7iS5AFKN25iFdkBjj8KzzElw,11445
 scandeval/benchmark_modules/fresh.py,sha256=sG5ae4p1J-GGmVNcVBIxY1xZIAlUwq_pu-9c4uAYU3Y,10734
-scandeval/benchmark_modules/hf.py,sha256=f89E7XoMqsBHhYnMYBgy7ZuXDsAQ7VaIqMfFrHyjg8g,47363
-scandeval/benchmark_modules/litellm.py,sha256=TH35CQhoVinlmfHnAW-XJE21o96YfiIv993m0ASS80E,71590
-scandeval/benchmark_modules/vllm.py,sha256=dloZsXU6_JE9pNbAAnqezKpoVatF7E_c6ivYlZ1emnY,57223
+scandeval/benchmark_modules/hf.py,sha256=bfaPCCBWtRB36TAfJU82WhK_KtdWSuFbSVE81JU1uEY,47900
+scandeval/benchmark_modules/litellm.py,sha256=LPYwCkqpMOMiJzBHQ6mepa94tQZ2POWIpgciVszbOyE,75061
+scandeval/benchmark_modules/vllm.py,sha256=DbGM-_ExTKAhETibb5GOlvG0MguG0JZZHD3cXYP65LM,59754
 scandeval/dataset_configs/__init__.py,sha256=GFI_W9GKd3OSDdhhJzHc8mwoP9b32IHIIyvPBI-hK6k,3223
 scandeval/dataset_configs/albanian.py,sha256=D__dli7JO3yeHzzdJ3FFyUGw-z20f1yI6QLnws-WB8I,1473
 scandeval/dataset_configs/bosnian.py,sha256=golIWqwW1pFwSkuBM1v0yhHDblB2FoJgK24aO7kKm7M,877
@@ -37,7 +37,7 @@ scandeval/dataset_configs/catalan.py,sha256=SXwRJjIcMMN7rVuhFRZSnCGDoMfabW5HFoZO
 scandeval/dataset_configs/croatian.py,sha256=U5oBTjttpWTWonTEzZAf-G3nvQICRQmw6Kla-HWn_5k,1260
 scandeval/dataset_configs/czech.py,sha256=ghv2yNw839G-utll8PQRSjyKYbM5gfoQhFKy664GTCI,1562
 scandeval/dataset_configs/danish.py,sha256=LEKs04vK2KnV0CYheT7FeS-g3iHBvf2bQxyl0D_LbTg,3293
-scandeval/dataset_configs/dutch.py,sha256=j9D6WW5o19cuEVeyx_oosC6dF215L7ZJunIJ6tIah0g,3571
+scandeval/dataset_configs/dutch.py,sha256=OZJmaqGguXY5D9hz0zFNrwGQPRXgxZonctSc8Gsy9sY,3550
 scandeval/dataset_configs/english.py,sha256=nc9nGwxf1tHVMUhQeND61yJbpTO4rJaAusPZlstqtq0,2817
 scandeval/dataset_configs/estonian.py,sha256=bWiKA_dJ7WUE8Z_1YZnSewhi4ZdCQBGJZ7pQxkCwMcU,2757
 scandeval/dataset_configs/faroese.py,sha256=13qYwXonDPWG9Av5MY_NBNTRDglPVKz5_mbz7ZCJ_mo,1247
@@ -50,7 +50,7 @@ scandeval/dataset_configs/icelandic.py,sha256=G2Ibe6oF1NknkQmHqLpoHlysW_8f-0G53D
 scandeval/dataset_configs/italian.py,sha256=qhjAQChnQanzs7EyN1DSAJ4OOU41HAlWqWntQOtbWCw,2761
 scandeval/dataset_configs/latvian.py,sha256=wbwIDieq5Lplng5Jzx9LEqq4d8b5LnNOyCUmT64b4bA,1928
 scandeval/dataset_configs/lithuanian.py,sha256=RPqKwsysO1TYeQuEEsbhzGcSFHDX94lk1hgl1CfQaMU,1724
-scandeval/dataset_configs/norwegian.py,sha256=skKKs4V4-zbd-1lpVUaxKXAjTMpBM6SAU5HZ8kcQ2mI,5454
+scandeval/dataset_configs/norwegian.py,sha256=k70T78rTY3pmmVRxG3i_J1j7td_boFHJetkyITskIL0,5487
 scandeval/dataset_configs/polish.py,sha256=nN_NT8cUK2iv1L_zO_aCYOk2R7ACSDZgvI7e0hIaFAM,2074
 scandeval/dataset_configs/portuguese.py,sha256=m9lEeVtI_yNvIdTIEOn3HFK_ilY2tn3-acC981hjZFM,2401
 scandeval/dataset_configs/romanian.py,sha256=AcDp0mqOHmmv3EodovGEcBmarxjLYsXOPr_X4IQoNTw,1472
@@ -62,8 +62,8 @@ scandeval/dataset_configs/swedish.py,sha256=kpEK29swY7iyUSzUvD9hNf2qwb3d7bHrFwbo
 scandeval/dataset_configs/ukrainian.py,sha256=spbCmCOU27jOfz6FZxqCIfVmDN5l8H-7VCl-k-8eAIo,1527
 scandeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
 scandeval/metrics/base.py,sha256=dUBby-ZzettMjdcjek6rw0JTZMuScX4cQ2Rd6untKHY,2525
-scandeval/metrics/huggingface.py,sha256=W1hPuIGBALOogGN2yTGTJUsylsMII3A66fEe9nB8N2k,9493
-scandeval/metrics/llm_as_a_judge.py,sha256=cZ7ZCuB3633T87MjBtAekrBQ_vYaNv1uTcqnI32gNpQ,9837
+scandeval/metrics/huggingface.py,sha256=W4ktwFSYq0Dy6thSmCRpxztvXDDYZtCWC0xKD6_Tcik,9521
+scandeval/metrics/llm_as_a_judge.py,sha256=UUFk3aL2BZqJ-u9-dzexsoArTxPJTMmHRqb1eWxexaI,12133
 scandeval/metrics/pipeline.py,sha256=GTIqaFkn-nTLU4xBi8-zP1J4Ytv3qeFVuRB4OcuwkOw,10876
 scandeval/metrics/speed.py,sha256=G5hEQcrtqxF070ZZwLDh61iZnq2CSW2o6ZM7zR4lOTY,1298
 scandeval/prompt_templates/__init__.py,sha256=p3CUcSaJiiUm6EQyhceDUjotH7GdyHolMznAn2f44as,519
@@ -79,11 +79,11 @@ scandeval/prompt_templates/token_classification.py,sha256=8Uw34mN2xQ_5es-nz7vCK-
 scandeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 scandeval/task_group_utils/multiple_choice_classification.py,sha256=PWUXeGn-9RsXxdVRYHJASyBVQ8L5Jla981eot0GLooY,7316
 scandeval/task_group_utils/question_answering.py,sha256=tuMwr-RnvJap5jkTrluxC1tfQVS6rKN8_ifNwis-auw,29064
-scandeval/task_group_utils/sequence_classification.py,sha256=VhiggNrB7Gi2x-99MPL0RR2VZRv-wpJerXulgQH6wcU,16556
+scandeval/task_group_utils/sequence_classification.py,sha256=1YAaKn5bY8j9ONPfJZODjaGKVMkA9fQcl51fvBcjeF8,16829
 scandeval/task_group_utils/text_to_text.py,sha256=p6zzjob70qQUpfUOs0LToSzavE1ERqRAHu_727Jb2mM,5476
 scandeval/task_group_utils/token_classification.py,sha256=8dF32KQAYAFnnn7DPHX-yvJmRrMBmT2CyFREacyTwvQ,17321
-scandeval-16.10.0.dist-info/METADATA,sha256=xgQgjZK9T2wSc31Imb1lYvOQjSlEooRLA9oh-suuBr0,23435
-scandeval-16.10.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-scandeval-16.10.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
-scandeval-16.10.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
-scandeval-16.10.0.dist-info/RECORD,,
+scandeval-16.11.0.dist-info/METADATA,sha256=Tf9a-KP53zFhJMuSHkskNm66jNyVzFFb-STy69ur3FQ,23838
+scandeval-16.11.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+scandeval-16.11.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+scandeval-16.11.0.dist-info/licenses/LICENSE,sha256=vb2c84xITVnhnVFsBS8AWXl-4S-KpxN6VMxTqqYlV3s,1080
+scandeval-16.11.0.dist-info/RECORD,,
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2022-2025 Dan Saattrup Smart
+Copyright (c) 2022-2026 Dan Saattrup Smart
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal