ScandEval 16.10.0-py3-none-any.whl → 16.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/benchmark_modules/hf.py +14 -1
- scandeval/benchmark_modules/litellm.py +111 -22
- scandeval/benchmark_modules/vllm.py +116 -60
- scandeval/benchmarker.py +13 -6
- scandeval/data_models.py +2 -2
- scandeval/dataset_configs/dutch.py +8 -9
- scandeval/dataset_configs/norwegian.py +3 -3
- scandeval/logging_utils.py +1 -1
- scandeval/metrics/huggingface.py +3 -2
- scandeval/metrics/llm_as_a_judge.py +79 -15
- scandeval/model_loading.py +2 -1
- scandeval/task_group_utils/sequence_classification.py +12 -3
- scandeval/types.py +39 -0
- scandeval/utils.py +29 -4
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/METADATA +27 -19
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/RECORD +19 -19
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/licenses/LICENSE +1 -1
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/WHEEL +0 -0
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/entry_points.txt +0 -0
scandeval/dataset_configs/dutch.py
CHANGED
@@ -74,6 +74,14 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
     languages=[DUTCH],
 )
 
+DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+    name="duidelijke-taal",
+    pretty_name="Duidelijke Taal",
+    source="EuroEval/duidelijke-taal",
+    task=SIMPL,
+    languages=[DUTCH],
+)
+
 VALEU_NL_CONFIG = DatasetConfig(
     name="valeu-nl",
     pretty_name="VaLEU-nl",
@@ -161,12 +169,3 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     _labels=["a", "b"],
     unofficial=True,
 )
-
-DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
-    name="duidelijke-taal",
-    pretty_name="Duidelijke Taal",
-    source="EuroEval/duidelijke-taal",
-    task=SIMPL,
-    languages=[DUTCH],
-    unofficial=True,
-)

scandeval/dataset_configs/norwegian.py
CHANGED
@@ -27,7 +27,7 @@ SCALA_NN_CONFIG = DatasetConfig(
     pretty_name="ScaLA-nn",
     source="EuroEval/scala-nn",
     task=LA,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 
 NORNE_NB_CONFIG = DatasetConfig(
@@ -43,7 +43,7 @@ NORNE_NN_CONFIG = DatasetConfig(
     pretty_name="NorNE-nn",
     source="EuroEval/norne-nn-mini",
     task=NER,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 
 NORQUAD_CONFIG = DatasetConfig(
@@ -197,7 +197,7 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     pretty_name="MultiWikiQA-nn",
     source="EuroEval/multi-wiki-qa-nn-mini",
     task=RC,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
     unofficial=True,
 )
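In short: the Duidelijke Taal simplification dataset moves above WINOGRANDE and drops its `unofficial=True` flag, making it an official Dutch dataset, and the three Nynorsk datasets now also count towards Norwegian. A minimal sketch of running the promoted dataset (assuming the `Benchmarker` API from the README, and assuming a dataset `name` string is accepted like the CLI's `--dataset` flag):

```python
from euroeval import Benchmarker

# Evaluate a model on the newly promoted Dutch simplification dataset.
# "duidelijke-taal" is the `name` field of DUIDELIJKE_TAAL_NL_CONFIG above.
benchmarker = Benchmarker()
benchmarker.benchmark(model="<model-id-or-path>", dataset="duidelijke-taal")
```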
scandeval/logging_utils.py
CHANGED
@@ -87,7 +87,7 @@ def log(message: str, level: int, colour: str | None = None) -> None:
 
 
 @cache_arguments("message")
-def log_once(message: str, level: int
+def log_once(message: str, level: int, prefix: str = "") -> None:
     """Log a message once.
 
     This is ensured by caching the "message" argument and only logging it the first time
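The `log_once` signature gains an optional `prefix` parameter, and `level` appears to no longer carry a default (the `model_loading.py` call site below now passes `level=logging.INFO` explicitly). A hedged usage sketch (the messages and the prefix behaviour are assumptions, not from the source):

```python
import logging

from scandeval.logging_utils import log_once

# Logged at most once per unique message, since the "message" argument
# is cached by the @cache_arguments("message") decorator.
log_once("Falling back to eager attention.", level=logging.INFO)

# Hypothetical use of the new `prefix` parameter; presumably it is
# prepended to the message without becoming part of the cache key.
log_once("Falling back to eager attention.", level=logging.INFO, prefix="[vllm] ")
```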
scandeval/metrics/huggingface.py
CHANGED
@@ -1,6 +1,7 @@
 """All the Hugging Face metrics used in EuroEval."""
 
 import collections.abc as c
+import os
 import typing as t
 from pathlib import Path
 
@@ -130,7 +131,7 @@ class HuggingFaceMetric(Metric):
             "__call__ method."
         )
 
-        with no_terminal_output(disable=
+        with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
             results = self.metric.compute(
                 predictions=predictions, references=references, **self.compute_kwargs
             )
@@ -196,7 +197,7 @@ class SourceBasedMetric(HuggingFaceMetric):
             f"instead."
         )
 
-        with no_terminal_output(disable=
+        with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
             results = self.metric.compute(
                 sources=sources,
                 predictions=predictions,
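Both call sites now tie the suppression of terminal output to a `FULL_LOG` environment variable, so the underlying `evaluate` metric's own logging can be surfaced for debugging. A minimal sketch (set before any metric is computed):

```python
import os

# Show the full terminal output of Hugging Face metric computations.
# Any value other than the string "1" keeps the suppression in place,
# per the `os.getenv("FULL_LOG", "0") == "1"` check above.
os.environ["FULL_LOG"] = "1"
```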
scandeval/metrics/llm_as_a_judge.py
CHANGED
@@ -5,7 +5,7 @@ import logging
 import typing as t
 from pathlib import Path
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 
 from ..exceptions import InvalidBenchmark
 from ..logging_utils import log
@@ -17,6 +17,8 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
+    from ..types import BatchScoringFunction, ScoringFunction
+
 
 class LLMAsAJudgeMetric(Metric):
     """Use an LLM to judge the quality of the predictions."""
@@ -29,7 +31,8 @@ class LLMAsAJudgeMetric(Metric):
         judge_kwargs: dict[str, t.Any],
         user_prompt: str,
         response_format: t.Type[BaseModel],
-        scoring_fn:
+        scoring_fn: ScoringFunction | None = None,
+        batch_scoring_fn: BatchScoringFunction | None = None,
         condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
         system_prompt: str | None = None,
     ) -> None:
@@ -57,6 +60,8 @@
                 response.
             scoring_fn:
                 A function that takes the judge's response and returns a score.
+            batch_scoring_fn:
+                A function that takes all judge responses and returns a score.
             condition_formatting_fn (optional):
                 A function to format the condition string before it is included in the
                 user prompt. Defaults to a no-op function that returns the input
@@ -70,7 +75,9 @@
         self.judge_kwargs = judge_kwargs
         self.user_prompt = user_prompt
         self.response_format = response_format
-        self.
+        self.batch_scoring_fn = self._get_batch_scoring_fn(
+            scoring_fn=scoring_fn, batch_scoring_fn=batch_scoring_fn
+        )
         self.condition_formatting_fn = condition_formatting_fn
         self.system_prompt = system_prompt
 
@@ -181,22 +188,36 @@
         json_dicts = [
             extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
         ]
-
-
-        if json_dict is
-
-
-
+        outputs_raw: list[BaseModel | None] = []
+        for json_dict in json_dicts:
+            if json_dict is None:
+                outputs_raw.append(None)
+                continue
+            try:
+                outputs_raw.append(self.response_format.model_validate(obj=json_dict))
+            except ValidationError:
+                outputs_raw.append(None)
+
+        num_none: int = sum(output is None for output in outputs_raw)
+        if num_none:
+            log(
+                f"Could not parse/validate {num_none:,} of {len(outputs_raw):,} judge "
+                f"outputs for metric {self.pretty_name!r}. These will be ignored.",
+                level=logging.DEBUG,
+            )
 
-
-
-
+        outputs: list[BaseModel] = [
+            output for output in outputs_raw if output is not None
+        ]
+        if not outputs:
             log(
-                f"No
+                f"No valid judge outputs were produced for metric "
+                f"{self.pretty_name!r}.",
                 level=logging.WARNING,
             )
             return None
-
+
+        return self.batch_scoring_fn(outputs=outputs, dataset=dataset)
 
     def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
         """Apply the user prompt to the prediction and condition.
@@ -227,6 +248,49 @@
         )
         return self.user_prompt.format(prediction=prediction)
 
+    def _get_batch_scoring_fn(
+        self,
+        scoring_fn: ScoringFunction | None,
+        batch_scoring_fn: BatchScoringFunction | None,
+    ) -> BatchScoringFunction:
+        """Get the batch scoring function.
+
+        Args:
+            scoring_fn:
+                The scoring function to use.
+            batch_scoring_fn:
+                The batch scoring function to use.
+
+        Returns:
+            The batch scoring function.
+
+        Raises:
+            InvalidBenchmark:
+                If both or neither of the scoring functions are provided.
+        """
+        if scoring_fn is not None and batch_scoring_fn is not None:
+            raise InvalidBenchmark(
+                "Both `scoring_fn` and `batch_scoring_fn` are provided. Please "
+                "provide only one of them."
+            )
+        if scoring_fn is not None:
+            scoring_fn_nonnull = scoring_fn
+
+            def batch_fn(
+                outputs: list[BaseModel], dataset: "Dataset | None" = None
+            ) -> float:
+                return sum(scoring_fn_nonnull(output) for output in outputs) / len(
+                    outputs
+                )
+
+            return batch_fn
+        if batch_scoring_fn is not None:
+            return batch_scoring_fn
+        raise InvalidBenchmark(
+            "Neither `scoring_fn` nor `batch_scoring_fn` are provided. Please "
+            "provide one of them."
+        )
+
 
 ### Fluency metric ###
 
@@ -257,5 +321,5 @@ fluency_metric = LLMAsAJudgeMetric(
     "Text: {prediction!r}\n\n"
     "Output your rating as a JSON object with a single key 'fluency'.",
     response_format=Fluency,
-    scoring_fn=lambda output: (output.fluency - 1) / 4.0
+    scoring_fn=lambda output: (output.fluency - 1) / 4.0,
 )
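Taken together with the `types.py` changes below, the constructor now accepts exactly one of `scoring_fn` (scored per output, then averaged) or `batch_scoring_fn` (scored over all outputs at once, with optional access to the evaluation dataset); passing both or neither raises `InvalidBenchmark`. A hedged sketch of a corpus-level judge scorer (the `Accepted` response format and `acceptance_rate` function are illustrative, not from the source):

```python
from pydantic import BaseModel


class Accepted(BaseModel):
    """Illustrative judge response format with a single boolean field."""

    accepted: bool


def acceptance_rate(outputs: list[BaseModel], dataset=None) -> float:
    """Corpus-level score: the fraction of judge outputs marked as accepted."""
    return sum(output.accepted for output in outputs) / len(outputs)


# Sanity check of the batch function on its own:
outputs = [Accepted(accepted=True), Accepted(accepted=True), Accepted(accepted=False)]
assert acceptance_rate(outputs) == 2 / 3
```

Existing metrics such as `fluency_metric` keep working unchanged, since they pass only `scoring_fn`, which gets wrapped into an averaging batch function.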
scandeval/model_loading.py
CHANGED
@@ -1,5 +1,6 @@
 """Functions related to the loading of models."""
 
+import logging
 import typing as t
 
 from .benchmark_modules import (
@@ -35,7 +36,7 @@ def load_model(
     Returns:
         The model.
     """
-    log_once(f"\nLoading the model {model_config.model_id}...")
+    log_once(f"\nLoading the model {model_config.model_id}...", level=logging.INFO)
 
     # The order matters; the first model type that matches will be used. For this
     # reason, they have been ordered in terms of the most common model types.
scandeval/task_group_utils/sequence_classification.py
CHANGED
@@ -180,6 +180,17 @@ def extract_labels_from_generation(
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)
 
+        # If the prediction starts with one of the candidate labels (case-insensitive)
+        # then use that one
+        prefix_candidate_labels = [
+            candidate_label
+            for candidate_label in sample_candidate_labels[idx]
+            if predicted_label.lower().startswith(candidate_label.lower())
+        ]
+        if prefix_candidate_labels:
+            new_predicted_labels.append(prefix_candidate_labels[0])
+            continue
+
         # We set the word edit distance weights such that we heavily penalise insertions
         # and substitutions, so that we don't just insert the correct label, but that we
         # want the model to have included the correct label in its output.
@@ -235,9 +246,7 @@
             f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
             "of the samples. This likely means that the model were completely "
             "off in these cases. Since this task does not allow invalid model "
-            "outputs, we have to abort the evaluation.
-            "evaluation with the `--debug` flag (or `debug=True` if you're using "
-            "the `Benchmarker` API) to see the precise model outputs."
+            "outputs, we have to abort the evaluation."
         )
 
     return new_predicted_labels
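The added prefix check short-circuits the word-edit-distance fallback: if a generation starts with a candidate label, case-insensitively, that label is used directly. An illustrative sketch of the rule in isolation (a hypothetical helper, not the module's API):

```python
def match_by_prefix(predicted: str, candidate_labels: list[str]) -> str | None:
    """Return the first candidate label the prediction starts with, if any."""
    matches = [
        label
        for label in candidate_labels
        if predicted.lower().startswith(label.lower())
    ]
    return matches[0] if matches else None


# "Positive, because the review praises the product" now maps straight to
# "positive" instead of going through the edit-distance fallback.
assert match_by_prefix(
    "Positive, because the review praises the product",
    ["positive", "negative"],
) == "positive"
```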
scandeval/types.py
CHANGED
@@ -13,9 +13,11 @@ except ImportError:
         MistralCommonBackend as MistralCommonTokenizer,
     )
 
+
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from numpy.typing import NDArray
+    from pydantic import BaseModel
 
     from .data_models import BenchmarkConfig, GenerativeModelOutput
 
@@ -73,6 +75,43 @@ class ExtractLabelsFunction(t.Protocol):
         ...
 
 
+class ScoringFunction(t.Protocol):
+    """A function used to compute a score from a single model output."""
+
+    def __call__(self, output: "BaseModel") -> float:
+        """Compute a score from a model output.
+
+        Args:
+            output:
+                A model output (Pydantic model) from the judge.
+
+        Returns:
+            A float score computed from the output.
+        """
+        ...
+
+
+class BatchScoringFunction(t.Protocol):
+    """A function used to compute batch scores from model outputs."""
+
+    def __call__(
+        self, outputs: list["BaseModel"], dataset: "Dataset | None" = None
+    ) -> float:
+        """Compute a batch score from model outputs.
+
+        Args:
+            outputs:
+                List of model outputs (Pydantic models) from the judge.
+            dataset:
+                Optional dataset used for evaluation. Can be used for additional
+                context when computing the score.
+
+        Returns:
+            A float score computed from the batch of outputs.
+        """
+        ...
+
+
 def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
     """Check if an object is a list of integers.
 
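Since these are `typing.Protocol` classes, any callable with a matching signature satisfies them structurally; no subclassing or registration is needed. A brief sketch (the `Fluency` model mirrors the one used by `fluency_metric` in `llm_as_a_judge.py`; the function names are illustrative):

```python
from pydantic import BaseModel


class Fluency(BaseModel):
    fluency: int


# Satisfies ScoringFunction: one judge output in, one float out. This mirrors
# the lambda passed as `scoring_fn` to `fluency_metric`.
def fluency_score(output: BaseModel) -> float:
    return (output.fluency - 1) / 4.0


# Satisfies BatchScoringFunction: all outputs (plus, optionally, the
# evaluation dataset) in, one corpus-level float out.
def mean_fluency(outputs: list[BaseModel], dataset=None) -> float:
    return sum(fluency_score(output) for output in outputs) / len(outputs)
```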
scandeval/utils.py
CHANGED
@@ -21,6 +21,7 @@ import huggingface_hub as hf_hub
 import numpy as np
 import torch
 from huggingface_hub.errors import LocalTokenNotFoundError
+from requests.exceptions import RequestException
 
 from .caching_utils import cache_arguments
 from .constants import T
@@ -44,10 +45,25 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     Returns:
         The path to the cache directory.
     """
-    #
-
-
-
+    # If the model ID is a path, we just use that as the cache dir
+    if Path(model_id).is_dir():
+        log_once(
+            f"Since the model {model_id!r} is a local model, we will use the model "
+            "directory directly as the model cache directory.",
+            level=logging.DEBUG,
+        )
+        return model_id
+
+    # Otherwise, we create a cache dir based on the model ID
+    model_cache_dir = Path(
+        cache_dir, "model_cache", model_id.replace("/", "--")
+    ).as_posix()
+    log_once(
+        f"Using the model cache directory {model_cache_dir!r} for the model "
+        f"{model_id!r}.",
+        level=logging.DEBUG,
+    )
+    return model_cache_dir
 
 
 def resolve_model_path(download_dir: str) -> str:
@@ -65,8 +81,10 @@ def resolve_model_path(download_dir: str) -> str:
         If the model path is not valid, or if required files are missing.
     """
     model_path = Path(download_dir)
+
     # Get the 'path safe' version of the model id, which is the last dir in the path
     model_id_path = model_path.name
+
     # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
     model_path = model_path / f"models--{model_id_path}" / "snapshots"
     if not model_path.exists():
@@ -423,6 +441,13 @@ def get_hf_token(api_key: str | None) -> str | bool:
             level=logging.DEBUG,
         )
         return False
+    except RequestException:
+        log_once(
+            "No Hugging Face API key was set and the connection to Hugging Face "
+            "failed, so no token will be used.",
+            level=logging.DEBUG,
+        )
+        return False
 
 
 def extract_multiple_choice_labels(
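The rewritten `create_model_cache_dir` has two branches: an existing local directory is returned unchanged as its own cache dir, while a Hub-style model ID is mapped into `<cache_dir>/model_cache/` with `/` replaced by `--`. A sketch of the expected results (the inputs and paths are illustrative):

```python
from scandeval.utils import create_model_cache_dir

# A Hub model ID is turned into a subdirectory of the cache dir, e.g.
# ".euroeval_cache/model_cache/mistralai--Mistral-7B-v0.1".
create_model_cache_dir(
    cache_dir=".euroeval_cache", model_id="mistralai/Mistral-7B-v0.1"
)

# A local model directory (one that actually exists on disk) is returned
# unchanged, so the files already present are used directly as the cache.
create_model_cache_dir(cache_dir=".euroeval_cache", model_id="/models/my-model")
```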
{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ScandEval
-Version: 16.10.0
+Version: 16.11.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -8,7 +8,7 @@ Author-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 Maintainer-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 License: MIT License
 
-Copyright (c) 2022-
+Copyright (c) 2022-2026 Dan Saattrup Smart
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -123,16 +123,17 @@ The easiest way to benchmark pretrained models is via the command line interface
 having installed the package, you can benchmark your favorite model like so:
 
 ```bash
-euroeval --model <model-id>
+euroeval --model <model-id-or-path>
 ```
 
-Here `model` is the HuggingFace model ID, which can be found on the [HuggingFace
-Hub](https://huggingface.co/models)
-the
-
+Here `model` is either the HuggingFace model ID, which can be found on the [HuggingFace
+Hub](https://huggingface.co/models), or a local path to a model directory (containing
+the model files as well as the `config.json` file). By default this will benchmark the
+model on all the tasks available. If you want to benchmark on a particular task, then
+use the `--task` argument:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification
+euroeval --model <model-id-or-path> --task sentiment-classification
 ```
@@ -140,20 +141,20 @@ by setting the `--language` argument. Here we thus benchmark the model on the Da
 sentiment classification task:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification --language da
+euroeval --model <model-id-or-path> --task sentiment-classification --language da
 ```
 
 Multiple models, datasets and/or languages can be specified by just attaching multiple
 arguments. Here is an example with two models:
 
 ```bash
-euroeval --model <model-id-1> --model <model-id-2>
+euroeval --model <model-id-or-path-1> --model <model-id-or-path-2>
 ```
 
 The specific model version/revision to use can also be added after the suffix '@':
 
 ```bash
-euroeval --model <model-id>@<commit>
+euroeval --model <model-id-or-path>@<commit>
 ```
 
 This can be a branch name, a tag name, or a commit id. It defaults to 'main' for latest.
@@ -173,7 +174,7 @@ model:
 ```python
 >>> from euroeval import Benchmarker
 >>> benchmarker = Benchmarker()
->>> benchmarker.benchmark(model="<model-id>")
+>>> benchmarker.benchmark(model="<model-id-or-path>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
@@ -181,7 +182,7 @@ To benchmark on a specific task and/or language, you simply specify the `task` o
 
 ```python
 >>> benchmarker.benchmark(
-...     model="<model-id>",
+...     model="<model-id-or-path>",
 ...     task="sentiment-classification",
 ...     language="da",
 ... )
@@ -225,7 +226,7 @@ docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
 ```
 
 Here `<euroeval-arguments>` consists of the arguments added to the `euroeval` CLI
-argument. This could for instance be `--model <model-id> --task
+argument. This could for instance be `--model <model-id-or-path> --task
 sentiment-classification`.
 
 ## Benchmarking custom inference APIs
@@ -291,14 +292,14 @@ script. For example to download the model you want and all of the Danish sentime
 classification datasets:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification --language da --download-only
+euroeval --model <model-id-or-path> --task sentiment-classification --language da --download-only
 ```
 
 Or from a script:
 
 ```python
 >>> benchmarker.benchmark(
-...     model="<model-id>",
+...     model="<model-id-or-path>",
 ...     task="sentiment-classification",
 ...     language="da",
 ...     download_only=True,
@@ -346,7 +347,7 @@ MY_CONFIG = DatasetConfig(
 You can then benchmark your custom dataset by simply running
 
 ```bash
-euroeval --dataset my-dataset --model <model-id>
+euroeval --dataset my-dataset --model <model-id-or-path>
 ```
 
 You can also run the benchmark from a Python script, by simply providing your custom
@@ -356,7 +357,7 @@ dataset configuration directly into the `benchmark` method:
 from euroeval import Benchmarker
 
 benchmarker = Benchmarker()
-benchmarker.benchmark(model="<model-id>", dataset=MY_CONFIG)
+benchmarker.benchmark(model="<model-id-or-path>", dataset=MY_CONFIG)
 ```
 
 We have included three convenience tasks to make it easier to set up custom datasets:
@@ -436,7 +437,7 @@ MY_SQL_DATASET = DatasetConfig(
 Again, with this you can benchmark your custom dataset by simply running
 
 ```bash
-euroeval --dataset my-sql-dataset --model <model-id>
+euroeval --dataset my-sql-dataset --model <model-id-or-path>
 ```
 
 ## Reproducing the evaluation datasets
@@ -592,6 +593,13 @@ A huge thank you to all the contributors who have helped make this project a suc
       alt="Contributor avatar for tvosch"
     />
   </a>
+  <a href="https://github.com/Touzen">
+    <img
+      src="https://avatars.githubusercontent.com/u/1416265"
+      width=50
+      alt="Contributor avatar for Touzen"
+    />
+  </a>
 
 ### Contribute to EuroEval
 
{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/RECORD
CHANGED
@@ -1,34 +1,34 @@
 scandeval/__init__.py,sha256=w4oYw-lbj5ZZ4pv-bHrgZNJ6dlu-WcAWg2e--_UMmeE,4244
 scandeval/benchmark_config_factory.py,sha256=2stmcqKwx0G9pAiA0atunqDchJ9eoezp1Wh3vB41zV4,8745
-scandeval/benchmarker.py,sha256=
+scandeval/benchmarker.py,sha256=Enf3IGYPl2q8j4ViXi5M8_ZaftpCAemTi0Z9HGMv7wc,53841
 scandeval/caching_utils.py,sha256=lLUbkpDdJZy4xodIpwIz5d-WNKGuszbr_d9dyiJ5kZc,2591
 scandeval/callbacks.py,sha256=l8f6Zr8EoHfVFsI1ZnMUK0Y8uZB00Nvaz_I6XDn6avE,2515
 scandeval/cli.py,sha256=zvPGomSdrcjxc4uhmh8SkB4s2d7U9JYhxBJ34vznqUI,9411
 scandeval/constants.py,sha256=wF7fQwaX8yZIypq_eh5RcaQFEhABR7dJxQaAX82b4P8,3766
 scandeval/data_loading.py,sha256=8ryYEmj6di1f9QefGfNajxObQ9iapIGuAsL8m9KzDyI,7050
-scandeval/data_models.py,sha256=
+scandeval/data_models.py,sha256=btAafgRktlRhcOXDIFNp4y0RiR2n5-C_rRmgZCyxmCE,30562
 scandeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 scandeval/exceptions.py,sha256=4-N2OIo5PJ2aciLjagNAVhdHPxpq2QxywbBqJ8lkKj0,5780
 scandeval/finetuning.py,sha256=dTjchPHLFRD65ZrEmtj5TfMTPZ6PODn77t372fgTNwE,11983
 scandeval/generation.py,sha256=ccE-S0jxkM99XziIdeaBbk8yRGv4YBkzZkoabhFCSKA,13382
 scandeval/generation_utils.py,sha256=A6YCiiMrMEUHq5BcVEjsouIKMPGt0sCfPzsJY1GVyk0,20092
 scandeval/languages.py,sha256=gUSosFbvf1eEQHjVsKhXdJ4jiGXC-9lMkOL8AsBG33Q,37295
-scandeval/logging_utils.py,sha256=
+scandeval/logging_utils.py,sha256=Qnni11ngHrjCf_fgkk6lp6gs-tGSgUS3d5zRR83y6ec,9507
 scandeval/model_cache.py,sha256=sjMYW0klnHt2yAFLavDTsp_InxPeSOuVEFo-Rh_31UM,10219
 scandeval/model_config.py,sha256=fxHfgpw-9vj3hwke28DguVGvG9TU06nkTXT0V6KAMpQ,2761
-scandeval/model_loading.py,sha256=
+scandeval/model_loading.py,sha256=DsX7et18Epcv8kHATZgwPJnwH17GHmh3JCzrSoI3GAE,2377
 scandeval/scores.py,sha256=9a1XtppFbp8GJFc9JdThGxqBY0YUE7-92oyrlxScjNk,3281
 scandeval/speed_benchmark.py,sha256=VUOvauc9tuAegThNT2g1a-Z1l7DEmKq57dHI4t16o5A,4068
 scandeval/tasks.py,sha256=mgE6Vx_1WD9-aY-yeBxc_09Uyz-tqk69xISMWVYcrsY,5980
 scandeval/tokenisation_utils.py,sha256=Sa8V91J4NDFBF-qbConPsQvUkW_02cJp0gySz_Q3NDo,21191
-scandeval/types.py,sha256
-scandeval/utils.py,sha256=
+scandeval/types.py,sha256=CHQjLzqKYDXPCyZas7rKg6wD1pNiYuaOFMWimrj5H64,4374
+scandeval/utils.py,sha256=E3HQ-8cecJh6NMHF7Ji2YBx6x4tiVKeESglkBeQ0CKg,19167
 scandeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 scandeval/benchmark_modules/base.py,sha256=5YAsCMILKTRXFx_ylGQ7iS5AFKN25iFdkBjj8KzzElw,11445
 scandeval/benchmark_modules/fresh.py,sha256=sG5ae4p1J-GGmVNcVBIxY1xZIAlUwq_pu-9c4uAYU3Y,10734
-scandeval/benchmark_modules/hf.py,sha256=
-scandeval/benchmark_modules/litellm.py,sha256=
-scandeval/benchmark_modules/vllm.py,sha256=
+scandeval/benchmark_modules/hf.py,sha256=bfaPCCBWtRB36TAfJU82WhK_KtdWSuFbSVE81JU1uEY,47900
+scandeval/benchmark_modules/litellm.py,sha256=LPYwCkqpMOMiJzBHQ6mepa94tQZ2POWIpgciVszbOyE,75061
+scandeval/benchmark_modules/vllm.py,sha256=DbGM-_ExTKAhETibb5GOlvG0MguG0JZZHD3cXYP65LM,59754
 scandeval/dataset_configs/__init__.py,sha256=GFI_W9GKd3OSDdhhJzHc8mwoP9b32IHIIyvPBI-hK6k,3223
 scandeval/dataset_configs/albanian.py,sha256=D__dli7JO3yeHzzdJ3FFyUGw-z20f1yI6QLnws-WB8I,1473
 scandeval/dataset_configs/bosnian.py,sha256=golIWqwW1pFwSkuBM1v0yhHDblB2FoJgK24aO7kKm7M,877
@@ -37,7 +37,7 @@ scandeval/dataset_configs/catalan.py,sha256=SXwRJjIcMMN7rVuhFRZSnCGDoMfabW5HFoZO
 scandeval/dataset_configs/croatian.py,sha256=U5oBTjttpWTWonTEzZAf-G3nvQICRQmw6Kla-HWn_5k,1260
 scandeval/dataset_configs/czech.py,sha256=ghv2yNw839G-utll8PQRSjyKYbM5gfoQhFKy664GTCI,1562
 scandeval/dataset_configs/danish.py,sha256=LEKs04vK2KnV0CYheT7FeS-g3iHBvf2bQxyl0D_LbTg,3293
-scandeval/dataset_configs/dutch.py,sha256=
+scandeval/dataset_configs/dutch.py,sha256=OZJmaqGguXY5D9hz0zFNrwGQPRXgxZonctSc8Gsy9sY,3550
 scandeval/dataset_configs/english.py,sha256=nc9nGwxf1tHVMUhQeND61yJbpTO4rJaAusPZlstqtq0,2817
 scandeval/dataset_configs/estonian.py,sha256=bWiKA_dJ7WUE8Z_1YZnSewhi4ZdCQBGJZ7pQxkCwMcU,2757
 scandeval/dataset_configs/faroese.py,sha256=13qYwXonDPWG9Av5MY_NBNTRDglPVKz5_mbz7ZCJ_mo,1247
@@ -50,7 +50,7 @@ scandeval/dataset_configs/icelandic.py,sha256=G2Ibe6oF1NknkQmHqLpoHlysW_8f-0G53D
 scandeval/dataset_configs/italian.py,sha256=qhjAQChnQanzs7EyN1DSAJ4OOU41HAlWqWntQOtbWCw,2761
 scandeval/dataset_configs/latvian.py,sha256=wbwIDieq5Lplng5Jzx9LEqq4d8b5LnNOyCUmT64b4bA,1928
 scandeval/dataset_configs/lithuanian.py,sha256=RPqKwsysO1TYeQuEEsbhzGcSFHDX94lk1hgl1CfQaMU,1724
-scandeval/dataset_configs/norwegian.py,sha256=
+scandeval/dataset_configs/norwegian.py,sha256=k70T78rTY3pmmVRxG3i_J1j7td_boFHJetkyITskIL0,5487
 scandeval/dataset_configs/polish.py,sha256=nN_NT8cUK2iv1L_zO_aCYOk2R7ACSDZgvI7e0hIaFAM,2074
 scandeval/dataset_configs/portuguese.py,sha256=m9lEeVtI_yNvIdTIEOn3HFK_ilY2tn3-acC981hjZFM,2401
 scandeval/dataset_configs/romanian.py,sha256=AcDp0mqOHmmv3EodovGEcBmarxjLYsXOPr_X4IQoNTw,1472
@@ -62,8 +62,8 @@ scandeval/dataset_configs/swedish.py,sha256=kpEK29swY7iyUSzUvD9hNf2qwb3d7bHrFwbo
 scandeval/dataset_configs/ukrainian.py,sha256=spbCmCOU27jOfz6FZxqCIfVmDN5l8H-7VCl-k-8eAIo,1527
 scandeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
 scandeval/metrics/base.py,sha256=dUBby-ZzettMjdcjek6rw0JTZMuScX4cQ2Rd6untKHY,2525
-scandeval/metrics/huggingface.py,sha256=
-scandeval/metrics/llm_as_a_judge.py,sha256=
+scandeval/metrics/huggingface.py,sha256=W4ktwFSYq0Dy6thSmCRpxztvXDDYZtCWC0xKD6_Tcik,9521
+scandeval/metrics/llm_as_a_judge.py,sha256=UUFk3aL2BZqJ-u9-dzexsoArTxPJTMmHRqb1eWxexaI,12133
 scandeval/metrics/pipeline.py,sha256=GTIqaFkn-nTLU4xBi8-zP1J4Ytv3qeFVuRB4OcuwkOw,10876
 scandeval/metrics/speed.py,sha256=G5hEQcrtqxF070ZZwLDh61iZnq2CSW2o6ZM7zR4lOTY,1298
 scandeval/prompt_templates/__init__.py,sha256=p3CUcSaJiiUm6EQyhceDUjotH7GdyHolMznAn2f44as,519
@@ -79,11 +79,11 @@ scandeval/prompt_templates/token_classification.py,sha256=8Uw34mN2xQ_5es-nz7vCK-
 scandeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 scandeval/task_group_utils/multiple_choice_classification.py,sha256=PWUXeGn-9RsXxdVRYHJASyBVQ8L5Jla981eot0GLooY,7316
 scandeval/task_group_utils/question_answering.py,sha256=tuMwr-RnvJap5jkTrluxC1tfQVS6rKN8_ifNwis-auw,29064
-scandeval/task_group_utils/sequence_classification.py,sha256=
+scandeval/task_group_utils/sequence_classification.py,sha256=1YAaKn5bY8j9ONPfJZODjaGKVMkA9fQcl51fvBcjeF8,16829
 scandeval/task_group_utils/text_to_text.py,sha256=p6zzjob70qQUpfUOs0LToSzavE1ERqRAHu_727Jb2mM,5476
 scandeval/task_group_utils/token_classification.py,sha256=8dF32KQAYAFnnn7DPHX-yvJmRrMBmT2CyFREacyTwvQ,17321
-scandeval-16.
-scandeval-16.
-scandeval-16.
-scandeval-16.
-scandeval-16.
+scandeval-16.11.0.dist-info/METADATA,sha256=Tf9a-KP53zFhJMuSHkskNm66jNyVzFFb-STy69ur3FQ,23838
+scandeval-16.11.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+scandeval-16.11.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+scandeval-16.11.0.dist-info/licenses/LICENSE,sha256=vb2c84xITVnhnVFsBS8AWXl-4S-KpxN6VMxTqqYlV3s,1080
+scandeval-16.11.0.dist-info/RECORD,,

{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/WHEEL
File without changes

{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/entry_points.txt
File without changes