EuroEval 15.3.1__py3-none-any.whl → 15.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +11 -0
- euroeval/benchmark_config_factory.py +2 -2
- euroeval/benchmark_modules/hf.py +2 -3
- euroeval/benchmark_modules/litellm.py +124 -2
- euroeval/benchmark_modules/vllm.py +33 -13
- euroeval/benchmarker.py +2 -2
- euroeval/constants.py +7 -1
- euroeval/data_loading.py +2 -1
- euroeval/dataset_configs.py +172 -1
- euroeval/task_utils/token_classification.py +3 -9
- euroeval/utils.py +1 -0
- {euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/METADATA +22 -7
- {euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/RECORD +16 -16
- {euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/WHEEL +0 -0
- {euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -14,6 +14,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
+logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)

 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")

@@ -66,6 +67,16 @@ os.environ["OMP_NUM_THREADS"] = "1"
 os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"


+# Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
+# https://github.com/vllm-project/vllm/issues/6152 for more
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+# Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
+# but XGrammar does not support having a maximal amount of elements in lists
+os.environ["VLLM_USE_V1"] = "0"
+
+
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):

euroeval/benchmark_config_factory.py
CHANGED
@@ -12,7 +12,7 @@ from .dataset_configs import get_all_dataset_configs
 from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
-from .tasks import get_all_tasks
+from .tasks import SPEED, get_all_tasks
 from .utils import log_once

 if t.TYPE_CHECKING:

@@ -294,7 +294,7 @@ def prepare_tasks_and_datasets(
 # Create the list of dataset tasks
 try:
 if task is None:
-tasks =
+tasks = [t for t in task_mapping.values() if t != SPEED]
 elif isinstance(task, str):
 tasks = [task_mapping[task]]
 else:

euroeval/benchmark_modules/hf.py
CHANGED
@@ -224,8 +224,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 "max_position_embeddings",
 "max_sequence_length",
 "model_max_length",
-"sliding_window",
-"sliding_window_size",
 "n_positions",
 ]
 for candidate_config_max_length in candidate_config_max_lengths:

@@ -804,7 +802,7 @@ def get_model_repo_info(
 generative_class_names = [
 class_name
 for tag in GENERATIVE_PIPELINE_TAGS
-for class_name in TASK_MAPPING
+for class_name in TASK_MAPPING.get(tag, dict()).values()
 ]
 if class_names is not None and any(
 class_name in generative_class_names for class_name in class_names

@@ -1023,6 +1021,7 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedM
 """
 # Get the models' token type embedding children, if they exist
 children = get_children_of_module(name="model", module=model)
+assert isinstance(children, dict)

 # If the model has token type embeddings then get them
 if children:

euroeval/benchmark_modules/litellm.py
CHANGED
@@ -12,6 +12,7 @@ from functools import cached_property, partial
 from time import sleep

 import litellm
+import ollama
 from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub.errors import (

@@ -31,6 +32,7 @@ from litellm.exceptions import (
 )
 from litellm.types.utils import ModelResponse
 from requests.exceptions import RequestException
+from tqdm.auto import tqdm
 from transformers import Trainer

 from ..constants import (

@@ -39,7 +41,13 @@ from ..constants import (
 TASK_GROUPS_USING_LOGPROBS,
 TASKS_USING_JSON,
 )
-from ..data_models import
+from ..data_models import (
+BenchmarkConfig,
+DatasetConfig,
+GenerativeModelOutput,
+ModelConfig,
+Task,
+)
 from ..enums import (
 BatchingPreference,
 GenerativeType,

@@ -49,6 +57,7 @@ from ..enums import (
 )
 from ..exceptions import (
 InvalidBenchmark,
+InvalidModel,
 NeedsAdditionalArgument,
 NeedsEnvironmentVariable,
 NeedsExtraInstalled,

@@ -60,7 +69,7 @@ from ..task_utils import (
 token_classification,
 )
 from ..types import ExtractLabelsFunction
-from ..utils import create_model_cache_dir
+from ..utils import create_model_cache_dir, log_once
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer

@@ -136,6 +145,34 @@ class LiteLLMModel(BenchmarkModule):
 batching_preference = BatchingPreference.SINGLE_SAMPLE
 high_priority = False

+def __init__(
+self,
+model_config: ModelConfig,
+dataset_config: DatasetConfig,
+benchmark_config: BenchmarkConfig,
+) -> None:
+"""Initialise the model.
+
+Args:
+model_config:
+The model configuration.
+dataset_config:
+The dataset configuration.
+benchmark_config:
+The benchmark configuration.
+"""
+# Detect whether the model is an Ollama model, as we need to extract metadata
+# differently for these models
+self.is_ollama = model_config.model_id.startswith(
+"ollama/"
+) or model_config.model_id.startswith("ollama_chat/")
+
+super().__init__(
+model_config=model_config,
+dataset_config=dataset_config,
+benchmark_config=benchmark_config,
+)
+
 @property
 def generative_type(self) -> GenerativeType | None:
 """Get the generative type of the model.

@@ -269,10 +306,24 @@ class LiteLLMModel(BenchmarkModule):
 Returns:
 The number of parameters in the model.
 """
+# Start by trying out the regex mapping, and use the value if it matches
 for key, value in NUM_PARAMS_MAPPING.items():
 if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
 return value

+# If it is an Ollama model then we can get the number of parameters from the
+# Ollama Python SDK
+if self.is_ollama:
+ollama_model_id = self.model_config.model_id.split("/")[-1]
+model_info = ollama.show(ollama_model_id).modelinfo
+if model_info is not None:
+num_params = model_info.get("general.parameter_count")
+if num_params is not None:
+return int(num_params)
+
+# If it is a model accessed through the Hugging Face inference API then we can
+# get the number of parameters from the Hugging Face model configuration from
+# the Hugging Face Hub
 if self.model_config.model_id.startswith("huggingface/"):
 model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
 if HuggingFaceEncoderModel.model_exists(

@@ -329,10 +380,14 @@ class LiteLLMModel(BenchmarkModule):
 Returns:
 The vocabulary size of the model.
 """
+# Start by trying out the regex mapping, and use the value if it matches
 for key, value in VOCAB_SIZE_MAPPING.items():
 if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
 return value

+# If it is a model accessed through the Hugging Face inference API then we can
+# get the vocabulary size from the Hugging Face model configuration from the
+# Hugging Face Hub
 if self.model_config.model_id.startswith("huggingface/"):
 model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
 if HuggingFaceEncoderModel.model_exists(

@@ -379,10 +434,40 @@ class LiteLLMModel(BenchmarkModule):
 Returns:
 The maximum length of the model.
 """
+# Start by trying out the regex mapping, and use the value if it matches
 for key, value in MODEL_MAX_LENGTH_MAPPING.items():
 if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
 return value

+# If it is an Ollama model then we can get the maximum length from the Ollama
+# Python SDK
+if self.is_ollama:
+ollama_model_id = self.model_config.model_id.split("/")[-1]
+model_info = ollama.show(ollama_model_id).modelinfo
+if model_info is not None:
+context_length_keys = [
+key for key in model_info.keys() if "context_length" in key.lower()
+]
+if context_length_keys:
+context_length = model_info[context_length_keys[0]]
+if context_length is not None:
+log_once(
+f"Detected context length key {context_length_keys[0]!r} "
+f"for Ollama model {ollama_model_id!r}",
+level=logging.DEBUG,
+)
+return int(context_length)
+else:
+log_once(
+f"Tried to get the maximum length of the Ollama model "
+f"{ollama_model_id!r}, but could not find a context length. "
+f"The model info was {model_info}. Returning -1",
+level=logging.DEBUG,
+)
+
+# If it is a model accessed through the Hugging Face inference API then we can
+# get the maximum length from the Hugging Face model configuration from the
+# Hugging Face Hub
 if self.model_config.model_id.startswith("huggingface/"):
 model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
 if HuggingFaceEncoderModel.model_exists(

@@ -523,6 +608,43 @@ class LiteLLMModel(BenchmarkModule):
 if model_id in litellm.model_list:
 return True

+# If it is an Ollama model then try to download it
+if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
+ollama_model_id = model_id.split("/")[-1]
+downloaded_ollama_models: list[str] = [
+model_obj.model
+for model_obj in ollama.list().models
+if model_obj.model is not None
+]
+if ollama_model_id not in downloaded_ollama_models:
+try:
+response = ollama.pull(model=ollama_model_id, stream=True)
+with tqdm(
+desc=f"Downloading {ollama_model_id}",
+unit_scale=True,
+unit="B",
+leave=False,
+) as pbar:
+for status in response:
+if status.total is not None:
+pbar.total = status.total
+if status.completed is not None:
+pbar.update(status.completed - pbar.n)
+except ollama.ResponseError as e:
+if "file does not exist" in str(e).lower():
+return False
+else:
+raise InvalidModel(
+f"Failed to download Ollama model {ollama_model_id}. The "
+f"error message was: {e}"
+)
+else:
+log_once(
+f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
+"download.",
+level=logging.DEBUG,
+)
+
 num_attempts = 10
 for _ in range(num_attempts):
 try:

euroeval/benchmark_modules/vllm.py
CHANGED
@@ -73,7 +73,6 @@ from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_conf
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
 from vllm import LLM, RequestOutput, SamplingParams
 from vllm.lora.request import LoRARequest
-from vllm.sampling_params import GuidedDecodingParams

 try:
 from vllm.model_executor.parallel_utils.parallel_state import (

@@ -82,6 +81,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
 except ImportError:
 from vllm.distributed.parallel_state import destroy_model_parallel

+if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
+from outlines.models.vllm import adapt_tokenizer
+from outlines.processors import JSONLogitsProcessor
+
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
 import ray

@@ -319,12 +322,18 @@ class VLLMModel(HuggingFaceEncoderModel):
 for tag_name in ner_tag_names
 }
 pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-
-
-
+logits_processor = JSONLogitsProcessor(
+schema=pydantic_class,
+tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
+whitespace_pattern=r" ?",
+)
+log_once(
+"Using structured generation with the schema "
+f"{pydantic_class.model_json_schema()}",
+level=logging.DEBUG,
 )
 else:
-
+logits_processor = None

 # Define the parameters used for vLLM generation
 max_tokens: int = (

@@ -337,7 +346,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
 temperature=0.0,
 stop=[stop_token for stop_token in stop_tokens if stop_token],
-
+logits_processors=[logits_processor] if logits_processor else None,
 )

 # If any of the prompts are empty then we need to replace them with a BOS token

@@ -881,8 +890,6 @@ def load_model_and_tokenizer(
 "max_position_embeddings",
 "max_sequence_length",
 "model_max_length",
-"sliding_window",
-"sliding_window_size",
 "n_positions",
 ]
 true_max_model_len_candidates: list[int] = list()

@@ -1087,7 +1094,8 @@ def get_end_of_reasoning_token_id(
 """Get the end of reasoning token ID for a generative model.

 This assumes that the reasoning token is of the form <X> and that the end of
-reasoning token is </X> (for X being any string without spaces).
+reasoning token is </X> (for X being any string without spaces). We disallow the
+reasoning token to be the same as the beginning-of-sentence token.

 Args:
 model:

@@ -1106,6 +1114,7 @@ def get_end_of_reasoning_token_id(
 add_generation_prompt=True,
 tokenize=False,
 )
+assert isinstance(prompt, str)

 # Generate a completion and remove the BOS token from it, to not confuse it with the
 # potential reasoning token

@@ -1119,11 +1128,18 @@ def get_end_of_reasoning_token_id(
 .text
 )
 if tokenizer.bos_token is not None:
-
+if isinstance(tokenizer.bos_token, str):
+prompt = prompt.replace(tokenizer.bos_token, "").strip()
+completion = completion.replace(tokenizer.bos_token, "").strip()
+elif isinstance(tokenizer.bos_token, list):
+for bos_token in tokenizer.bos_token:
+prompt = prompt.replace(bos_token, "").strip()
+completion = completion.replace(bos_token, "").strip()

 # If it doesn't contain a reasoning token, we can't find the end of reasoning token
-
-
+prompt_match = re.search(pattern=r"<\w+>", string=prompt)
+completion_match = re.search(pattern=r"<\w+>", string=completion)
+if completion_match is None and prompt_match is None:
 log_once(
 message=(
 "Could not find a reasoning token, so assuming the model is not a "

@@ -1135,7 +1151,11 @@ def get_end_of_reasoning_token_id(

 # Check that the found reasoning token and its associated end-of-reasoning tokens
 # are both special tokens
-
+elif completion_match is not None:
+reasoning_token = completion_match.group()
+else:
+assert prompt_match is not None
+reasoning_token = prompt_match.group()
 end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"
 special_tokens = [
 decoder_token.content

euroeval/benchmarker.py
CHANGED
@@ -709,7 +709,7 @@ class Benchmarker:

 if dataset_config.task == SPEED:
 scores = benchmark_speed(
-model=model, benchmark_config=
+model=model, benchmark_config=benchmark_config
 )

 else:

@@ -727,7 +727,7 @@ class Benchmarker:
 datasets=prepared_datasets,
 model_config=model_config,
 dataset_config=dataset_config,
-benchmark_config=
+benchmark_config=benchmark_config,
 )
 else:
 scores = finetune(

euroeval/constants.py
CHANGED
@@ -13,7 +13,13 @@ REASONING_MAX_TOKENS = 8_192


 # The Hugging Face Hub pipeline tags used to classify models as generative
-GENERATIVE_PIPELINE_TAGS = [
+GENERATIVE_PIPELINE_TAGS = [
+"text-generation",
+"text2text-generation",
+"image-text-to-text",
+"audio-text-to-text",
+"video-text-to-text",
+]


 # Used to disallow non-generative models to be evaluated on these task groups

euroeval/data_loading.py
CHANGED
@@ -8,6 +8,7 @@ from datasets import Dataset, DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
+from requests import ReadTimeout

 from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark

@@ -47,7 +48,7 @@ def load_data(
 token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
 )
 break
-except (FileNotFoundError, DatasetsError, ConnectionError):
+except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
 logger.warning(
 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
 )

euroeval/dataset_configs.py
CHANGED
@@ -1,7 +1,22 @@
 """All dataset configurations used in EuroEval."""

 from .data_models import DatasetConfig
-from .languages import
+from .languages import (
+DA,
+DE,
+EN,
+ES,
+FO,
+FR,
+IS,
+IT,
+NB,
+NL,
+NN,
+NO,
+SV,
+get_all_languages,
+)
 from .tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SPEED, SUMM


@@ -265,6 +280,25 @@ SENTIPOLC_CONFIG = DatasetConfig(
 )


+SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
+name="sentiment-headlines-es",
+pretty_name="the truncated version of the Spanish sentiment headlines dataset",
+huggingface_id="EuroEval/sentiment-headlines-es",
+task=SENT,
+languages=[ES],
+labels=["negative", "neutral", "positive"],
+prompt_prefix="Lo siguiente son reseñas y su sentimiento, que puede ser "
+"'positivo', 'neutral' o 'negativo'.",
+prompt_template="Texto: {text}\nSentimiento: {label}",
+prompt_label_mapping=dict(
+positive="positivo", neutral="neutral", negative="negativo"
+),
+instruction_prompt="Texto: {text}\n\nClasifica el sentimiento de la reseña. "
+"Responde con 'positivo', 'neutral' o 'negativo', y nada más.",
+num_few_shot_examples=12,
+max_generated_tokens=5,
+)
+
 ### NAMED ENTITY RECOGNITION DATASETS ###

 SUC3_CONFIG = DatasetConfig(

@@ -817,6 +851,45 @@ MULTINERD_IT_CONFIG = DatasetConfig(
 max_generated_tokens=128,
 )

+CONLL_ES_CONFIG = DatasetConfig(
+name="conll-es",
+pretty_name="the Spanish part of the truncated version of the named entity "
+"recognition dataset CoNLL 2002",
+huggingface_id="EuroEval/conll-es-mini",
+task=NER,
+languages=[ES],
+labels=[
+"o",
+"b-loc",
+"i-loc",
+"b-org",
+"i-org",
+"b-per",
+"i-per",
+"b-misc",
+"i-misc",
+],
+prompt_prefix="Lo siguiente son oraciones y diccionarios JSON con las entidades "
+"nombradas que aparecen en la oración dada.",
+prompt_template="Oración: {text}\nEntidades nombradas: {label}",
+prompt_label_mapping={
+"b-per": "persona",
+"i-per": "persona",
+"b-loc": "lugar",
+"i-loc": "lugar",
+"b-org": "organización",
+"i-org": "organización",
+"b-misc": "misceláneo",
+"i-misc": "misceláneo",
+},
+instruction_prompt="Oración: {text}\n\nIdentifica las entidades nombradas en la "
+"oración. Debes producir esto como un diccionario JSON con las claves 'persona', "
+"'lugar', 'organización' y 'misceláneo'. Los valores deben ser listas de las "
+"entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
+num_few_shot_examples=8,
+max_generated_tokens=128,
+unofficial=True,
+)

 ### LINGUISTIC ACCEPTABILITY DATASETS ###


@@ -1029,6 +1102,22 @@ SCALA_IT_CONFIG = DatasetConfig(
 max_generated_tokens=5,
 )

+SCALA_ES_CONFIG = DatasetConfig(
+name="scala-es",
+pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
+huggingface_id="EuroEval/scala-es",
+task=LA,
+languages=[ES],
+labels=["incorrect", "correct"],
+prompt_prefix="Lo siguiente son textos y si son gramaticalmente correctos.",
+prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
+prompt_label_mapping=dict(correct="sí", incorrect="no"),
+instruction_prompt="Texto: {text}\n\nDetermina si el texto es gramaticalmente "
+"correcto o no. Responde con 'sí' si el texto es correcto, y 'no' si no lo es.",
+num_few_shot_examples=12,
+max_generated_tokens=5,
+)
+
 DUTCH_COLA_CONFIG = DatasetConfig(
 name="dutch-cola",
 pretty_name="the truncated version of the Dutch linguistic acceptability dataset "

@@ -1326,6 +1415,41 @@ FQUAD_CONFIG = DatasetConfig(
 max_generated_tokens=32,
 )

+XQUAD_ES_CONFIG = DatasetConfig(
+name="xquad-es",
+pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
+huggingface_id="EuroEval/xquad-es",
+task=RC,
+languages=[ES],
+labels=["start_positions", "end_positions"],
+prompt_prefix="A continuación se presentan textos con sus preguntas y respuestas "
+"correspondientes.",
+prompt_template="Texto: {text}\nPregunta: {question}\nRespuesta en máximo 3 "
+"palabras: {label}",
+instruction_prompt="Texto: {text}\n\nResponda la siguiente pregunta sobre el "
+"texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
+num_few_shot_examples=4,
+max_generated_tokens=32,
+unofficial=True,
+)
+
+MLQA_ES_CONFIG = DatasetConfig(
+name="mlqa-es",
+pretty_name="the Spanish version of the MLQA reading comprehension dataset",
+huggingface_id="EuroEval/mlqa-es",
+task=RC,
+languages=[ES],
+labels=["start_positions", "end_positions"],
+prompt_prefix="A continuación se presentan textos con sus preguntas y respuestas "
+"correspondientes.",
+prompt_template="Texto: {text}\nPregunta: {question}\nRespuesta en máximo 3 "
+"palabras: {label}",
+instruction_prompt="Texto: {text}\n\nResponda la siguiente pregunta sobre el "
+"texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
+num_few_shot_examples=4,
+max_generated_tokens=32,
+)
+
 ### SUMMARIZATION DATASETS ###

 NORDJYLLAND_NEWS_CONFIG = DatasetConfig(

@@ -1358,6 +1482,19 @@ MLSUM_CONFIG = DatasetConfig(
 max_generated_tokens=256,
 )

+MLSUM_ES_CONFIG = DatasetConfig(
+name="mlsum-es",
+pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+huggingface_id="EuroEval/mlsum-es-mini",
+task=SUMM,
+languages=[ES],
+prompt_prefix="Los siguientes son artículos de noticias con sus resúmenes.",
+prompt_template="Artículo: {text}\nResumen: {target_text}",
+instruction_prompt="Artículo: {text}\n\nEscribe un resumen del artículo anterior.",
+num_few_shot_examples=1,
+max_generated_tokens=256,
+)
+
 RRN_CONFIG = DatasetConfig(
 name="rrn",
 pretty_name="the truncated version of the Icelandic summarisation dataset "

@@ -1745,6 +1882,23 @@ MMLU_IT_CONFIG = DatasetConfig(
 max_generated_tokens=5,
 )

+MMLU_ES_CONFIG = DatasetConfig(
+name="mmlu-es",
+pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
+"translated from the English MMLU dataset",
+huggingface_id="EuroEval/mmlu-es-mini",
+task=KNOW,
+languages=[ES],
+labels=["a", "b", "c", "d"],
+prompt_prefix="Las siguientes son preguntas de opción múltiple (con respuestas).",
+prompt_template="Pregunta: {text}\nRespuesta: {label}",
+prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+instruction_prompt="Pregunta: {text}\n\nResponda la pregunta anterior usando "
+"solo 'a', 'b', 'c' o 'd', y nada más.",
+num_few_shot_examples=5,
+max_generated_tokens=5,
+)
+
 ARC_DA_CONFIG = DatasetConfig(
 name="arc-da",
 pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "

@@ -1870,6 +2024,23 @@ ARC_CONFIG = DatasetConfig(
 unofficial=True,
 )

+HELLASWAG_ES_CONFIG = DatasetConfig(
+name="hellaswag-es",
+pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
+"HellaSwag-es, translated from the English HellaSwag dataset",
+huggingface_id="EuroEval/hellaswag-es-mini",
+task=COMMON_SENSE,
+languages=[ES],
+labels=["a", "b", "c", "d"],
+prompt_prefix="Las siguientes son preguntas de opción múltiple (con respuestas).",
+prompt_template="Pregunta: {text}\nRespuesta: {label}",
+prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+instruction_prompt="Pregunta: {text}\n\nResponda la pregunta anterior usando solo "
+"'a', 'b', 'c' o 'd', y nada más.",
+num_few_shot_examples=5,
+max_generated_tokens=5,
+)
+
 # TODO: Faroese knowledge


euroeval/task_utils/token_classification.py
CHANGED
@@ -1,18 +1,18 @@
 """Utility functions related to the token-classification task group."""

-import importlib.util
 import logging
 import re
 import typing as t
 from copy import deepcopy

+import demjson3
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
 from transformers import PreTrainedTokenizer

 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
-from ..exceptions import InvalidBenchmark
+from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:

@@ -20,9 +20,6 @@ if t.TYPE_CHECKING:

 from ..types import Labels, Predictions

-if importlib.util.find_spec("demjson3") is not None:
-import demjson3
-

 logger = logging.getLogger("euroeval")

@@ -201,13 +198,10 @@ def extract_labels_from_generation(
 Returns:
 The predicted labels.
 """
-if importlib.util.find_spec("demjson3") is None:
-raise NeedsExtraInstalled(extra="generative")
-
 raw_predictions = model_output.sequences

 # Attempt to extract the JSON dictionary from the predictions
-json_regex = r"\{
+json_regex = r"\{[^{}]+?\}"
 json_matches = [
 re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
 or raw_prediction

euroeval/utils.py
CHANGED
@@ -141,6 +141,7 @@ def block_terminal_output() -> None:
 logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
 logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
 logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
+logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
 logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)

{euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.4.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues

@@ -33,12 +33,14 @@ Requires-Dist: accelerate>=0.34.2
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
 Requires-Dist: datasets>=2.15.0
+Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.24.0
 Requires-Dist: levenshtein>=0.24.0
 Requires-Dist: litellm>=1.61.13
 Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
+Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0

@@ -52,19 +54,19 @@ Requires-Dist: seqeval>=1.2.2
 Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
-Requires-Dist: torch>=2.
-Requires-Dist: transformers>=4.
+Requires-Dist: torch>=2.6.0
+Requires-Dist: transformers>=4.50.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist: demjson3>=3.0.6; extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
-Requires-Dist:
+Requires-Dist: outlines>=0.1.11; extra == 'all'
+Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: demjson3>=3.0.6; extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
+Requires-Dist: outlines>=0.1.11; extra == 'generative'
+Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test

@@ -202,6 +204,19 @@ argument. This could for instance be `--model <model-id> --task
 sentiment-classification`.


+### Reproducing the datasets
+All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+
+```shell
+$ uv run src/scripts/<name-of-script>.py
+```
+
+Replace <name-of-script> with the specific script you wish to execute, e.g.,
+
+```shell
+$ uv run src/scripts/create_allocine.py
+```
+
 ## Special Thanks :pray:
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
 models on the leaderboards.

{euroeval-15.3.1.dist-info → euroeval-15.4.0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-euroeval/__init__.py,sha256=
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/__init__.py,sha256=l3V3ybiCj0I193jvn8wS9VK4UEc9ajiOq4SojChH6Xs,2615
+euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
+euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=
-euroeval/data_loading.py,sha256=
+euroeval/constants.py,sha256=9iXe26WAigL9RYob3PhsB5c0dr11wCeRxrEfm_ssynM,1562
+euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
 euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
-euroeval/dataset_configs.py,sha256=
+euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,90627
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701

@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
 euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=MkiVI-0KmK4ilKJTTfYAynKaPDOzW1WjyRdZsYmnoIg,18803
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
 euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
-euroeval/benchmark_modules/hf.py,sha256=
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=YeaaP_YGAlKG5G1KFq0bFOFWv42eH_zfmhuW3FAXjAA,41726
+euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
+euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3cKu4Oo,44519
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
 euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
-euroeval/task_utils/token_classification.py,sha256=
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
+euroeval-15.4.0.dist-info/METADATA,sha256=HfNWsANdb8TJAyK__QPBhs7O5qsQp9G_gPlhVVNuK9c,10724
+euroeval-15.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.0.dist-info/RECORD,,

File without changes
|
|
File without changes
|
|
File without changes
|