EuroEval 15.4.2__py3-none-any.whl → 15.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/utils.py
CHANGED
|
@@ -12,7 +12,6 @@ import typing as t
|
|
|
12
12
|
import warnings
|
|
13
13
|
from functools import cache
|
|
14
14
|
from pathlib import Path
|
|
15
|
-
from types import TracebackType
|
|
16
15
|
|
|
17
16
|
import litellm
|
|
18
17
|
import numpy as np
|
|
@@ -20,15 +19,16 @@ import requests
|
|
|
20
19
|
import torch
|
|
21
20
|
from datasets.utils import disable_progress_bar
|
|
22
21
|
from requests.exceptions import RequestException
|
|
23
|
-
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
|
|
24
22
|
from transformers import logging as tf_logging
|
|
25
23
|
|
|
26
|
-
from .exceptions import InvalidModel, NaNValueInModelOutput
|
|
24
|
+
from .exceptions import NaNValueInModelOutput
|
|
27
25
|
|
|
28
26
|
if importlib.util.find_spec("ray") is not None:
|
|
29
27
|
import ray
|
|
30
28
|
|
|
31
29
|
if t.TYPE_CHECKING:
|
|
30
|
+
from types import TracebackType
|
|
31
|
+
|
|
32
32
|
from .types import Predictions
|
|
33
33
|
|
|
34
34
|
|
|
@@ -116,6 +116,7 @@ def block_terminal_output() -> None:
|
|
|
116
116
|
logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
|
|
117
117
|
logging.getLogger("httpx").setLevel(logging.CRITICAL)
|
|
118
118
|
logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
|
|
119
|
+
logging.getLogger("ray._private.services").setLevel(logging.CRITICAL)
|
|
119
120
|
logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
|
|
120
121
|
logging.getLogger("accelerate").setLevel(logging.CRITICAL)
|
|
121
122
|
logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
|
|
@@ -193,19 +194,6 @@ def get_min_cuda_compute_capability() -> float | None:
|
|
|
193
194
|
return float(f"{major}.{minor}")
|
|
194
195
|
|
|
195
196
|
|
|
196
|
-
def kebab_to_pascal(kebab_string: str) -> str:
|
|
197
|
-
"""Converts a kebab-case string to PascalCase.
|
|
198
|
-
|
|
199
|
-
Args:
|
|
200
|
-
kebab_string:
|
|
201
|
-
The kebab-case string.
|
|
202
|
-
|
|
203
|
-
Returns:
|
|
204
|
-
The PascalCase string.
|
|
205
|
-
"""
|
|
206
|
-
return "".join(word.title() for word in kebab_string.split("-"))
|
|
207
|
-
|
|
208
|
-
|
|
209
197
|
def internet_connection_available() -> bool:
|
|
210
198
|
"""Checks if internet connection is available by pinging google.com.
|
|
211
199
|
|
|
@@ -219,58 +207,6 @@ def internet_connection_available() -> bool:
|
|
|
219
207
|
return False
|
|
220
208
|
|
|
221
209
|
|
|
222
|
-
def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
|
|
223
|
-
"""Get the special token metadata for a tokenizer.
|
|
224
|
-
|
|
225
|
-
Args:
|
|
226
|
-
tokenizer:
|
|
227
|
-
The tokenizer.
|
|
228
|
-
|
|
229
|
-
Returns:
|
|
230
|
-
The special token metadata.
|
|
231
|
-
"""
|
|
232
|
-
# Create some test input IDs, to check if the tokenizer is adding special tokens
|
|
233
|
-
test_input_ids = tokenizer("Test").input_ids
|
|
234
|
-
|
|
235
|
-
# Extract the CLS token IDs from the tokenizer, if it's using them
|
|
236
|
-
has_cls_token = True
|
|
237
|
-
if tokenizer.cls_token_id in test_input_ids:
|
|
238
|
-
cls_token_id = tokenizer.cls_token_id
|
|
239
|
-
cls_token = tokenizer.cls_token
|
|
240
|
-
elif tokenizer.bos_token_id in test_input_ids:
|
|
241
|
-
cls_token_id = tokenizer.bos_token_id
|
|
242
|
-
cls_token = tokenizer.bos_token
|
|
243
|
-
elif tokenizer.cls_token is not None:
|
|
244
|
-
cls_token_id = tokenizer.cls_token_id
|
|
245
|
-
cls_token = tokenizer.cls_token
|
|
246
|
-
has_cls_token = False
|
|
247
|
-
else:
|
|
248
|
-
cls_token_id = tokenizer.bos_token_id
|
|
249
|
-
cls_token = tokenizer.bos_token
|
|
250
|
-
has_cls_token = False
|
|
251
|
-
|
|
252
|
-
# Extract the SEP token IDs from the tokenizer, if it's using them
|
|
253
|
-
has_sep_token = True
|
|
254
|
-
if tokenizer.sep_token_id in test_input_ids:
|
|
255
|
-
sep_token = tokenizer.sep_token
|
|
256
|
-
elif tokenizer.eos_token_id in test_input_ids:
|
|
257
|
-
sep_token = tokenizer.eos_token
|
|
258
|
-
elif tokenizer.sep_token is not None:
|
|
259
|
-
sep_token = tokenizer.sep_token
|
|
260
|
-
has_sep_token = False
|
|
261
|
-
else:
|
|
262
|
-
sep_token = tokenizer.eos_token
|
|
263
|
-
has_sep_token = False
|
|
264
|
-
|
|
265
|
-
return dict(
|
|
266
|
-
cls_token_id=cls_token_id,
|
|
267
|
-
cls_token=cls_token,
|
|
268
|
-
sep_token=sep_token,
|
|
269
|
-
has_cls_token=has_cls_token,
|
|
270
|
-
has_sep_token=has_sep_token,
|
|
271
|
-
)
|
|
272
|
-
|
|
273
|
-
|
|
274
210
|
class HiddenPrints:
|
|
275
211
|
"""Context manager which removes all terminal output."""
|
|
276
212
|
|
|
@@ -285,7 +221,7 @@ class HiddenPrints:
|
|
|
285
221
|
self,
|
|
286
222
|
exc_type: t.Type[BaseException],
|
|
287
223
|
exc_val: BaseException,
|
|
288
|
-
exc_tb: TracebackType,
|
|
224
|
+
exc_tb: "TracebackType",
|
|
289
225
|
) -> None:
|
|
290
226
|
"""Exit the context manager."""
|
|
291
227
|
sys.stdout.close()
|
|
@@ -316,191 +252,6 @@ def raise_if_model_output_contains_nan_values(model_output: "Predictions") -> No
|
|
|
316
252
|
raise NaNValueInModelOutput()
|
|
317
253
|
|
|
318
254
|
|
|
319
|
-
def should_prompts_be_stripped(
|
|
320
|
-
labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
|
|
321
|
-
) -> bool:
|
|
322
|
-
"""Determine if we should strip the prompts for few-shot evaluation.
|
|
323
|
-
|
|
324
|
-
This is the case if the tokenizer needs to include the space as part of the label
|
|
325
|
-
token. The strategy is thus to tokenize a label with a preceeding colon (as in the
|
|
326
|
-
prompts), i.e., ": positive", and check if the tokenization starts with the tokens
|
|
327
|
-
of ": ". If this is the case, then we should not strip the prompts, since the
|
|
328
|
-
tokenizer produces the whitespace token separately.
|
|
329
|
-
|
|
330
|
-
Args:
|
|
331
|
-
labels_to_be_generated:
|
|
332
|
-
The labels that are to be generated.
|
|
333
|
-
tokenizer:
|
|
334
|
-
The tokenizer used to tokenize the labels.
|
|
335
|
-
|
|
336
|
-
Returns:
|
|
337
|
-
Whether we should strip the prompts.
|
|
338
|
-
"""
|
|
339
|
-
strip_prompts = True
|
|
340
|
-
for label in labels_to_be_generated:
|
|
341
|
-
colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
|
|
342
|
-
label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
|
|
343
|
-
|
|
344
|
-
if isinstance(colon_tokens, torch.Tensor):
|
|
345
|
-
colon_tokens = list(colon_tokens.squeeze(0))
|
|
346
|
-
if isinstance(label_tokens, torch.Tensor):
|
|
347
|
-
label_tokens = list(label_tokens.squeeze(0))
|
|
348
|
-
|
|
349
|
-
label_tokens_start_with_colon_tokens = (
|
|
350
|
-
label_tokens[: len(colon_tokens)] == colon_tokens
|
|
351
|
-
)
|
|
352
|
-
if label_tokens_start_with_colon_tokens:
|
|
353
|
-
strip_prompts = False
|
|
354
|
-
|
|
355
|
-
return strip_prompts
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
# TODO: This is currently not used - maybe remove.
|
|
359
|
-
def should_prefix_space_be_added_to_labels(
|
|
360
|
-
labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
|
|
361
|
-
) -> bool:
|
|
362
|
-
"""Determine if we should add a prefix space to the labels.
|
|
363
|
-
|
|
364
|
-
This is the case if the prompts are stripped and the tokenizer doesn't
|
|
365
|
-
automatically add prefix whitespaces to the labels.
|
|
366
|
-
|
|
367
|
-
Args:
|
|
368
|
-
labels_to_be_generated:
|
|
369
|
-
The labels that are to be generated.
|
|
370
|
-
tokenizer:
|
|
371
|
-
The tokenizer used to tokenize the labels.
|
|
372
|
-
|
|
373
|
-
Returns:
|
|
374
|
-
Whether we should add a prefix space to the labels.
|
|
375
|
-
"""
|
|
376
|
-
if not should_prompts_be_stripped(
|
|
377
|
-
labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
|
|
378
|
-
):
|
|
379
|
-
return False
|
|
380
|
-
|
|
381
|
-
whitespace_token = tokenizer.convert_ids_to_tokens(
|
|
382
|
-
ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
|
|
383
|
-
)[0]
|
|
384
|
-
|
|
385
|
-
add_prefix_space = True
|
|
386
|
-
for label in labels_to_be_generated:
|
|
387
|
-
label_tokens = tokenizer(label, add_special_tokens=False).input_ids
|
|
388
|
-
if isinstance(label_tokens, torch.Tensor):
|
|
389
|
-
label_tokens = list(label_tokens.squeeze(0))
|
|
390
|
-
first_label_token: int = int(label_tokens[0])
|
|
391
|
-
first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
|
|
392
|
-
has_prefix_space = first_character_of_label == whitespace_token
|
|
393
|
-
if has_prefix_space:
|
|
394
|
-
add_prefix_space = False
|
|
395
|
-
break
|
|
396
|
-
|
|
397
|
-
return add_prefix_space
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
|
|
401
|
-
"""Get the beginning-of-sequence token from a tokenizer.
|
|
402
|
-
|
|
403
|
-
Args:
|
|
404
|
-
tokenizer:
|
|
405
|
-
The tokenizer.
|
|
406
|
-
|
|
407
|
-
Returns:
|
|
408
|
-
A pair (token, token_id) representing the beginning-of-sequence token and its
|
|
409
|
-
token ID.
|
|
410
|
-
"""
|
|
411
|
-
if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
|
|
412
|
-
return tokenizer.bos_token, tokenizer.bos_token_id
|
|
413
|
-
|
|
414
|
-
vocab: dict[str, int] = tokenizer.get_vocab()
|
|
415
|
-
|
|
416
|
-
candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
|
|
417
|
-
for candidate_bos_token in candidate_bos_tokens:
|
|
418
|
-
if candidate_bos_token in vocab:
|
|
419
|
-
bos_token = candidate_bos_token
|
|
420
|
-
bos_token_id = vocab[bos_token]
|
|
421
|
-
break
|
|
422
|
-
else:
|
|
423
|
-
raise InvalidModel(
|
|
424
|
-
"The model does not have a beginning-of-sequence token. Please ensure that "
|
|
425
|
-
"this has been set in the tokenizer's configuration."
|
|
426
|
-
)
|
|
427
|
-
|
|
428
|
-
return bos_token, bos_token_id
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
|
|
432
|
-
"""Get the end-of-sequence token from a tokenizer.
|
|
433
|
-
|
|
434
|
-
Args:
|
|
435
|
-
tokenizer:
|
|
436
|
-
The tokenizer.
|
|
437
|
-
|
|
438
|
-
Returns:
|
|
439
|
-
A pair (token, token_id) representing the end-of-sequence token and its token
|
|
440
|
-
ID.
|
|
441
|
-
"""
|
|
442
|
-
if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
|
|
443
|
-
return tokenizer.eos_token, tokenizer.eos_token_id
|
|
444
|
-
|
|
445
|
-
vocab: dict[str, int] = tokenizer.get_vocab()
|
|
446
|
-
|
|
447
|
-
candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
|
|
448
|
-
for candidate_eos_token in candidate_eos_tokens:
|
|
449
|
-
if candidate_eos_token in vocab:
|
|
450
|
-
eos_token = candidate_eos_token
|
|
451
|
-
eos_token_id = vocab[eos_token]
|
|
452
|
-
break
|
|
453
|
-
else:
|
|
454
|
-
raise InvalidModel(
|
|
455
|
-
"The model does not have an end-of-sequence token. Please ensure that this "
|
|
456
|
-
"has been set in the tokenizer's configuration."
|
|
457
|
-
)
|
|
458
|
-
|
|
459
|
-
return eos_token, eos_token_id
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
|
|
463
|
-
"""Get the end token ID for chat models.
|
|
464
|
-
|
|
465
|
-
This is only relevant for tokenizers with a chat template.
|
|
466
|
-
|
|
467
|
-
Args:
|
|
468
|
-
tokenizer:
|
|
469
|
-
The tokenizer.
|
|
470
|
-
|
|
471
|
-
Returns:
|
|
472
|
-
The token IDs used to end chats, or None if the tokenizer does not have a chat
|
|
473
|
-
template.
|
|
474
|
-
|
|
475
|
-
Raises:
|
|
476
|
-
ValueError:
|
|
477
|
-
If the end-of-chat token could not be located.
|
|
478
|
-
"""
|
|
479
|
-
if tokenizer.chat_template is None:
|
|
480
|
-
return None
|
|
481
|
-
|
|
482
|
-
user_message: dict[t.Literal["role", "content"], str] = dict()
|
|
483
|
-
user_message["role"] = "user"
|
|
484
|
-
user_message["content"] = "X"
|
|
485
|
-
token_ids = tokenizer.apply_chat_template(conversation=[user_message])
|
|
486
|
-
assert isinstance(token_ids, list)
|
|
487
|
-
|
|
488
|
-
for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
|
|
489
|
-
token_id = tokenizer.convert_tokens_to_ids(token)
|
|
490
|
-
assert isinstance(token_id, int)
|
|
491
|
-
token = tokenizer.decode([token_id])
|
|
492
|
-
if "X" in token:
|
|
493
|
-
x_token_index = idx
|
|
494
|
-
break
|
|
495
|
-
else:
|
|
496
|
-
raise ValueError("Could not locate the end-of-chat token for the model.")
|
|
497
|
-
|
|
498
|
-
end_of_chat_tokens = token_ids[x_token_index + 1 :]
|
|
499
|
-
if len(end_of_chat_tokens) == 0:
|
|
500
|
-
return None
|
|
501
|
-
return end_of_chat_tokens
|
|
502
|
-
|
|
503
|
-
|
|
504
255
|
def scramble(text: str) -> str:
|
|
505
256
|
"""Scramble a string in a bijective manner.
|
|
506
257
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: EuroEval
|
|
3
|
-
Version: 15.4.2
|
|
3
|
+
Version: 15.6.0
|
|
4
4
|
Summary: The robust European language model benchmark.
|
|
5
5
|
Project-URL: Repository, https://github.com/EuroEval/EuroEval
|
|
6
6
|
Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
|
|
@@ -35,9 +35,9 @@ Requires-Dist: click>=8.1.3
|
|
|
35
35
|
Requires-Dist: datasets>=2.15.0
|
|
36
36
|
Requires-Dist: demjson3>=3.0.6
|
|
37
37
|
Requires-Dist: evaluate>=0.4.1
|
|
38
|
-
Requires-Dist: huggingface-hub>=0.
|
|
38
|
+
Requires-Dist: huggingface-hub>=0.30.1
|
|
39
39
|
Requires-Dist: levenshtein>=0.24.0
|
|
40
|
-
Requires-Dist: litellm>=1.
|
|
40
|
+
Requires-Dist: litellm>=1.63.0
|
|
41
41
|
Requires-Dist: more-itertools>=10.5.0
|
|
42
42
|
Requires-Dist: numpy<2.0.0,>=1.23.0
|
|
43
43
|
Requires-Dist: ollama>=0.4.7
|
|
@@ -56,18 +56,18 @@ Requires-Dist: setuptools>=75.8.2
|
|
|
56
56
|
Requires-Dist: tenacity>=9.0.0
|
|
57
57
|
Requires-Dist: termcolor>=2.0.0
|
|
58
58
|
Requires-Dist: torch>=2.6.0
|
|
59
|
-
Requires-Dist: transformers>=4.
|
|
59
|
+
Requires-Dist: transformers>=4.51.0
|
|
60
60
|
Provides-Extra: all
|
|
61
61
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
|
|
62
62
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
|
|
63
63
|
Requires-Dist: gradio>=4.26.0; extra == 'all'
|
|
64
64
|
Requires-Dist: outlines>=0.1.11; extra == 'all'
|
|
65
|
-
Requires-Dist: vllm
|
|
65
|
+
Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
|
|
66
66
|
Provides-Extra: generative
|
|
67
67
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
|
|
68
68
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
|
|
69
69
|
Requires-Dist: outlines>=0.1.11; extra == 'generative'
|
|
70
|
-
Requires-Dist: vllm
|
|
70
|
+
Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
|
|
71
71
|
Provides-Extra: human-evaluation
|
|
72
72
|
Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
|
|
73
73
|
Provides-Extra: test
|
|
@@ -89,7 +89,7 @@ ______________________________________________________________________
|
|
|
89
89
|
[](https://arxiv.org/abs/2406.13469)
|
|
90
90
|
[](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
|
|
91
91
|
[](https://github.com/EuroEval/EuroEval/commits/main)
|
|
92
|
-
[](https://github.com/EuroEval/EuroEval/tree/main/tests)
|
|
93
93
|
[](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
|
|
94
94
|
|
|
95
95
|
|
|
@@ -206,7 +206,9 @@ sentiment-classification`.
|
|
|
206
206
|
|
|
207
207
|
|
|
208
208
|
### Reproducing the datasets
|
|
209
|
-
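All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command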
|
|
209
|
+
All datasets used in this project are generated using the scripts located in the
|
|
210
|
+
[src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
|
|
211
|
+
with the following command
|
|
210
212
|
|
|
211
213
|
```shell
|
|
212
214
|
$ uv run src/scripts/<name-of-script>.py
|
|
@@ -218,7 +220,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
|
|
|
218
220
|
$ uv run src/scripts/create_allocine.py
|
|
219
221
|
```
|
|
220
222
|
|
|
221
|
-
## Special Thanks :pray:
|
|
223
|
+
## Contributors :pray:
|
|
224
|
+
|
|
225
|
+
A huge thank you to all the contributors who have helped make this project a success!
|
|
226
|
+
|
|
227
|
+
<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
|
|
228
|
+
<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
|
|
229
|
+
<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
|
|
230
|
+
<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
|
|
231
|
+
<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
|
|
232
|
+
<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
|
|
233
|
+
<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
|
|
234
|
+
<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
|
|
235
|
+
<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
|
|
236
|
+
<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
|
|
237
|
+
<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
|
|
238
|
+
<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
|
|
239
|
+
<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
|
|
240
|
+
|
|
241
|
+
### Special Thanks
|
|
242
|
+
- Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
|
|
243
|
+
[Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
|
|
222
244
|
- Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
|
|
223
245
|
models on the leaderboards.
|
|
224
246
|
- Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
|
|
2
|
+
euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
|
|
3
|
+
euroeval/benchmarker.py,sha256=7LVFr7zL7OeJPs7WVYwekNnEmiIKPXHydcbAkW99MUk,48080
|
|
4
|
+
euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
|
|
5
|
+
euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
|
|
6
|
+
euroeval/constants.py,sha256=t2mAT8tE3Dn2lXWHTnaFoaOIaUcdiBjJTASCt7nSdkg,1984
|
|
7
|
+
euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
|
|
8
|
+
euroeval/data_models.py,sha256=RjU7REmUMxSMeZfTeUNYb6XRlHMUri7Tk_zwexUOupU,22840
|
|
9
|
+
euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
|
|
10
|
+
euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
|
|
11
|
+
euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
|
|
12
|
+
euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
|
|
13
|
+
euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
|
|
14
|
+
euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
|
|
15
|
+
euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
|
|
16
|
+
euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
|
|
17
|
+
euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
|
|
18
|
+
euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
|
|
19
|
+
euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
|
|
20
|
+
euroeval/tasks.py,sha256=VVXFDcEM250KTGXd1pxQb8vwdia4ZJxgTUY5Kdsa-ik,7070
|
|
21
|
+
euroeval/tokenization_utils.py,sha256=PNuS-FTdVrL9TWNDGlq42MvUggKwmyYM0BnC5I37IO0,11876
|
|
22
|
+
euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
|
|
23
|
+
euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
|
|
24
|
+
euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
|
|
25
|
+
euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
|
|
26
|
+
euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
|
|
27
|
+
euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
|
|
28
|
+
euroeval/benchmark_modules/litellm.py,sha256=wohdi1WoeJ-JEdQLgg2q3JbZJA77XO7yGZaTRvbRU4o,47575
|
|
29
|
+
euroeval/benchmark_modules/vllm.py,sha256=FTpwal5WdrVsOpkjm_RXwf6-2PrNrrP1LO6BVGYb6GE,48086
|
|
30
|
+
euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
|
|
31
|
+
euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
|
|
32
|
+
euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
|
|
33
|
+
euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
|
|
34
|
+
euroeval/dataset_configs/faroese.py,sha256=9QYFtd3GqaFcyQjsmru_yvJuTjParyz8Ra_ekw_3xbA,1320
|
|
35
|
+
euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
|
|
36
|
+
euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
|
|
37
|
+
euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
|
|
38
|
+
euroeval/dataset_configs/italian.py,sha256=5yYMMBbxkfSDpLgJ9IH_pgkpzEp-74vMMvx-dT8x4WY,2345
|
|
39
|
+
euroeval/dataset_configs/norwegian.py,sha256=3kKhri5qWIiFwNSzNFHjNbRpkW1NSK_PUltAGQpxmAY,5172
|
|
40
|
+
euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
|
|
41
|
+
euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
|
|
42
|
+
euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
|
|
43
|
+
euroeval/prompt_templates/linguistic_acceptability.py,sha256=sx_WqLm7N6Thll6COUCCA0lXe9RMZ7WhoH6X498pixM,6232
|
|
44
|
+
euroeval/prompt_templates/multiple_choice.py,sha256=H0CDQPs_WzgSJ7oI_FBzHs0TOF0Na2qZYJLhDC7S8tk,4710
|
|
45
|
+
euroeval/prompt_templates/named_entity_recognition.py,sha256=T65oFEtVT8JRF9c7bq2nPm233rftPdEAGic0DU-toko,11835
|
|
46
|
+
euroeval/prompt_templates/reading_comprehension.py,sha256=WbQoal_tjoTt7qsmSZXEWwlI77vgiANcZoZC1l1AZjc,6090
|
|
47
|
+
euroeval/prompt_templates/sentiment_classification.py,sha256=LcFD89e5nMOv4u-Unj8_jHpNjKMmgKPEfz0-e39VbsM,6639
|
|
48
|
+
euroeval/prompt_templates/summarization.py,sha256=eX0uUTf_5Xorm6f_TlBBNwLC9zKvR7YJkP0RSaLWgIw,4585
|
|
49
|
+
euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
|
|
50
|
+
euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
|
|
51
|
+
euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
|
|
52
|
+
euroeval/task_group_utils/sequence_classification.py,sha256=gqd0-l5o7vAY5QIpGSkSqwJwez3Y0r5SqOiywfPNW8A,12239
|
|
53
|
+
euroeval/task_group_utils/text_to_text.py,sha256=QECnGdZ0YLjsbMc6LwXqVi4KMuITdiOjmJUNQtAAOW0,5712
|
|
54
|
+
euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
|
|
55
|
+
euroeval-15.6.0.dist-info/METADATA,sha256=m1NE2zaj_hbP-3kW-2_oC9Ug-POilMU1fVWQTt-SNIU,13027
|
|
56
|
+
euroeval-15.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
57
|
+
euroeval-15.6.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
|
|
58
|
+
euroeval-15.6.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
|
|
59
|
+
euroeval-15.6.0.dist-info/RECORD,,
|