EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/generation.py
CHANGED

@@ -1,5 +1,6 @@
 """Functions related to text generation of models."""
 
+import collections.abc as c
 import logging
 import sys
 import typing as t
@@ -33,11 +34,11 @@ if t.TYPE_CHECKING:
 
 def generate(
     model: "BenchmarkModule",
-    datasets:
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) ->
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through generation.
 
     Args:
@@ -253,7 +254,7 @@ def generate_single_iteration(
 def debug_log(
     batch: dict[str, t.Any],
     model_output: "GenerativeModelOutput",
-    extracted_labels:
+    extracted_labels: c.Sequence[dict | str | c.Sequence[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
     """Log inputs and outputs for debugging purposes.
@@ -331,7 +332,7 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-    metadata_keys:
+    metadata_keys: c.Sequence[str] = [
         key
         for key in batch.keys()
         if key not in ["text", "messages", "label", "labels", "target_text"]
euroeval/generation_utils.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to generative models."""
 
+import collections.abc as c
 import itertools as it
 import json
 import logging
@@ -25,7 +26,7 @@ def extract_few_shot_examples(
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
     itr_idx: int,
-) ->
+) -> c.Sequence[dict[str, t.Any]]:
     """Extract few-shot examples from a dataset.
 
     This will always extract the examples from the training split.
@@ -78,7 +79,7 @@ def extract_few_shot_examples(
                 lambda example: len(example["text"]) < max_num_tokens
             )
             num_short_examples = len(train_with_short_examples)
-            if num_short_examples >=
+            if num_short_examples >= num_few_shots:
                 break
         else:
             raise InvalidBenchmark(
@@ -143,7 +144,7 @@ def extract_few_shot_examples(
                 lambda example: len(example["context"]) < max_num_tokens
             )
             num_short_examples = len(train_with_short_examples)
-            if num_short_examples >=
+            if num_short_examples >= num_few_shots:
                 break
         else:
             raise InvalidBenchmark(
@@ -170,7 +171,7 @@ def extract_few_shot_examples(
 
 def apply_prompt(
     examples: dict[str, t.Any],
-    few_shot_examples:
+    few_shot_examples: c.Sequence[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     generative_type: GenerativeType | None,
@@ -431,7 +432,7 @@ def apply_prompt(
 
 
 def raise_if_wrong_params(
-    model_config: "ModelConfig", allowed_params: dict[re.Pattern,
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, c.Sequence[str]]
 ) -> None:
     """Raise an error if the model configuration has invalid parameters.
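
The new `raise_if_wrong_params` signature types `allowed_params` as `dict[re.Pattern, c.Sequence[str]]`, i.e. a mapping from compiled patterns (presumably matched against the model configuration) to the parameter names allowed for matching models. A hedged sketch of how such a mapping could be consumed (`check_params` and its error message are assumptions for illustration, not EuroEval's implementation):

import collections.abc as c
import re


def check_params(
    model_id: str,
    params: c.Sequence[str],
    allowed_params: dict[re.Pattern, c.Sequence[str]],
) -> None:
    # Collect every parameter name whose pattern matches this model id...
    allowed: set[str] = set()
    for pattern, param_names in allowed_params.items():
        if pattern.search(model_id):
            allowed.update(param_names)
    # ...and reject anything outside that set.
    disallowed = [param for param in params if param not in allowed]
    if disallowed:
        raise ValueError(f"Invalid parameters for {model_id!r}: {disallowed}")


# Hypothetical usage: only gpt-* models may set temperature or top_p here.
check_params(
    model_id="gpt-4o",
    params=["temperature"],
    allowed_params={re.compile(r"^gpt-"): ["temperature", "top_p"]},
)

Keying the whitelist by `re.Pattern` rather than exact model names lets one entry cover a whole model family, at the cost of having to define precedence if several patterns match; the sketch above simply unions all matching entries.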