EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/portuguese.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import PT
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -67,6 +67,17 @@ GOLDENSWAG_PT_CONFIG = DatasetConfig(
     languages=[PT],
 )

+EUROPEAN_VALUES_PT_CONFIG = DatasetConfig(
+    name="european-values-pt",
+    pretty_name="the Portuguese version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-pt",
+    task=EUROPEAN_VALUES,
+    languages=[PT],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+

 ### Unofficial datasets ###

@@ -79,3 +90,29 @@ BOOLQ_PT_CONFIG = DatasetConfig(
     languages=[PT],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
+    name="european-values-situational-pt",
+    pretty_name="the Portuguese version of the European values evaluation dataset, "
+    "where the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-pt",
+    task=EUROPEAN_VALUES,
+    languages=[PT],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_PT_CONFIG = DatasetConfig(
+    name="european-values-completions-pt",
+    pretty_name="the Portuguese version of the European values evaluation dataset, "
+    "where the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-pt",
+    task=EUROPEAN_VALUES,
+    languages=[PT],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)

euroeval/dataset_configs/spanish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import ES
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -66,6 +66,17 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
     languages=[ES],
 )

+EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
+    name="european-values-es",
+    pretty_name="the Spanish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-es",
+    task=EUROPEAN_VALUES,
+    languages=[ES],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+

 ### Unofficial datasets ###

@@ -107,3 +118,29 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
     languages=[ES],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
+    name="european-values-situational-es",
+    pretty_name="the Spanish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-es",
+    task=EUROPEAN_VALUES,
+    languages=[ES],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_ES_CONFIG = DatasetConfig(
+    name="european-values-completions-es",
+    pretty_name="the Spanish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-es",
+    task=EUROPEAN_VALUES,
+    languages=[ES],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)

euroeval/dataset_configs/swedish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import SV
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -67,6 +67,17 @@ HELLASWAG_SV_CONFIG = DatasetConfig(
     languages=[SV],
 )

+EUROPEAN_VALUES_SV_CONFIG = DatasetConfig(
+    name="european-values-sv",
+    pretty_name="the Swedish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-sv",
+    task=EUROPEAN_VALUES,
+    languages=[SV],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+

 ### Unofficial datasets ###

@@ -118,3 +129,29 @@ GOLDENSWAG_SV_CONFIG = DatasetConfig(
     languages=[SV],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
+    name="european-values-situational-sv",
+    pretty_name="the Swedish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-sv",
+    task=EUROPEAN_VALUES,
+    languages=[SV],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
+    name="european-values-completions-sv",
+    pretty_name="the Swedish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-sv",
+    task=EUROPEAN_VALUES,
+    languages=[SV],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
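
The three dataset-config hunks above all follow the same pattern: each language module gains one official and two unofficial European-values entries that use only a test split, disable bootstrapped resampling, and pass the question text through verbatim as the prompt. As a hedged illustration of that pattern in isolation, here is what a corresponding entry could look like for another language; the Danish `DA` constant and the "european-values-da" identifiers are assumptions for illustration and are not shown in this part of the diff.

    # Illustrative sketch only: mirrors the DatasetConfig pattern added above.
    # The Danish DA constant and the "european-values-da" ids are assumed here.
    from euroeval.data_models import DatasetConfig
    from euroeval.languages import DA
    from euroeval.tasks import EUROPEAN_VALUES

    EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
        name="european-values-da",
        pretty_name="the Danish version of the European values evaluation dataset",
        huggingface_id="EuroEval/european-values-da",
        task=EUROPEAN_VALUES,
        languages=[DA],
        splits=["test"],               # only the test split is used
        bootstrap_samples=False,       # no bootstrapped resampling of the test set
        _instruction_prompt="{text}",  # the question text is used verbatim as the prompt
    )

The `bootstrap_samples=False` flag is what the generation changes further down key off when deciding whether to reuse cached model outputs.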
euroeval/enums.py
CHANGED

@@ -40,14 +40,11 @@ class InferenceBackend(AutoStrEnum):
             VLLM library.
         LITELLM:
             LiteLLM library.
-        NONE:
-            No inference backend used (e.g., for human evaluation).
     """

     TRANSFORMERS = auto()
     VLLM = auto()
     LITELLM = auto()
-    NONE = auto()


 class ModelType(AutoStrEnum):
@@ -58,13 +55,10 @@ class ModelType(AutoStrEnum):
             An encoder (i.e., BERT-style) model.
         GENERATIVE:
             A generative model. Can be either decoder or encoder-decoder (aka seq2seq).
-        HUMAN:
-            Human evaluator.
     """

     ENCODER = auto()
     GENERATIVE = auto()
-    HUMAN = auto()


 class GenerativeType(AutoStrEnum):
euroeval/finetuning.py
CHANGED

@@ -119,7 +119,7 @@ def finetune(
             # NaN values can appear in the model output when using mixed precision, as
             # the hidden states get overflowed. In this case we try to disable mixed
             # precision and try again.
-            except NaNValueInModelOutput:
+            except NaNValueInModelOutput as e:
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
@@ -131,11 +131,11 @@
                     raise InvalidBenchmark(
                         "NaN value detected in model outputs, even with mixed "
                         "precision disabled."
-                    )
+                    ) from e

             except Exception as e:
                 if "CUDA" not in str(e) and "out of memory" not in str(e):
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e

                 if bs <= 1:
                     msg = "Could not benchmark the model, even with a batch size of 1!"
@@ -146,7 +146,7 @@
                         "environment variable set, as this removes the upper bound "
                         "on the memory usage."
                     )
-                    raise InvalidBenchmark(msg)
+                    raise InvalidBenchmark(msg) from e

                 model_already_initialized = False

@@ -195,7 +195,7 @@ def finetune_single_iteration(

     trainer = model.trainer_class(
         model=model.get_pytorch_module(),
-        processing_class=model.
+        processing_class=model.get_tokeniser(),
         args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["val"],
@@ -245,7 +245,7 @@ def finetune_single_iteration(
             clear_memory()
             raise e
     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(str(e))
+        raise InvalidBenchmark(str(e)) from e

     return test_scores

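
The recurring change in this file is exception chaining: every `raise InvalidBenchmark(...)` that happens inside an `except` block now carries `from e`. A minimal, self-contained sketch of what that buys (the exception class below is a stand-in, not EuroEval's own):

    class InvalidBenchmark(Exception):
        """Stand-in for EuroEval's InvalidBenchmark exception."""


    def run_benchmark() -> None:
        # Simulate a low-level failure that the benchmarking wrapper wants to re-raise.
        raise RuntimeError("CUDA error: device-side assert triggered")


    try:
        run_benchmark()
    except RuntimeError as e:
        # `from e` records the RuntimeError as __cause__, so the traceback reads
        # "The above exception was the direct cause of ..." instead of the implicit
        # "During handling of the above exception, another exception occurred".
        raise InvalidBenchmark(str(e)) from e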
euroeval/generation.py
CHANGED

@@ -6,6 +6,7 @@ import typing as t
 from pathlib import Path

 import more_itertools as mit
+from datasets import Dataset
 from tqdm.auto import tqdm

 from .enums import BatchingPreference, TaskGroup
@@ -15,10 +16,10 @@ from .model_cache import (
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory
+from .utils import clear_memory, log_once

 if t.TYPE_CHECKING:
-    from datasets import
+    from datasets import DatasetDict

     from .benchmark_modules import BenchmarkModule
     from .data_models import (
@@ -78,7 +79,7 @@ def generate(

     scores: list[dict[str, float]] = list()
     for idx in tqdm(
-        iterable=range(
+        iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
     ):
@@ -89,7 +90,6 @@ def generate(
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-
         logger.debug(f"Test scores for iteration {idx}: {test_scores}")
         scores.append(test_scores)
         clear_memory()
@@ -126,10 +126,15 @@ def generate_single_iteration(
     """
     cache.load()

-    # Split up the dataset into a cached and non-cached part
-
-
-
+    # Split up the dataset into a cached and non-cached part, unless we are not
+    # bootstrapping the samples. In that case, we just use the dataset as is.
+    if dataset_config.bootstrap_samples:
+        cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
+            dataset=dataset, cache=cache
+        )
+    else:
+        cached_dataset = Dataset.from_dict({})
+        non_cached_dataset = dataset

     all_preds: list[str] = list()

@@ -230,9 +235,12 @@
         cached_labels = list(cached_labels)
         ground_truth = non_cached_labels + cached_labels
     else:
-
-            "
+        log_once(
+            "No labels found in the dataset. We assume that this is intentional, and "
+            "will not supply any ground truth labels for evaluation.",
+            level=logging.DEBUG,
         )
+        ground_truth = []

     itr_scores: dict[str, float] = model.compute_metrics(
         model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
@@ -293,10 +301,13 @@ def debug_log(
         case (
             TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
         ):
-
-
-
-
+            if "label" in batch:
+                labels = [
+                    dataset_config.prompt_label_mapping.get(label, label).lower()
+                    for label in batch["label"]
+                ]
+            else:
+                labels = ["N/A"] * len(extracted_labels)

         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [
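
In `generate_single_iteration`, the model-output cache is now only split off when the dataset is configured with `bootstrap_samples=True`; otherwise an empty `Dataset` stands in for the cached part and the whole dataset is evaluated as-is. A rough standalone sketch of that branch, using a simplified stand-in for `split_dataset_into_cached_and_non_cached` (keyed here on a text column purely for illustration):

    from datasets import Dataset


    def split_cached(dataset: Dataset, cache: dict) -> tuple[Dataset, Dataset]:
        """Simplified stand-in for EuroEval's split_dataset_into_cached_and_non_cached."""
        cached_idx = [i for i, row in enumerate(dataset) if row["text"] in cache]
        non_cached_idx = [i for i in range(len(dataset)) if i not in set(cached_idx)]
        return dataset.select(cached_idx), dataset.select(non_cached_idx)


    def split_for_generation(
        dataset: Dataset, cache: dict, bootstrap_samples: bool
    ) -> tuple[Dataset, Dataset]:
        # Mirrors the new branch: only consult the cache when bootstrapping samples,
        # otherwise treat the full dataset as non-cached and evaluate it once.
        if bootstrap_samples:
            return split_cached(dataset=dataset, cache=cache)
        return Dataset.from_dict({}), dataset


    ds = Dataset.from_dict({"text": ["a", "b", "c"]})
    cached, non_cached = split_for_generation(ds, cache={"b": "..."}, bootstrap_samples=False)
    assert len(cached) == 0 and len(non_cached) == 3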
euroeval/generation_utils.py
CHANGED

@@ -8,19 +8,23 @@ import typing as t

 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
-from .
+from .tokenization_utils import apply_chat_template
+from .utils import extract_multiple_choice_labels, log_once

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer

-    from .data_models import DatasetConfig, ModelConfig
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

 logger = logging.getLogger("euroeval")


 def extract_few_shot_examples(
-    dataset: "DatasetDict",
+    dataset: "DatasetDict",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    itr_idx: int,
 ) -> list[dict[str, t.Any]]:
     """Extract few-shot examples from a dataset.

@@ -33,12 +37,32 @@ def extract_few_shot_examples(
             The dataset to extract the few-shot examples from.
         dataset_config:
             The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
         itr_idx:
             The index of the dataset in the iterator.

     Returns:
         The few-shot examples.
+
+    Raises:
+        InvalidBenchmark:
+            If there are not enough short examples for few-shot learning.
     """
+    if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+        msg = (
+            "This task only allows zero-shot evaluation, so even though you have "
+            "requested few-shot evaluation "
+        )
+        if benchmark_config.run_with_cli:
+            msg += "(by not setting the --zero-shot flag), "
+        else:
+            msg += "(by setting the default `few_shot=True` argument), "
+        msg += "we will run the evaluation in zero-shot mode."
+        benchmark_config.few_shot = False
+        log_once(msg, level=logging.DEBUG)
+        return []
+
     random_seed = 4242 + itr_idx
     num_few_shots = dataset_config.num_few_shot_examples
     few_shot_examples: list[dict[str, t.Any]] = list()
@@ -63,12 +87,19 @@

             shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
             labels = it.cycle(dataset_config.labels)
+            labels_with_no_samples: set[str] = set()
             while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                if len(labels_with_no_samples) == len(dataset_config.labels):
+                    raise InvalidBenchmark(
+                        "Could not find enough examples for few-shot learning. "
+                        "Please check the dataset and the labels."
+                    )
                 label = next(labels)
                 possible_examples = shuffled_train.filter(
                     lambda x: x["label"].lower() == label.lower()
                 )
                 if len(possible_examples) == 0:
+                    labels_with_no_samples.add(label)
                     continue
                 example = possible_examples.select(range(1))[0]
                 few_shot_examples.append(example)
@@ -144,7 +175,7 @@ def apply_prompt(
     dataset_config: "DatasetConfig",
     instruction_model: bool,
     always_populate_text_field: bool,
-
+    tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:
     """Apply prompt template to an example, potentially with few-shot examples.

@@ -160,16 +191,16 @@
         always_populate_text_field:
             Whether to always populate the 'text' field in the examples, as opposed to
             the 'messages' field.
-
-            The
+        tokeniser:
+            The tokeniser to use for the model. If None, the tokeniser is not used.

     Returns:
         The example with the few-shot examples applied.
     """
     # Sanity check
-    if instruction_model and always_populate_text_field and
+    if instruction_model and always_populate_text_field and tokeniser is None:
         raise ValueError(
-            "The `
+            "The `tokeniser` argument must be provided when the model is instruction "
             "tuned and when we are not just returning the raw messages."
         )

@@ -199,18 +230,49 @@
         return dataset_config.prompt_template.format(**kwargs), ""

     match dataset_config.task.task_group:
-        case
-
-        ):
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
             few_shot_sections = [
                 create_prompt(
                     text=example["text"].replace("\n", " ").strip(),
                     label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
-                create_prompt(
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                 for text in examples["text"]
             ]

@@ -228,6 +290,7 @@
             ]

         case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()

             def create_label(example: dict) -> str:
                 prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -249,12 +312,15 @@
                 create_prompt(
                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                     label=create_label(example=example),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
                 create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                 )
                 for tokens in examples["tokens"]
             ]
@@ -298,30 +364,31 @@
         examples["messages"] = messages_list

     else:
-        assert
+        assert tokeniser is not None

         # Pick the chat template that matches the language of the dataset, if such a
         # template exists
         chat_template: str | None = None
-        if
+        if hasattr(tokeniser, "chat_template") and isinstance(
+            tokeniser.chat_template, dict
+        ):
            language_codes = [
                 language.code for language in dataset_config.languages
             ]
-            for name, candidate_template in
+            for name, candidate_template in tokeniser.chat_template.items():
                 if name.lower() in language_codes:
                     chat_template = candidate_template
                     log_once(
-                        f"Using the {name!r} chat template for the
+                        f"Using the {name!r} chat template for the tokeniser for "
                         f"model {model_config.model_id!r}.",
                         level=logging.DEBUG,
                     )
                     break

         texts = [
-
+            apply_chat_template(
                 conversation=messages,
-
-                add_generation_prompt=True,
+                tokeniser=tokeniser,
                 chat_template=chat_template,
             )
             for messages in messages_list
@@ -343,4 +410,7 @@
             for new_prompt, _ in new_sections
         ]

+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
     return examples
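
The rewritten block near the end of apply_prompt handles tokenisers whose chat_template attribute is a dict of named templates, preferring one whose name matches a language code of the dataset before falling back to the default. A small sketch of just that matching logic, using a stand-in object rather than a real Hugging Face tokenizer:

    from types import SimpleNamespace


    def pick_chat_template(tokeniser: object, language_codes: list[str]) -> str | None:
        """Return a language-specific chat template if one is available, else None."""
        # Mirrors the diff: only dict-valued chat_template attributes are inspected;
        # a plain string template (or no template at all) falls through to None.
        if hasattr(tokeniser, "chat_template") and isinstance(tokeniser.chat_template, dict):
            for name, candidate_template in tokeniser.chat_template.items():
                if name.lower() in language_codes:
                    return candidate_template
        return None


    # Stand-in tokeniser with per-language templates; not a real transformers object.
    tok = SimpleNamespace(chat_template={"default": "<default>", "SV": "<swedish>"})
    assert pick_chat_template(tok, language_codes=["sv"]) == "<swedish>"
    assert pick_chat_template(SimpleNamespace(chat_template="<single>"), ["sv"]) is None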