EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -75,7 +75,11 @@ def compute_metrics(
    while True:
        try:
            score: float | None = metric(
-               predictions=predictions,
+               predictions=predictions,
+               references=labels,
+               dataset=dataset,
+               dataset_config=dataset_config,
+               benchmark_config=benchmark_config,
            )
            break
        except Exception as e:
@@ -85,7 +89,7 @@ def compute_metrics(
                "MPS backend out of memory",
            ]
            if not any(error in str(e) for error in oom_error):
-               raise InvalidBenchmark(str(e))
+               raise InvalidBenchmark(str(e)) from e

            if (
                isinstance(metric, HuggingFaceMetric)
@@ -98,7 +102,7 @@ def compute_metrics(
                    "the CPU."
                )
            else:
-               raise InvalidBenchmark(str(e))
+               raise InvalidBenchmark(str(e)) from e
        finally:
            for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                if hasattr(metric, attribute):
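The only behavioural change in these hunks, besides the extra metric keyword arguments, is exception chaining: re-raising with `from e` keeps the original error attached as `__cause__`. A minimal, self-contained illustration of that pattern (plain Python, not EuroEval code; `InvalidBenchmark` is a stand-in here):

import traceback


class InvalidBenchmark(Exception):
    """Stand-in for euroeval.exceptions.InvalidBenchmark."""


def run_metric() -> float:
    raise RuntimeError("CUDA out of memory")


try:
    try:
        run_metric()
    except Exception as e:
        # `from e` sets __cause__, so the underlying OOM error stays in the traceback
        raise InvalidBenchmark(str(e)) from e
except InvalidBenchmark as err:
    assert isinstance(err.__cause__, RuntimeError)
    traceback.print_exception(err)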
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -1,15 +1,16 @@
 """Utility functions related to the token-classification task group."""

 import logging
-import re
 import typing as t
 from copy import deepcopy

-import demjson3
 import numpy as np

 from ..exceptions import InvalidBenchmark
-from ..utils import raise_if_model_output_contains_nan_values
+from ..utils import (
+    extract_json_dict_from_string,
+    raise_if_model_output_contains_nan_values,
+)

 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
@@ -17,7 +18,7 @@ if t.TYPE_CHECKING:
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions


@@ -28,6 +29,7 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -40,6 +42,8 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -142,7 +146,11 @@ def compute_metrics(
         if metric.name == "micro_f1"
     )
     micro_f1_score = metric(
-        predictions=predictions,
+        predictions=predictions,
+        references=list(labels),
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
     )

     # Compute the metrics without MISC tags
@@ -165,7 +173,11 @@ def compute_metrics(
         if metric.name == "micro_f1_no_misc"
     )
     micro_f1_no_misc_score = metric(
-        predictions=predictions_no_misc,
+        predictions=predictions_no_misc,
+        references=labels_no_misc,
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
     )

     # Raise error if the metrics are invalid
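Note that the metric call sites above now receive the dataset and both configs in addition to the predictions and references. The corresponding Metric interface lives in the new `euroeval/metrics/` package, which is not included in this excerpt, so the protocol below is only a sketch inferred from these call sites (the name `MetricLike` and the loose `Any` types are assumptions):

import typing as t


class MetricLike(t.Protocol):
    """Hypothetical protocol mirroring how metrics are invoked in the hunks above."""

    name: str

    def __call__(
        self,
        predictions: t.Sequence[t.Any],
        references: t.Sequence[t.Any],
        dataset: t.Any,  # a datasets.arrow_dataset.Dataset at runtime
        dataset_config: t.Any,  # euroeval.data_models.DatasetConfig
        benchmark_config: t.Any,  # euroeval.data_models.BenchmarkConfig
    ) -> float | None: ...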
@@ -194,55 +206,29 @@ def extract_labels_from_generation(
     Returns:
         The predicted labels.
     """
-    raw_predictions = model_output.sequences
-
-    # Attempt to extract the JSON dictionary from the predictions
-    json_regex = r"\{[^{}]+?\}"
-    json_matches = [
-        re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
-        or raw_prediction
-        for raw_prediction in raw_predictions
-    ]
-    raw_predictions = [
-        json_match.group() if isinstance(json_match, re.Match) else json_match
-        for json_match in json_matches
-    ]
-
     tokens = input_batch["tokens"]
     predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
-    for idx, raw_prediction in enumerate(
-
-
-
-
-
-
-
-                continue
-            elif not all(isinstance(key, str) for key in json_output.keys()):
+    for idx, raw_prediction in enumerate(model_output.sequences):
+        prediction_dict = extract_json_dict_from_string(s=raw_prediction)
+        if prediction_dict is None:
+            continue
+
+        prompt_label_mapping = dataset_config.prompt_label_mapping
+        for prompt_tag_name, named_entities in prediction_dict.items():
+            if not isinstance(named_entities, list):
                 logger.debug(
-                    "The model
-                    "
-                    f"{raw_prediction}"
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list but got {type(named_entities)}. Skipping."
                 )
                 continue
-
+            try:
+                named_entities = [str(ne) for ne in named_entities]
+            except Exception:
                 logger.debug(
-                    "The model
-                    "
-                    f"{raw_prediction}"
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list of strings but got {named_entities}. Skipping."
                 )
                 continue
-            prediction_dict: dict[str, list[str]] = json_output
-        except demjson3.JSONDecodeError:
-            logger.debug(
-                "The model output is not valid JSON, so cannot parse it. Skipping. "
-                f"Here is the output: {raw_prediction!r}"
-            )
-            continue
-
-        prompt_label_mapping = dataset_config.prompt_label_mapping
-        for prompt_tag_name, named_entities in prediction_dict.items():
             try:
                 tag_name = [
                     tag[2:]
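The JSON parsing that used to be done inline with a regex and `demjson3` is now delegated to `extract_json_dict_from_string` from `euroeval/utils.py`. That helper is not part of this excerpt; a rough stand-in with the same contract (return the parsed dict, or None when no valid JSON object is found), assuming plain `json` rather than whatever the real implementation uses, might look like:

import json
import re


def extract_json_dict_from_string(s: str) -> dict | None:
    """Hypothetical stand-in: return the first JSON object found in `s`, else None."""
    match = re.search(r"\{.*\}", s, flags=re.DOTALL)
    if match is None:
        return None
    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


assert extract_json_dict_from_string('Output: {"person": ["Anna"]}') == {"person": ["Anna"]}
assert extract_json_dict_from_string("no json here") is None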
@@ -272,15 +258,15 @@ def extract_labels_from_generation(


 def tokenize_and_align_labels(
-    examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
+    examples: dict, tokeniser: "PreTrainedTokenizer", label2id: dict[str, int]
 ) -> "BatchEncoding":
     """Tokenise all texts and align the labels with them.

     Args:
         examples:
             The examples to be tokenised.
-        tokenizer:
-            A pretrained tokenizer.
+        tokeniser:
+            A pretrained tokeniser.
         label2id:
             A dictionary that converts NER tags to IDs.

@@ -289,22 +275,22 @@ def tokenize_and_align_labels(
     """
     # Tokenize the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
-    tokenized_inputs = tokenizer(
+    tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
     )

     # Extract a mapping between all the tokens and their corresponding word. If the
-    # tokenizer is of a "fast" variant then this can be accessed through the
+    # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
     labels: list[str]
     word_ids: list[int | None]
     for i, labels in enumerate(examples["labels"]):
-        # Try to get the word IDs from the tokenizer
+        # Try to get the word IDs from the tokeniser
         try:
             word_ids = tokenized_inputs.word_ids(batch_index=i)

-        # If the tokenizer is not of a "fast" variant, we have to extract the word
+        # If the tokeniser is not of a "fast" variant, we have to extract the word
         # IDs manually
         except ValueError:
             # Get the list of words in the document
@@ -314,7 +300,7 @@ def tokenize_and_align_labels(
             tok_ids: list[int] = tokenized_inputs.input_ids[i]

             # Decode the token IDs
-            tokens = tokenizer.convert_ids_to_tokens(tok_ids)
+            tokens = tokeniser.convert_ids_to_tokens(tok_ids)
             assert isinstance(tokens, list)

             # Remove prefixes from the tokens
@@ -326,14 +312,14 @@ def tokenize_and_align_labels(
                     tokens[tok_idx] = tok[len(prefix) :]

             # Replace UNK tokens with the correct word
-            tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)
+            tokens = handle_unk_tokens(tokeniser=tokeniser, tokens=tokens, words=words)

-            # Get list of special tokens. Some tokenizers do not record these
+            # Get list of special tokens. Some tokenisers do not record these
             # properly, which is why we convert the values to their indices and
             # then back to strings
             sp_toks = [
-                tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
-                for sp_tok in tokenizer.special_tokens_map.values()
+                tokeniser.convert_ids_to_tokens(tokeniser.convert_tokens_to_ids(sp_tok))
+                for sp_tok in tokeniser.special_tokens_map.values()
             ]

             # Replace special tokens with `None`
@@ -357,7 +343,7 @@ def tokenize_and_align_labels(
             if len(word_idxs) != len(token_idxs):
                 raise InvalidBenchmark(
                     "The tokens could not be aligned with the words during manual "
-                    "word-token alignment. It seems that the tokenizer is neither "
+                    "word-token alignment. It seems that the tokeniser is neither "
                     "of the fast variant nor of a SentencePiece/WordPiece variant."
                 )

@@ -387,9 +373,9 @@ def tokenize_and_align_labels(
                 label = labels[word_id]
                 try:
                     label_id = label2id[label.lower()]
-                except KeyError:
+                except KeyError as e:
                     msg = f"The label {label} was not found in the model's config."
-                    raise InvalidBenchmark(msg)
+                    raise InvalidBenchmark(msg) from e
                 label_ids.append(label_id)

                 # For the other tokens in a word, we set the label to -100
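For readers unfamiliar with the `word_ids`/`-100` convention that `tokenize_and_align_labels` relies on, the standalone sketch below shows the same alignment idea with a fast Hugging Face tokeniser; the model name and labels are examples only, not EuroEval defaults:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
label2id = {"o": 0, "b-per": 1, "b-loc": 2}

words = ["Anna", "lives", "in", "Copenhagen"]
labels = ["b-per", "o", "o", "b-loc"]

encoding = tokeniser(words, is_split_into_words=True, truncation=True)
word_ids = encoding.word_ids(batch_index=0)

label_ids: list[int] = []
previous_word_id = None
for word_id in word_ids:
    if word_id is None:
        label_ids.append(-100)  # special tokens such as [CLS]/[SEP]
    elif word_id != previous_word_id:
        label_ids.append(label2id[labels[word_id].lower()])  # first sub-token of a word
    else:
        label_ids.append(-100)  # remaining sub-tokens are ignored by the loss
    previous_word_id = word_id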
@@ -404,13 +390,13 @@ def tokenize_and_align_labels(


 def handle_unk_tokens(
-    tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: list[str]
 ) -> list[str]:
     """Replace unknown tokens in the tokens with the corresponding word.

     Args:
-        tokenizer:
-            The tokenizer used to tokenize the words.
+        tokeniser:
+            The tokeniser used to tokenize the words.
         tokens:
             The list of tokens.
         words:
@@ -420,15 +406,15 @@ def handle_unk_tokens(
         The list of tokens with unknown tokens replaced by the corresponding word.
     """
     # Locate the token indices of the unknown tokens
-    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokenizer.unk_token]
+    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokeniser.unk_token]

     # Locate the word indices of the words which contain an unknown token
     word_unk_idxs = [
         i
         for i, word in enumerate(words)
-        if tokenizer.unk_token
-        in tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        if tokeniser.unk_token
+        in tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
     ]

@@ -438,8 +424,8 @@ def handle_unk_tokens(
         word = words[word_idx]

         # Tokenize the word, which is now a list containing at least one UNK token
-        tokens_with_unk = tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        tokens_with_unk = tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )

         # Iterate over the tokens in the word
@@ -448,10 +434,10 @@ def handle_unk_tokens(
             # of the content of this token from the word. The result of the `word`
             # variable will be the content of the UNK token.
             # NOTE: This is a bit hacky and not bulletproof. For instance, if the
-            # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
+            # word is "1925-1950" and the tokeniser splits it into ["[UNK]", "-",
             # "19", "50"], then the result will be 2519 instead of 1925. This
             # happens almost never, however, so we can live with it.
-            if possible_unk_token != tokenizer.unk_token:
+            if possible_unk_token != tokeniser.unk_token:
                 word = word.replace(possible_unk_token, "", 1)

             # Replace the token with the word
euroeval/tasks.py
CHANGED
@@ -1,8 +1,9 @@
 """All benchmarks tasks used in EuroEval."""

 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
-from .enums import TaskGroup
+from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
@@ -28,8 +29,9 @@ LA = Task(
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
+    uses_logprobs=True,
 )


@@ -51,6 +53,7 @@ NER = Task(
         "b-misc",
         "i-misc",
     ],
+    uses_structured_output=True,
 )


@@ -71,8 +74,9 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
+    uses_logprobs=True,
 )


@@ -84,6 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
+    allowed_model_types=[ModelType.GENERATIVE],
 )


@@ -93,8 +98,9 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
 )


@@ -104,8 +110,9 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
 )


@@ -115,8 +122,28 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
+)
+
+
+EUROPEAN_VALUES = Task(
+    name="european-values",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.european_values_metric],
+    default_num_few_shot_examples=0,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+    allowed_model_types=[ModelType.GENERATIVE],
+    allowed_generative_types=[
+        GenerativeType.INSTRUCTION_TUNED,
+        GenerativeType.REASONING,
+    ],
+    requires_zero_shot=True,
+    uses_logprobs=True,
+    allow_invalid_model_outputs=False,
 )

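The new keyword arguments above (`uses_logprobs`, `uses_structured_output`, `allowed_model_types`, `allowed_generative_types`, `requires_zero_shot`, `allow_invalid_model_outputs`) end up as attributes on the Task objects, so downstream code can inspect them. A small illustrative read-out, assuming the attribute names match the constructor keywords shown in this diff:

from euroeval.tasks import EUROPEAN_VALUES, NER, SENT

for task in (SENT, NER, EUROPEAN_VALUES):
    # attribute names mirror the keyword arguments in the Task(...) calls above
    print(
        task.name,
        task.default_max_generated_tokens,
        getattr(task, "uses_logprobs", None),
        getattr(task, "uses_structured_output", None),
    )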