EuroEval 15.2.0 (euroeval-15.2.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of EuroEval was flagged as potentially problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/task_utils/token_classification.py ADDED
@@ -0,0 +1,464 @@
+ """Utility functions related to the token-classification task group."""
+
+ import importlib.util
+ import logging
+ import re
+ import typing as t
+ from copy import deepcopy
+
+ import evaluate
+ import numpy as np
+ from evaluate import EvaluationModule
+ from transformers import PreTrainedTokenizer
+
+ from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+ from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
+ from ..utils import raise_if_model_output_contains_nan_values
+
+ if t.TYPE_CHECKING:
+     from transformers import BatchEncoding
+
+     from ..types import Labels, Predictions
+
+ if importlib.util.find_spec("demjson3") is not None:
+     import demjson3
+
+
+ logger = logging.getLogger("euroeval")
+
+
+ def compute_metrics(
+     model_outputs_and_labels: tuple["Predictions", "Labels"],
+     has_misc_tags: bool,
+     dataset_config: "DatasetConfig",
+     benchmark_config: "BenchmarkConfig",
+ ) -> dict[str, float]:
+     """Compute the metrics needed for evaluation.
+
+     Args:
+         model_outputs_and_labels:
+             The first array contains the probability predictions and the second
+             array contains the true labels.
+         has_misc_tags:
+             Whether the dataset has MISC tags.
+         dataset_config:
+             The configuration of the dataset.
+         benchmark_config:
+             The configuration of the benchmark.
+
+     Returns:
+         A dictionary with the names of the metrics as keys and the metric values as
+         values.
+     """
+     model_outputs, labels = model_outputs_and_labels
+     raise_if_model_output_contains_nan_values(model_output=model_outputs)
+
+     metrics = {
+         metric_cfg.name: (
+             evaluate.load(
+                 path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
+             )
+             if metric_cfg.huggingface_id != ""
+             else None
+         )
+         for metric_cfg in dataset_config.task.metrics
+     }
+
+     predictions: list[list[str]]
+     if not isinstance(model_outputs[0][0], str):
+         raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
+
+         # Remove ignored index (special tokens)
+         predictions = [
+             [
+                 dataset_config.id2label[pred_id]
+                 for pred_id, lbl_id in zip(pred, label)
+                 if lbl_id != -100
+             ]
+             for pred, label in zip(raw_predictions, labels)
+         ]
+         labels = [
+             [
+                 (
+                     dataset_config.id2label[int(lbl_id)]
+                     if isinstance(lbl_id, int) or isinstance(lbl_id, np.int_)
+                     else lbl_id
+                 )
+                 for lbl_id in label
+                 if lbl_id != -100
+             ]
+             for label in labels
+         ]
+
+     else:
+         predictions = model_outputs  # type: ignore[assignment]
+
+     # Replace predicted tag with either MISC or O tags if they are not part of the
+     # dataset
+     labels_without_misc = {
+         label
+         for label in dataset_config.id2label.values()
+         if label not in {"b-misc", "i-misc"}
+     }
+     ner_tag: str
+     for i, prediction_list in enumerate(predictions):
+         for j, ner_tag in enumerate(prediction_list):
+             if ner_tag not in labels_without_misc:
+                 if has_misc_tags and ner_tag[:2] == "b-":
+                     predictions[i][j] = "b-misc"
+                 elif has_misc_tags and ner_tag[:2] == "i-":
+                     predictions[i][j] = "i-misc"
+                 else:
+                     predictions[i][j] = "o"
+
+     # Remove MISC labels from predictions
+     predictions_no_misc = deepcopy(predictions)
+     for i, prediction_list in enumerate(predictions_no_misc):
+         for j, ner_tag in enumerate(prediction_list):
+             if ner_tag[-4:] == "misc":
+                 predictions_no_misc[i][j] = "o"
+
+     # Remove MISC labels from labels
+     labels_no_misc: list[list[str]] = deepcopy(labels)  # type: ignore[arg-type]
+     for i, label_list in enumerate(labels_no_misc):
+         for j, ner_tag in enumerate(label_list):
+             if (
+                 isinstance(ner_tag, str)
+                 and len(ner_tag) >= 4
+                 and ner_tag[-4:] == "misc"
+             ):
+                 labels_no_misc[i][j] = "o"
+
+     # Compute the metrics
+     # We manually set the F1 metric to be 100% if both the labels and the models
+     # have no NER tags in them, since this causes an error with the `compute`
+     # method otherwise
+     predictions_all_zero = all(
+         all(ner_tag == "o" for ner_tag in prediction_list)
+         for prediction_list in predictions
+     )
+     labels_all_zero = all(
+         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels
+     )
+     if predictions_all_zero and labels_all_zero:
+         results = dict(overall_f1=1.0)
+     else:
+         metric = metrics["micro_f1"]
+         assert isinstance(metric, EvaluationModule)
+         results = metric.compute(predictions=predictions, references=labels)
+
+     # Compute the metrics without MISC tags
+     # We manually set the F1 metric to be 100% if both the labels and the models
+     # have no NER tags in them, since this causes an error with the `compute`
+     # method otherwise
+     predictions_no_misc_all_zero = all(
+         all(ner_tag == "o" for ner_tag in prediction_list)
+         for prediction_list in predictions_no_misc
+     )
+     labels_no_misc_all_zero = all(
+         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels_no_misc
+     )
+     if predictions_no_misc_all_zero and labels_no_misc_all_zero:
+         results_no_misc = dict(overall_f1=1.0)
+     else:
+         metric = metrics["micro_f1_no_misc"]
+         assert isinstance(metric, EvaluationModule)
+         results_no_misc = metric.compute(
+             predictions=predictions_no_misc, references=labels_no_misc
+         )
+
+     # Raise error if the metrics are invalid
+     if results is None or results_no_misc is None:
+         raise InvalidBenchmark("The predictions and labels are not of the same length.")
+
+     return dict(
+         micro_f1_no_misc=results_no_misc["overall_f1"], micro_f1=results["overall_f1"]
+     )
+
+
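
For context (not part of the package diff): the tag-remapping step in compute_metrics can be illustrated in isolation. The sketch below is a minimal, self-contained rewrite of that rule with made-up tag names; it is not EuroEval's API.

# Illustrative sketch of the MISC-remapping rule used in compute_metrics above.
# The tag set below is invented for the example; EuroEval uses lowercase BIO tags.

def remap_unknown_tags(
    predictions: list[list[str]], known_labels: set[str], has_misc_tags: bool
) -> list[list[str]]:
    """Map predicted tags that the dataset does not define to MISC or O."""
    for sentence in predictions:
        for j, tag in enumerate(sentence):
            if tag not in known_labels:
                if has_misc_tags and tag.startswith("b-"):
                    sentence[j] = "b-misc"
                elif has_misc_tags and tag.startswith("i-"):
                    sentence[j] = "i-misc"
                else:
                    sentence[j] = "o"
    return predictions


# A prediction containing a tag ("b-date") that the dataset does not define:
preds = [["b-per", "i-per", "o", "b-date"]]
known = {"b-per", "i-per", "b-loc", "i-loc", "b-org", "i-org", "o"}
print(remap_unknown_tags(preds, known, has_misc_tags=True))
# -> [['b-per', 'i-per', 'o', 'b-misc']]
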
+ def extract_labels_from_generation(
+     input_batch: dict[str, list],
+     model_output: "GenerativeModelOutput",
+     dataset_config: "DatasetConfig",
+ ) -> list[t.Any]:
+     """Extract the predicted labels from the generated output.
+
+     Args:
+         input_batch:
+             The input batch, where the keys are the feature names and the values
+             are lists with the feature values.
+         model_output:
+             The raw generated output of the model.
+         dataset_config:
+             The configuration of the dataset.
+
+     Returns:
+         The predicted labels.
+     """
+     if importlib.util.find_spec("demjson3") is None:
+         raise NeedsExtraInstalled(extra="generative")
+
+     raw_predictions = model_output.sequences
+
+     # Attempt to extract the JSON dictionary from the predictions
+     json_regex = r"\{.+?\}"
+     json_matches = [
+         re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
+         or raw_prediction
+         for raw_prediction in raw_predictions
+     ]
+     raw_predictions = [
+         json_match.group() if isinstance(json_match, re.Match) else json_match
+         for json_match in json_matches
+     ]
+
+     tokens = input_batch["tokens"]
+     predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
+     for idx, raw_prediction in enumerate(raw_predictions):
+         try:
+             json_output = demjson3.decode(txt=raw_prediction)
+             if not isinstance(json_output, dict):
+                 logger.debug(
+                     "The model output is not a JSON dictionary, so cannot parse "
+                     f"it. Skipping. Here is the output: {raw_prediction}"
+                 )
+                 continue
+             elif not all(isinstance(key, str) for key in json_output.keys()):
+                 logger.debug(
+                     "The model output is not a JSON dictionary with string keys, "
+                     "so cannot parse it. Skipping. Here is the output: "
+                     f"{raw_prediction}"
+                 )
+                 continue
+             elif not all(isinstance(value, list) for value in json_output.values()):
+                 logger.debug(
+                     "The model output is not a JSON dictionary with list values, "
+                     "so cannot parse it. Skipping. Here is the output: "
+                     f"{raw_prediction}"
+                 )
+                 continue
+             prediction_dict: dict[str, list[str]] = json_output
+         except demjson3.JSONDecodeError:
+             logger.debug(
+                 "The model output is not valid JSON, so cannot parse it. Skipping. "
+                 f"Here is the output: {raw_prediction!r}"
+             )
+             continue
+
+         prompt_label_mapping = dataset_config.prompt_label_mapping
+         for prompt_tag_name, named_entities in prediction_dict.items():
+             try:
+                 tag_name = [
+                     tag[2:]
+                     for tag, prompt_tag in prompt_label_mapping.items()
+                     if prompt_tag == prompt_tag_name
+                 ][0]
+             except IndexError:
+                 logger.debug(
+                     "The model produced an invalid prompt tag name, "
+                     f"{prompt_tag_name}. Skipping."
+                 )
+                 continue
+
+             named_entities = [str(named_entity) for named_entity in named_entities]
+             for named_entity in named_entities:
+                 for ne_idx, named_entity_word in enumerate(named_entity.split()):
+                     for token_idx, token in enumerate(tokens[idx]):
+                         if named_entity_word in token:
+                             if ne_idx == 0:
+                                 predicted_labels[idx][token_idx] = f"b-{tag_name}"
+                             elif (
+                                 predicted_labels[idx][token_idx] == "o"
+                                 and predicted_labels[idx][token_idx - 1][2:] == tag_name
+                             ):
+                                 predicted_labels[idx][token_idx] = f"i-{tag_name}"
+     return predicted_labels
+
+
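
For context (not part of the package diff): a minimal sketch of the JSON-extraction step above. It uses the same regex as extract_labels_from_generation, but parses with the standard-library json module rather than demjson3, which the function itself uses so that slightly malformed JSON can still be recovered. The model reply and the tag names in it are invented for the example.

import json
import re

# An invented, chatty model reply with the JSON dictionary buried in the middle.
raw_prediction = (
    "Sure! Here are the named entities:\n"
    '{"person": ["Angela Merkel"], "location": ["Copenhagen"]}\n'
    "Let me know if you need anything else."
)

# Same pattern as in extract_labels_from_generation: the first non-greedy
# {...} block, with DOTALL so that it may span several lines.
match = re.search(pattern=r"\{.+?\}", string=raw_prediction, flags=re.DOTALL)
if match is not None:
    entities = json.loads(match.group())
    print(entities)
    # -> {'person': ['Angela Merkel'], 'location': ['Copenhagen']}
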
+ def tokenize_and_align_labels(
+     examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
+ ) -> "BatchEncoding":
+     """Tokenise all texts and align the labels with them.
+
+     Args:
+         examples:
+             The examples to be tokenised.
+         tokenizer:
+             A pretrained tokenizer.
+         label2id:
+             A dictionary that converts NER tags to IDs.
+
+     Returns:
+         A dictionary containing the tokenized data as well as labels.
+     """
+     # Tokenize the texts. We use the `is_split_into_words` argument here because
+     # the texts in our dataset are lists of words (with a label for each word)
+     tokenized_inputs = tokenizer(
+         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
+     )
+
+     # Extract a mapping between all the tokens and their corresponding word. If the
+     # tokenizer is of a "fast" variant then this can be accessed through the
+     # `word_ids` method. Otherwise, we have to extract it manually.
+     all_labels: list[list[int]] = list()
+     labels: list[str]
+     word_ids: list[int | None]
+     for i, labels in enumerate(examples["labels"]):
+         # Try to get the word IDs from the tokenizer
+         try:
+             word_ids = tokenized_inputs.word_ids(batch_index=i)
+
+         # If the tokenizer is not of a "fast" variant, we have to extract the word
+         # IDs manually
+         except ValueError:
+             # Get the list of words in the document
+             words: list[str] = examples["tokens"][i]
+
+             # Get the list of token IDs in the document
+             tok_ids: list[int] = tokenized_inputs.input_ids[i]
+
+             # Decode the token IDs
+             tokens = tokenizer.convert_ids_to_tokens(tok_ids)
+             assert isinstance(tokens, list)
+
+             # Remove prefixes from the tokens
+             prefixes_to_remove = ["▁", "##"]
+             for tok_idx, tok in enumerate(tokens):
+                 if tok:
+                     for prefix in prefixes_to_remove:
+                         if tok.startswith(prefix):
+                             tokens[tok_idx] = tok[len(prefix) :]
+
+             # Replace UNK tokens with the correct word
+             tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)
+
+             # Get list of special tokens. Some tokenizers do not record these
+             # properly, which is why we convert the values to their indices and
+             # then back to strings
+             sp_toks = [
+                 tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
+                 for sp_tok in tokenizer.special_tokens_map.values()
+             ]
+
+             # Replace special tokens with `None`
+             tokens_with_none = [None if tok in sp_toks else tok for tok in tokens]
+
+             # Get the alignment between the words and the tokens, on a character
+             # level
+             word_idxs = [
+                 word_idx for word_idx, word in enumerate(words) for _ in str(word)
+             ]
+             token_idxs = [
+                 tok_idx
+                 for tok_idx, tok_or_none in enumerate(tokens_with_none)
+                 for _ in str(tok_or_none)
+                 if tok_or_none is not None
+             ]
+             alignment = list(zip(word_idxs, token_idxs))
+
+             # Raise error if there are not as many characters in the words as in
+             # the tokens. This can be due to the use of a different prefix.
+             if len(word_idxs) != len(token_idxs):
+                 raise InvalidBenchmark(
+                     "The tokens could not be aligned with the words during manual "
+                     "word-token alignment. It seems that the tokenizer is neither "
+                     "of the fast variant nor of a SentencePiece/WordPiece variant."
+                 )
+
+             # Get the aligned word IDs
+             word_ids = list()
+             for tok_idx, tok_or_none in enumerate(tokens_with_none):
+                 if tok_or_none is None or tok_or_none == "":
+                     word_ids.append(None)
+                 else:
+                     word_idx = [
+                         word_idx
+                         for word_idx, token_idx in alignment
+                         if token_idx == tok_idx
+                     ][0]
+                     word_ids.append(word_idx)
+
+         previous_word_idx: int | None = None
+         label_ids: list[int] = list()
+         for word_id in word_ids:
+             # Special tokens have a word id that is None. We set the label to -100
+             # so they are automatically ignored in the loss function
+             if word_id is None:
+                 label_ids.append(-100)
+
+             # We set the label for the first token of each word
+             elif word_id != previous_word_idx:
+                 label = labels[word_id]
+                 try:
+                     label_id = label2id[label.lower()]
+                 except KeyError:
+                     msg = f"The label {label} was not found in the model's config."
+                     raise InvalidBenchmark(msg)
+                 label_ids.append(label_id)
+
+             # For the other tokens in a word, we set the label to -100
+             else:
+                 label_ids.append(-100)
+
+             previous_word_idx = word_id
+
+         all_labels.append(label_ids)
+     tokenized_inputs["labels"] = all_labels
+     return tokenized_inputs
+
+
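
For context (not part of the package diff): a minimal usage sketch of tokenize_and_align_labels with a fast tokenizer, so the word_ids branch is taken. The checkpoint, tag set and sentence are only examples, and the sketch assumes the wheel is installed; any fast Hugging Face tokenizer and any lowercase label2id mapping should behave the same way.

from transformers import AutoTokenizer

from euroeval.task_utils.token_classification import tokenize_and_align_labels

# Any fast tokenizer works here; this checkpoint is just an example.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# A toy batch with one sentence. The label2id keys must be lowercase, since the
# function lowercases the gold labels before looking them up.
examples = {
    "tokens": [["Angela", "Merkel", "visited", "Copenhagen"]],
    "labels": [["B-PER", "I-PER", "O", "B-LOC"]],
}
label2id = {"o": 0, "b-per": 1, "i-per": 2, "b-loc": 3, "i-loc": 4}

batch = tokenize_and_align_labels(
    examples=examples, tokenizer=tokenizer, label2id=label2id
)

# Special tokens and word-internal subwords get the label -100 so the loss
# function ignores them; only the first subword of each word keeps its tag.
print(batch["labels"][0])
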
+ def handle_unk_tokens(
+     tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
+ ) -> list[str]:
+     """Replace unknown tokens in the tokens with the corresponding word.
+
+     Args:
+         tokenizer:
+             The tokenizer used to tokenize the words.
+         tokens:
+             The list of tokens.
+         words:
+             The list of words.
+
+     Returns:
+         The list of tokens with unknown tokens replaced by the corresponding word.
+     """
+     # Locate the token indices of the unknown tokens
+     token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokenizer.unk_token]
+
+     # Locate the word indices of the words which contain an unknown token
+     word_unk_idxs = [
+         i
+         for i, word in enumerate(words)
+         if tokenizer.unk_token
+         in tokenizer.convert_ids_to_tokens(
+             tokenizer.encode(word, add_special_tokens=False)
+         )
+     ]
+
+     # Iterate over the token index and word index pairs
+     for tok_idx, word_idx in zip(token_unk_idxs, word_unk_idxs):
+         # Fetch the word
+         word = words[word_idx]
+
+         # Tokenize the word, which is now a list containing at least one UNK token
+         tokens_with_unk = tokenizer.convert_ids_to_tokens(
+             tokenizer.encode(word, add_special_tokens=False)
+         )
+
+         # Iterate over the tokens in the word
+         for possible_unk_token in tokens_with_unk:
+             # If the token is not an UNK token then we remove the first occurrence
+             # of the content of this token from the word. The result of the `word`
+             # variable will be the content of the UNK token.
+             # NOTE: This is a bit hacky and not bulletproof. For instance, if the
+             # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
+             # "19", "50"], then the result will be 2519 instead of 1925. This
+             # happens almost never, however, so we can live with it.
+             if possible_unk_token != tokenizer.unk_token:
+                 word = word.replace(possible_unk_token, "", 1)
+
+         # Replace the token with the word
+         tokens[tok_idx] = word
+
+     return tokens
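
For context (not part of the package diff): the caveat in the NOTE comment above can be replayed without a real tokenizer. The sketch below assumes a hypothetical tokenizer that maps "1925" to the UNK token and shows why stripping the non-UNK pieces from the word recovers "2519" rather than "1925".

# Hypothetical tokenization of "1925-1950" where "1925" became the UNK token.
word = "1925-1950"
tokens_with_unk = ["[UNK]", "-", "19", "50"]
unk_token = "[UNK]"

# Same recovery strategy as in handle_unk_tokens: strip the first occurrence of
# every non-UNK piece from the word; whatever is left is taken to be the UNK
# token's content.
for piece in tokens_with_unk:
    if piece != unk_token:
        word = word.replace(piece, "", 1)

print(word)  # -> "2519", not "1925": the edge case the NOTE describes
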
euroeval/tasks.py ADDED
@@ -0,0 +1,202 @@
+ """All benchmarks tasks used in EuroEval."""
+
+ from .data_models import MetricConfig, Task
+ from .enums import TaskGroup
+
+
+ def get_all_tasks() -> dict[str, Task]:
+     """Get a list of all the dataset tasks.
+
+     Returns:
+         A mapping between names of dataset tasks and their configurations.
+     """
+     return {cfg.name: cfg for cfg in globals().values() if isinstance(cfg, Task)}
+
+
+ LA = Task(
+     name="linguistic-acceptability",
+     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+     metrics=[
+         MetricConfig(
+             name="mcc",
+             pretty_name="Matthew's Correlation Coefficient",
+             huggingface_id="matthews_correlation",
+             results_key="matthews_correlation",
+         ),
+         MetricConfig(
+             name="macro_f1",
+             pretty_name="Macro-average F1-score",
+             huggingface_id="f1",
+             results_key="f1",
+             compute_kwargs=dict(average="macro"),
+         ),
+     ],
+ )
+
+
+ NER = Task(
+     name="named-entity-recognition",
+     task_group=TaskGroup.TOKEN_CLASSIFICATION,
+     metrics=[
+         MetricConfig(
+             name="micro_f1_no_misc",
+             pretty_name="Micro-average F1-score without MISC tags",
+             huggingface_id="seqeval",
+             results_key="overall_f1",
+         ),
+         MetricConfig(
+             name="micro_f1",
+             pretty_name="Micro-average F1-score with MISC tags",
+             huggingface_id="seqeval",
+             results_key="overall_f1",
+         ),
+     ],
+ )
+
+
+ RC = Task(
+     name="reading-comprehension",
+     task_group=TaskGroup.QUESTION_ANSWERING,
+     metrics=[
+         MetricConfig(
+             name="f1",
+             pretty_name="F1-score",
+             huggingface_id="squad_v2",
+             results_key="f1",
+             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
+         ),
+         MetricConfig(
+             name="em",
+             pretty_name="Exact Match",
+             huggingface_id="squad_v2",
+             results_key="exact",
+             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
+         ),
+     ],
+ )
+
+
+ SENT = Task(
+     name="sentiment-classification",
+     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+     metrics=[
+         MetricConfig(
+             name="mcc",
+             pretty_name="Matthew's Correlation Coefficient",
+             huggingface_id="matthews_correlation",
+             results_key="matthews_correlation",
+         ),
+         MetricConfig(
+             name="macro_f1",
+             pretty_name="Macro-average F1-score",
+             huggingface_id="f1",
+             results_key="f1",
+             compute_kwargs=dict(average="macro"),
+         ),
+     ],
+ )
+
+
+ SUMM = Task(
+     name="summarization",
+     task_group=TaskGroup.TEXT_TO_TEXT,
+     metrics=[
+         MetricConfig(
+             name="bertscore",
+             pretty_name="BERTScore",
+             huggingface_id="bertscore",
+             results_key="f1",
+             compute_kwargs=dict(
+                 model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=32
+             ),
+         ),
+         MetricConfig(
+             name="rouge_l",
+             pretty_name="ROUGE-L",
+             huggingface_id="rouge",
+             results_key="rougeL",
+         ),
+     ],
+ )
+
+
+ KNOW = Task(
+     name="knowledge",
+     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     metrics=[
+         MetricConfig(
+             name="mcc",
+             pretty_name="Matthew's Correlation Coefficient",
+             huggingface_id="matthews_correlation",
+             results_key="matthews_correlation",
+         ),
+         MetricConfig(
+             name="accuracy",
+             pretty_name="Accuracy",
+             huggingface_id="accuracy",
+             results_key="accuracy",
+         ),
+     ],
+ )
+
+
+ MCRC = Task(
+     name="multiple-choice-reading-comprehension",
+     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     metrics=[
+         MetricConfig(
+             name="mcc",
+             pretty_name="Matthew's Correlation Coefficient",
+             huggingface_id="matthews_correlation",
+             results_key="matthews_correlation",
+         ),
+         MetricConfig(
+             name="accuracy",
+             pretty_name="Accuracy",
+             huggingface_id="accuracy",
+             results_key="accuracy",
+         ),
+     ],
+ )
+
+
+ COMMON_SENSE = Task(
+     name="common-sense-reasoning",
+     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     metrics=[
+         MetricConfig(
+             name="mcc",
+             pretty_name="Matthew's Correlation Coefficient",
+             huggingface_id="matthews_correlation",
+             results_key="matthews_correlation",
+         ),
+         MetricConfig(
+             name="accuracy",
+             pretty_name="Accuracy",
+             huggingface_id="accuracy",
+             results_key="accuracy",
+         ),
+     ],
+ )
+
+
+ SPEED = Task(
+     name="speed",
+     task_group=TaskGroup.SPEED,
+     metrics=[
+         MetricConfig(
+             name="speed",
+             pretty_name="Tokens per second",
+             huggingface_id="",
+             results_key="speed",
+             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
+         ),
+         MetricConfig(
+             name="speed_short",
+             pretty_name="Tokens per second on short documents",
+             huggingface_id="",
+             results_key="speed",
+             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
+         ),
+     ],
+ )
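
For context (not part of the package diff): a minimal sketch of how the module-level Task objects above can be enumerated through get_all_tasks, assuming the wheel is installed.

from euroeval.tasks import get_all_tasks

# get_all_tasks picks up every Task defined at module level in euroeval.tasks,
# keyed by its name, e.g. "named-entity-recognition" or "speed".
for name, task in get_all_tasks().items():
    metric_names = ", ".join(metric.name for metric in task.metrics)
    print(f"{name}: {metric_names}")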