EuroEval 15.7.0__py3-none-any.whl → 15.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

@@ -238,7 +238,7 @@ def prepare_languages(
             The default language codes of the languages to include.
 
     Returns:
-        The prepared model or dataset languages.
+        The prepared dataset languages.
     """
     # Create a dictionary that maps languages to their associated language objects
     language_mapping = get_all_languages()
@@ -1,11 +1,8 @@
 """Generative models from an inference API, using the LiteLLM framework."""
 
 import collections.abc as c
-import itertools as it
-import json
 import logging
 import os
-import random
 import re
 import typing as t
 from functools import cached_property, partial
@@ -60,6 +57,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -943,14 +941,22 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         if self.benchmark_config.few_shot:
-            few_shot_examples = self._extract_few_shot_examples(
-                dataset=dataset, task=task, itr_idx=itr_idx
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()
 
         dataset["test"] = dataset["test"].map(
-            partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=True,
+                always_populate_text_field=False,
+                tokenizer=None,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
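
The refactored call site above delegates to shared helpers in `..generation_utils`, whose definitions are not part of this diff. As a rough orientation only, here is a minimal sketch of the signature that `apply_prompt` would need to satisfy, inferred solely from the keyword arguments bound via `partial` above; the actual helper in EuroEval's `generation_utils` module may differ:

import typing as t


def apply_prompt(
    examples: dict[str, t.Any],
    few_shot_examples: list[dict[str, t.Any]],
    model_config: t.Any,
    dataset_config: t.Any,
    instruction_model: bool,
    always_populate_text_field: bool,
    tokenizer: t.Any,
) -> dict[str, t.Any]:
    """Hypothetical stub: build prompts/chat messages for a batch of examples."""
    ...

Since `datasets.Dataset.map(..., batched=True)` passes each batch as the first positional argument, only the remaining parameters need to be bound with `functools.partial` at the call site.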
@@ -958,253 +964,6 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-            return prompt, label
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        few_shot_messages = [
-            dict(role=role, content=content)
-            for prompt, label in few_shot_sections
-            for role, content in [("user", prompt), ("assistant", label)]
-        ]
-
-        messages_list = [
-            few_shot_messages + [dict(role="user", content=prompt)]
-            for prompt, _ in new_sections
-        ]
-
-        examples["messages"] = messages_list
-        return examples
-
 
 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
@@ -1248,6 +1007,10 @@ def try_download_ollama_model(model_id: str) -> bool:
 
     Returns:
         Whether the model was downloaded successfully.
+
+    Raises:
+        InvalidModel:
+            If Ollama is not running or the model cannot be downloaded.
     """
    if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
         return False
@@ -1262,11 +1025,17 @@ def try_download_ollama_model(model_id: str) -> bool:
             level=logging.WARNING,
         )
 
-    downloaded_ollama_models: list[str] = [
-        model_obj.model
-        for model_obj in ollama.list().models
-        if model_obj.model is not None
-    ]
+    try:
+        downloaded_ollama_models: list[str] = [
+            model_obj.model
+            for model_obj in ollama.list().models
+            if model_obj.model is not None
+        ]
+    except ConnectionError:
+        raise InvalidModel(
+            "Ollama does not seem to be running, so we cannot evaluate the model "
+            f"{model_id!r}. Please make sure that Ollama is running and try again."
+        )
 
     ollama_model_id = "/".join(model_id.split("/")[1:])
     if ollama_model_id not in downloaded_ollama_models:
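
The guard added above can be sketched in isolation as follows. The helper name below is hypothetical and `RuntimeError` stands in for EuroEval's own `InvalidModel`; like the change itself, it assumes that an unreachable Ollama daemon surfaces as a `ConnectionError`:

import ollama


def list_local_ollama_models() -> list[str]:
    """Hypothetical helper: return the names of locally downloaded Ollama models."""
    try:
        # ollama.list() queries the local Ollama daemon for downloaded models.
        return [m.model for m in ollama.list().models if m.model is not None]
    except ConnectionError as exc:
        raise RuntimeError(
            "Ollama does not seem to be running; start the daemon and try again."
        ) from exc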