eval_framework-0.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +177 -0
  5. eval_framework/context/eval.py +121 -0
  6. eval_framework/context/local.py +78 -0
  7. eval_framework/evaluation_generator.py +234 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +432 -0
  16. eval_framework/llm/base.py +180 -0
  17. eval_framework/llm/huggingface.py +418 -0
  18. eval_framework/llm/mistral.py +88 -0
  19. eval_framework/llm/models.py +28 -0
  20. eval_framework/llm/openai.py +400 -0
  21. eval_framework/llm/vllm.py +554 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +166 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/aidanbench.py +28 -0
  29. eval_framework/metrics/completion/bleu.py +76 -0
  30. eval_framework/metrics/completion/chrf.py +62 -0
  31. eval_framework/metrics/completion/code_assertion.py +44 -0
  32. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  33. eval_framework/metrics/completion/comet.py +56 -0
  34. eval_framework/metrics/completion/concordance_index.py +38 -0
  35. eval_framework/metrics/completion/csv_format.py +102 -0
  36. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  37. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  38. eval_framework/metrics/completion/f1.py +42 -0
  39. eval_framework/metrics/completion/format_checker.py +56 -0
  40. eval_framework/metrics/completion/grid_difference.py +77 -0
  41. eval_framework/metrics/completion/ifeval.py +73 -0
  42. eval_framework/metrics/completion/json_format.py +179 -0
  43. eval_framework/metrics/completion/language_checker.py +74 -0
  44. eval_framework/metrics/completion/length_control.py +83 -0
  45. eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
  46. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  47. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  48. eval_framework/metrics/completion/repetition.py +88 -0
  49. eval_framework/metrics/completion/rouge_1.py +35 -0
  50. eval_framework/metrics/completion/rouge_2.py +45 -0
  51. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  52. eval_framework/metrics/completion/rouge_l.py +52 -0
  53. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  54. eval_framework/metrics/completion/ter.py +67 -0
  55. eval_framework/metrics/completion/text_counter.py +182 -0
  56. eval_framework/metrics/efficiency/__init__.py +0 -0
  57. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  58. eval_framework/metrics/llm/__init__.py +0 -0
  59. eval_framework/metrics/llm/base.py +34 -0
  60. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  61. eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
  62. eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
  63. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  64. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  65. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  66. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  67. eval_framework/metrics/llm/graders/language.py +56 -0
  68. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  69. eval_framework/metrics/llm/graders/models.py +74 -0
  70. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  71. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  72. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  73. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  74. eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
  75. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  76. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  77. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  78. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  79. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  80. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
  81. eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
  82. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  83. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  84. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  85. eval_framework/metrics/llm/utils.py +20 -0
  86. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  87. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  88. eval_framework/metrics/loglikelihood/base.py +50 -0
  89. eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
  90. eval_framework/metrics/loglikelihood/dcs.py +43 -0
  91. eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
  92. eval_framework/metrics/loglikelihood/ternary.py +42 -0
  93. eval_framework/py.typed +0 -0
  94. eval_framework/response_generator.py +351 -0
  95. eval_framework/result_processors/__init__.py +0 -0
  96. eval_framework/result_processors/base.py +88 -0
  97. eval_framework/result_processors/hf_uploader.py +75 -0
  98. eval_framework/result_processors/result_processor.py +129 -0
  99. eval_framework/result_processors/wandb_uploader.py +137 -0
  100. eval_framework/run.py +369 -0
  101. eval_framework/run_direct.py +42 -0
  102. eval_framework/shared/types.py +227 -0
  103. eval_framework/tasks/__init__.py +6 -0
  104. eval_framework/tasks/base.py +392 -0
  105. eval_framework/tasks/benchmarks/__init__.py +0 -0
  106. eval_framework/tasks/benchmarks/aidanbench.py +211 -0
  107. eval_framework/tasks/benchmarks/arc.py +70 -0
  108. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  109. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  110. eval_framework/tasks/benchmarks/belebele.py +60 -0
  111. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  112. eval_framework/tasks/benchmarks/casehold.py +47 -0
  113. eval_framework/tasks/benchmarks/chembench.py +85 -0
  114. eval_framework/tasks/benchmarks/copa.py +64 -0
  115. eval_framework/tasks/benchmarks/duc.py +91 -0
  116. eval_framework/tasks/benchmarks/flores200.py +133 -0
  117. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  118. eval_framework/tasks/benchmarks/gpqa.py +201 -0
  119. eval_framework/tasks/benchmarks/gsm8k.py +150 -0
  120. eval_framework/tasks/benchmarks/hellaswag.py +69 -0
  121. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  122. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  123. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  124. eval_framework/tasks/benchmarks/include.py +119 -0
  125. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  126. eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
  127. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  128. eval_framework/tasks/benchmarks/mmlu.py +215 -0
  129. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  130. eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
  131. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  132. eval_framework/tasks/benchmarks/openbookqa.py +85 -0
  133. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  134. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  135. eval_framework/tasks/benchmarks/piqa.py +64 -0
  136. eval_framework/tasks/benchmarks/quality.py +56 -0
  137. eval_framework/tasks/benchmarks/sciq.py +110 -0
  138. eval_framework/tasks/benchmarks/sphyr.py +79 -0
  139. eval_framework/tasks/benchmarks/squad.py +211 -0
  140. eval_framework/tasks/benchmarks/struct_eval.py +116 -0
  141. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  142. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  143. eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
  144. eval_framework/tasks/benchmarks/winogender.py +64 -0
  145. eval_framework/tasks/benchmarks/winogrande.py +69 -0
  146. eval_framework/tasks/benchmarks/winox.py +57 -0
  147. eval_framework/tasks/benchmarks/wmt.py +160 -0
  148. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  149. eval_framework/tasks/eval_config.py +136 -0
  150. eval_framework/tasks/perturbation.py +83 -0
  151. eval_framework/tasks/registry.py +186 -0
  152. eval_framework/tasks/task_loader.py +81 -0
  153. eval_framework/tasks/task_names.py +324 -0
  154. eval_framework/tasks/utils.py +584 -0
  155. eval_framework/utils/constants.py +9 -0
  156. eval_framework/utils/file_ops.py +245 -0
  157. eval_framework/utils/generate_task_docs.py +244 -0
  158. eval_framework/utils/helpers.py +32 -0
  159. eval_framework/utils/logging.py +62 -0
  160. eval_framework/utils/packaging.py +52 -0
  161. eval_framework/utils/tqdm_handler.py +14 -0
  162. eval_framework-0.2.7.dist-info/METADATA +548 -0
  163. eval_framework-0.2.7.dist-info/RECORD +170 -0
  164. eval_framework-0.2.7.dist-info/WHEEL +4 -0
  165. eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
  166. template_formatting/README.md +83 -0
  167. template_formatting/__init__.py +0 -0
  168. template_formatting/formatter.py +537 -0
  169. template_formatting/mistral_formatter.py +159 -0
  170. template_formatting/py.typed +0 -0
eval_framework/tasks/benchmarks/gsm8k.py
@@ -0,0 +1,150 @@
+ import re
+ from typing import Any
+
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+
+ ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
+
+ # Predefined fewshot examples
+ FEWSHOT_ITEMS = [
+     {
+         "question": (
+             "There are 15 trees in the grove. Grove workers will plant trees in the grove today. "
+             "After they are done, there will be 21 trees. "
+             "How many trees did the grove workers plant today?"
+         ),
+         "answer": (
+             "There are 15 trees originally. Then there were 21 trees after some more were planted. "
+             "So there must have been 21 - 15 = 6.\n#### 6"
+         ),
+     },
+     {
+         "question": (
+             "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?"
+         ),
+         "answer": "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5.\n#### 5",
+     },
+     {
+         "question": (
+             "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?"
+         ),
+         "answer": (
+             "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. "
+             "After eating 35, they had 74 - 35 = 39.\n#### 39"
+         ),
+     },
+     {
+         "question": (
+             "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. "
+             "How many lollipops did Jason give to Denny?"
+         ),
+         "answer": (
+             "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. "
+             "So he gave Denny 20 - 12 = 8.\n#### 8"
+         ),
+     },
+     {
+         "question": (
+             "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
+             "How many toys does he have now?"
+         ),
+         "answer": (
+             "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. "
+             "5 + 4 = 9.\n#### 9"
+         ),
+     },
+     {
+         "question": (
+             "There were nine computers in the server room. Five more computers were installed each day, "
+             "from monday to thursday. "
+             "How many computers are now in the server room?"
+         ),
+         "answer": (
+             "There were originally 9 computers. For each of 4 days, 5 more computers were "
+             "added. So 5 * 4 = 20 computers were added. 9 + 20 is 29.\n#### 29"
+         ),
+     },
+     {
+         "question": (
+             "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. "
+             "How many golf balls did he have at the end of wednesday?"
+         ),
+         "answer": (
+             "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. "
+             "After losing 2 more, he had 35 - 2 = 33 golf balls.\n#### 33"
+         ),
+     },
+     {
+         "question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
+         "answer": (
+             "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. "
+             "So she has 23 - 15 dollars left. 23 - 15 is 8.\n#### 8"
+         ),
+     },
+ ]
+
+
+ class GSM8KEvalHarness(BaseTask[str]):
+     """GSM8K dataset: https://huggingface.co/datasets/openai/gsm8k
+     This version uses samples from the train split as fewshot examples.
+     """
+
+     NAME = "GSM8KEvalHarness"
+     DATASET_PATH = "gsm8k"
+     SAMPLE_SPLIT = "test"
+     FEWSHOT_SPLIT = "train"
+     RESPONSE_TYPE = ResponseType.COMPLETION
+     METRICS = [AccuracyCompletion]
+     SUBJECTS = ["main"]
+     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
+     LANGUAGE = Language.ENG
+
+     def __init__(self, num_fewshot: int = 0) -> None:
+         super().__init__(num_fewshot)
+
+         # stop sequence per the "until" field of https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
+         self.stop_sequences: list[str] = ["Question:"]
+         self.max_tokens = 1600
+
+     def _extract_answer(self, completion: str) -> str:
+         match = ANS_RE.search(completion)
+         if match:
+             match_str = match.group(1).strip()
+             match_str = match_str.replace(",", "")
+             return match_str
+         else:
+             return "[invalid]"
+
+     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+         for stop_sequence in self.stop_sequences:
+             if stop_sequence in completion_text:
+                 completion_text = completion_text.split(stop_sequence)[0]
+         return self._extract_answer(completion_text)
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         return f"Question: {item['question']}\nAnswer:"
+
+     def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+         return f" {item['answer']}"
+
+     def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+         return self._extract_answer(item["answer"])
+
+
+ class GSM8K(GSM8KEvalHarness):
+     NAME = "GSM8K"
+     FEWSHOT_SPLIT = ""  # empty since the predefined examples above are used instead
+
+     def __init__(self, num_fewshot: int = 0) -> None:
+         assert num_fewshot <= len(FEWSHOT_ITEMS), f"Fewshot larger than {len(FEWSHOT_ITEMS)} is not supported for GSM8K"
+         super().__init__(num_fewshot)
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         # Remove the bracketed computations from the question
+         question = re.sub(r"<<.*?>>", "", item["question"])
+         return f"Question: {question}\nAnswer:"
+
+     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+         """Override to use predefined fewshot examples instead of sampling from dataset"""
+         return FEWSHOT_ITEMS[: self.num_fewshot]
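For orientation, a minimal standalone sketch (not part of the package; the completion text is invented) of how the stop-sequence trimming and ANS_RE extraction above turn a raw model completion into a comparable answer string:

import re

ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")

def extract_answer(completion: str) -> str:
    # Trim at the first stop sequence, as post_process_generated_completion does.
    completion = completion.split("Question:")[0]
    match = ANS_RE.search(completion)
    if match:
        # Strip whitespace and thousands separators so "1,200" compares as "1200".
        return match.group(1).strip().replace(",", "")
    return "[invalid]"

raw = "She sold 4 + 8 = 12 clips.\n#### 1,200\nQuestion: next problem..."
print(extract_answer(raw))  # -> 1200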
eval_framework/tasks/benchmarks/hellaswag.py
@@ -0,0 +1,69 @@
+ import re
+ from typing import Any
+
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+     AccuracyLoglikelihood,
+     AccuracyNormLoglikelihood,
+ )
+ from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
+ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
+ from eval_framework.metrics.loglikelihood.ternary import TernaryScore
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
+
+
+ class HELLASWAG(BaseTask[str]):
+     """HellaSwag dataset: https://huggingface.co/datasets/Rowan/hellaswag
+     available dataset splits: train, validation, test"""
+
+     NAME = "HellaSwag"
+     DATASET_PATH = "Rowan/hellaswag"
+     SAMPLE_SPLIT = "validation"
+     FEWSHOT_SPLIT = "train"
+     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+     SUBJECTS = [NO_SUBJECT]
+     LANGUAGE = Language.ENG
+
+     @staticmethod
+     def _preprocess(prompt: str) -> str:
+         # remove bracketed text and collapse double spaces
+         prompt = prompt.strip()
+         prompt = prompt.replace(" [title]", ". ")
+         prompt = re.sub("\\[.*?\\]", "", prompt)
+         prompt = prompt.replace("  ", " ")
+         return prompt
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         subject = self._preprocess(item["activity_label"])
+         question = self._preprocess(item["ctx_a"] + " " + item["ctx_b"].capitalize()).strip()
+         return f"{subject}: {question}"
+
+     def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+         ground_truth_index = int(item["label"] if item["label"] != "" else 0)
+         choices = [self._preprocess(ending) for ending in item["endings"]]
+         return f" {choices[ground_truth_index]}"
+
+     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+         return [f" {self._preprocess(ending)}" for ending in item["endings"]]
+
+
+ class HELLASWAG_IDK(HELLASWAG):
+     NAME = "HellaSwag_IDK"
+     METRICS = [
+         AccuracyLoglikelihood,
+         AccuracyNormLoglikelihood,
+         ConfidenceWeightedAccuracy,
+         DistributionalCorrectnessScore,
+         TernaryScore,
+     ]
+
+     def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+         return (
+             "Complete the sentence only if you are confident, since mistakes may be penalised, while correct "
+             "completions receive points. It is acceptable to answer with 'I do not know' if you are unsure, "
+             "and you will receive 0 points."
+         )
+
+     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+         completions = super()._get_possible_completions(item)
+         return (completions or []) + [" I do not know."]
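A standalone sketch of the `_preprocess` cleanup above, run on an invented wikiHow-style context; the bracket tags mimic the markup found in the HellaSwag source data:

import re

def preprocess(prompt: str) -> str:
    # Same steps as HELLASWAG._preprocess: strip, rewrite " [title]",
    # drop remaining bracket tags, collapse the double spaces left behind.
    prompt = prompt.strip()
    prompt = prompt.replace(" [title]", ". ")
    prompt = re.sub("\\[.*?\\]", "", prompt)
    prompt = prompt.replace("  ", " ")
    return prompt

print(preprocess("How to make tea [title] Boil water. [step] Add the leaves."))
# -> "How to make tea. Boil water. Add the leaves."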
eval_framework/tasks/benchmarks/hellaswag_de.py
@@ -0,0 +1,52 @@
+ import re
+ from typing import Any
+
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+     AccuracyLoglikelihood,
+     AccuracyNormLoglikelihood,
+ )
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
+
+
+ class HELLASWAG_DE(BaseTask[str]):
+     """HellaSwag dataset: https://huggingface.co/datasets/LeoLM/HellaSwag_de
+     available dataset splits: train (1k rows), validation (10k rows)"""
+
+     NAME = "HellaSwag German"
+     DATASET_PATH = "LeoLM/HellaSwag_de"
+     SAMPLE_SPLIT = "validation"
+     FEWSHOT_SPLIT = "train"
+     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+     SUBJECTS = [NO_SUBJECT]
+     LANGUAGE = Language.DEU
+
+     @staticmethod
+     def _preprocess(prompt: str) -> str:
+         # remove bracketed text and collapse double spaces
+         prompt = prompt.strip()
+         prompt = prompt.replace(" [title]", ". ")
+         prompt = re.sub("\\[.*?\\]", "", prompt)
+         prompt = prompt.replace("  ", " ")
+         return prompt
+
+     def _load_dataset(self, subject: str) -> None:
+         super()._load_dataset(subject)
+         new_dataset = {}
+         for split, items in self.dataset.items():
+             # in the validation split, only 10035 of the 10042 items are fully translated
+             new_dataset[split] = [item for item in items if len(item["endings_de"]) == len(item["endings"])]
+         self.dataset = new_dataset
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         subject = self._preprocess(item["activity_label_de"])
+         question = self._preprocess(item["ctx_de"]).strip()
+         return f"{subject}: {question}"
+
+     def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+         ground_truth_index = int(item["label"] if item["label"] != "" else 0)
+         choices = [self._preprocess(ending) for ending in item["endings_de"]]
+         return f" {choices[ground_truth_index]}"
+
+     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+         return [f" {self._preprocess(ending)}" for ending in item["endings_de"]]
eval_framework/tasks/benchmarks/humaneval.py
@@ -0,0 +1,97 @@
+ from typing import Any
+
+ from eval_framework.metrics.completion.code_assertion import CodeCompletionAssertion
+ from eval_framework.shared.types import BaseMetricContext
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample
+
+ CODE_TO_EXECUTE = """
+ {start_of_code}
+ {completion_text}
+ {test_code}
+ try:
+     check({entry_point})
+     print(True)
+ except Exception as e:
+     print(e)
+     print(False)
+ """
+
+
+ class HumanEvalMetricContext(BaseMetricContext):
+     test: str
+     entry_point: str
+     prompt: str
+
+
+ class HumanEval(BaseTask[str]):
+     """HumanEval dataset: https://huggingface.co/datasets/openai/openai_humaneval/"""
+
+     NAME = "Human Eval"
+     DATASET_PATH = "openai/openai_humaneval"
+     SAMPLE_SPLIT = "test"
+     FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
+     RESPONSE_TYPE = ResponseType.COMPLETION
+     METRICS = [CodeCompletionAssertion]
+     SUBJECTS = [NO_SUBJECT]
+     LANGUAGE = Language.ENG
+
+     def __init__(self, num_fewshot: int = 0) -> None:
+         super().__init__(num_fewshot)
+         self.stop_sequences: list[str] = ["```"]
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         return f"```python\n{item['prompt'].lstrip()}"
+
+     def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+         return "Success"
+
+     def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+         return item["canonical_solution"]
+
+     def _get_context(self, item: dict[str, Any]) -> HumanEvalMetricContext:
+         return HumanEvalMetricContext(
+             test=item["test"],
+             entry_point=item["entry_point"],
+             prompt=item["prompt"],
+         )
+
+     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+         assert sample is not None and sample.context is not None
+         assert isinstance(sample.context, HumanEvalMetricContext), "Expected HumanEvalMetricContext"
+         context = sample.context
+
+         for stop_sequence in self.stop_sequences:
+             if stop_sequence in completion_text:
+                 completion_text = completion_text.split(stop_sequence)[0]
+
+         entry_point = context.entry_point
+         test_code = context.test
+         start_of_code = context.prompt
+         formatted_code = CODE_TO_EXECUTE.format(
+             start_of_code=start_of_code,
+             completion_text=completion_text,
+             test_code=test_code,
+             entry_point=entry_point,
+         )
+
+         return formatted_code
+
+
+ class HumanEvalInstruct(HumanEval):
+     # See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
+     NAME = "Human Eval Instruct"
+     CUE_PREFIX = "Here is the completed function:\n```python\n"
+
+     def __init__(self, num_fewshot: int = 0) -> None:
+         assert num_fewshot == 0, "Fewshot is not supported for Human Eval Instruct"
+         super().__init__(num_fewshot)
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         instruction_text = (
+             "Write a solution to the following problem and make sure that "
+             f"it passes the tests:\n```python\n{item['prompt'].lstrip()}"
+         )
+         return instruction_text
+
+     def _get_cue_text(self, item: dict[str, Any]) -> str:
+         return self.CUE_PREFIX + item["prompt"].lstrip()
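To make the post-processing concrete, here is a sketch that assembles `CODE_TO_EXECUTE` for an invented toy problem; the real metric runs the assembled source in a sandboxed subprocess rather than via a bare `exec`:

CODE_TO_EXECUTE = """
{start_of_code}
{completion_text}
{test_code}
try:
    check({entry_point})
    print(True)
except Exception as e:
    print(e)
    print(False)
"""

prompt = "def add(a, b):\n"        # plays the role of item["prompt"]
completion = "    return a + b\n"  # model completion after "```" trimming
test = "def check(candidate):\n    assert candidate(1, 2) == 3\n"

code = CODE_TO_EXECUTE.format(
    start_of_code=prompt, completion_text=completion, test_code=test, entry_point="add"
)
exec(code)  # prints True when the assertion in check() passes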
eval_framework/tasks/benchmarks/ifeval.py
@@ -0,0 +1,78 @@
+ from typing import Any
+
+ from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
+
+
+ class IFEval(BaseTask[str]):
+     """IFEval: Instruction Following Eval (https://arxiv.org/pdf/2311.07911)."""
+
+     NAME = "IFEval"
+     DATASET_PATH = "google/IFEval"
+     SAMPLE_SPLIT = "train"
+     FEWSHOT_SPLIT = "train"
+     RESPONSE_TYPE = ResponseType.COMPLETION
+     METRICS = [IFEvalMetric]
+     SUBJECTS = [NO_SUBJECT]
+     LANGUAGE = {NO_SUBJECT: Language.ENG}
+
+     def __init__(self, num_fewshot: int = 0) -> None:
+         super().__init__(num_fewshot)
+         assert num_fewshot == 0, "IFEval does not support few-shot prompting."
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         return item["prompt"]
+
+     def _get_context(self, item: dict[str, Any]) -> IFEvalMetricContext:
+         assert "key" in item, "Expected 'key' in item"
+         assert "instruction_id_list" in item, "Expected 'instruction_id_list' in item"
+         assert "prompt" in item, "Expected 'prompt' in item"
+         assert "kwargs" in item, "Expected 'kwargs' in item"
+
+         new_kwargs = []
+         for d in item["kwargs"]:
+             # fixing undesired float fields in the dataset (the values are whole numbers)
+             assert all(abs(v - int(v)) < 1e-5 for v in d.values() if isinstance(v, float))
+             new_kwargs.append({k: v if not isinstance(v, float) else int(v) for k, v in d.items()})
+
+         # fixing changes to the HF dataset done on Apr 10 2025
+         if item["key"] == 142:
+             new_kwargs[2]["relation"] = None
+             new_kwargs[2]["frequency"] = None
+             new_kwargs[2]["keywords"] = new_kwargs[2]["keyword"]
+             del new_kwargs[2]["keyword"]
+         if item["key"] == 1512:
+             new_kwargs[0]["relation"] = None
+
+         item["kwargs"] = new_kwargs
+
+         return IFEvalMetricContext(
+             key=item["key"],
+             instruction_id_list=item["instruction_id_list"],
+             prompt=item["prompt"],
+             additional_kwargs=item["kwargs"],
+         )
+
+     def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+         return None
+
+     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+         return []
+
+
+ class IFEvalFiSv(IFEval):
+     """Machine-translated versions of the Instruction Following Evaluation (IFEval) benchmark."""
+
+     NAME = "IFEval Finnish & Swedish"
+     DATASET_PATH = "LumiOpen/ifeval_mt"
+     SUBJECTS = ["fi", "sv"]
+     LANGUAGE = {"fi": Language.FIN, "sv": Language.SWE}
+
+
+ class IFEvalDe(IFEval):
+     """German version of the Instruction Following Evaluation (IFEval) benchmark."""
+
+     NAME = "IFEval German"
+     DATASET_PATH = "jzhang86/de_ifeval"
+     SUBJECTS = [NO_SUBJECT]
+     LANGUAGE = {NO_SUBJECT: Language.DEU}
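A toy sketch of the kwargs normalization in `_get_context` above: float-valued fields that really encode whole numbers are cast back to int before being handed to the metric (the data here is invented):

kwargs = [{"num_placeholders": 2.0, "relation": "at least"}]

new_kwargs = []
for d in kwargs:
    # every float value must be (numerically) a whole number
    assert all(abs(v - int(v)) < 1e-5 for v in d.values() if isinstance(v, float))
    new_kwargs.append({k: int(v) if isinstance(v, float) else v for k, v in d.items()})

print(new_kwargs)  # -> [{'num_placeholders': 2, 'relation': 'at least'}]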
eval_framework/tasks/benchmarks/include.py
@@ -0,0 +1,119 @@
+ from typing import Any
+
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+     AccuracyLoglikelihood,
+     AccuracyNormLoglikelihood,
+ )
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType
+ from eval_framework.tasks.utils import get_n_letters
+
+ INCLUDE_SUBJECTS = [
+     "Albanian",
+     "Arabic",
+     "Armenian",
+     "Azerbaijani",
+     "Basque",
+     "Belarusian",
+     "Bengali",
+     "Bulgarian",
+     "Chinese",
+     "Croatian",
+     "Dutch",
+     "Estonian",
+     "Finnish",
+     "French",
+     "Georgian",
+     "German",
+     "Greek",
+     "Hebrew",
+     "Hindi",
+     "Hungarian",
+     "Indonesian",
+     "Italian",
+     "Japanese",
+     "Kazakh",
+     "Korean",
+     "Lithuanian",
+     "Malay",
+     "Malayalam",
+     "Nepali",
+     "North Macedonian",
+     "Persian",
+     "Polish",
+     "Portuguese",
+     "Russian",
+     "Serbian",
+     "Spanish",
+     "Tagalog",
+     "Tamil",
+     "Telugu",
+     "Turkish",
+     "Ukrainian",
+     "Urdu",
+     "Uzbek",
+     "Vietnamese",
+ ]
+
+
+ def subject_to_language(subject: str) -> Language:
+     if subject == "Greek":
+         return Language.ELL  # type: ignore[attr-defined]
+     elif subject == "Malay":
+         return Language.MSA  # type: ignore[attr-defined]
+     elif subject == "Nepali":
+         return Language.NEP  # type: ignore[attr-defined]
+     elif subject == "North Macedonian":
+         return Language.MKD  # type: ignore[attr-defined]
+     elif subject == "Croatian":
+         return Language.HRV  # type: ignore[attr-defined]
+     elif subject == "Serbian":
+         return Language.SRP  # type: ignore[attr-defined]
+     else:
+         return Language(subject)
+
+
+ class INCLUDE(BaseTask[str]):
+     """INCLUDE dataset: https://huggingface.co/datasets/CohereLabs/include-base-44"""
+
+     NAME = "INCLUDE"
+     DATASET_PATH = "CohereLabs/include-base-44"
+     SAMPLE_SPLIT = "test"
+     FEWSHOT_SPLIT = "validation"
+     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+     METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+     SUBJECTS = INCLUDE_SUBJECTS
+     LANGUAGE = {lang: subject_to_language(lang) for lang in INCLUDE_SUBJECTS}
+
+     def __init__(self, num_fewshot: int = 0) -> None:
+         super().__init__(num_fewshot)
+
+         self.keys = get_n_letters(4)
+
+     def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+         return f"The following are multiple choice questions (with answers) in {item['language']}."  # noqa: E501
+
+     def _get_instruction_text(self, item: dict[str, Any]) -> str:
+         question = item["question"].strip()
+         choices = "".join(
+             [
+                 f"{key}. {choice}\n"
+                 for key, choice in zip(
+                     self.keys, [item["option_a"], item["option_b"], item["option_c"], item["option_d"]]
+                 )
+             ]
+         )
+         return f"Question: {question}\n{choices}"
+
+     def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+         ground_truth = self._get_ground_truth(item)
+         assert ground_truth is not None
+         return f"{self._get_cue_text(item)}{ground_truth}"
+
+     def _get_cue_text(self, item: dict[str, Any]) -> str:
+         return "Answer:"
+
+     def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+         return f" {self.keys[item['answer']]}"
+
+     def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+         return [f" {key}" for key in self.keys]
+ return [f" {key}" for key in self.keys]