eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,177 @@
1
+ import hashlib
2
+ import logging
3
+ import random
4
+ import re
5
+ from typing import Any
6
+
7
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
8
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
9
+ AccuracyLoglikelihood,
10
+ AccuracyNormLoglikelihood,
11
+ )
12
+ from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
13
+ from eval_framework.tasks.utils import get_n_letters
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class GPQA(BaseTask[str]):
    """GPQA dataset: https://huggingface.co/datasets/Idavidrein/gpqa

    Multiple-choice task scored with log-likelihoods: the three incorrect answers and the
    correct answer are shuffled deterministically per item and labelled with the letter keys
    returned by ``get_n_letters(4)``.
    """

    NAME = "GPQA"
    DATASET_PATH = "Idavidrein/gpqa"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["gpqa_extended"]  # ["gpqa_diamond", "gpqa_extended", "gpqa_main", "gpqa_experts"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.stop_sequences = ["Question:"]
        self.keys = get_n_letters(4)
        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
        # Dedicated RNG for answer ordering; it is re-seeded per item in
        # _get_possible_completions_marked, so the initial seed only matters before first use.
        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the HF dataset for ``subject`` and store the filtered splits in ``self.dataset``.

        The sample split is shuffled with a fixed seed. One known item (identified by its
        question text) is dropped because its prompt is too long.
        """
        name = subject if subject != NO_SUBJECT else None

        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
        self.dataset = {}

        self.rnd = random.Random(RANDOM_SEED)

        for split, data in hf_dataset.items():
            data_list = list(data)

            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)

            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                # exclude in the GPQA dataset the one sample that has a too long prompt (DNA sequence)
                data_list_filtered = [
                    item
                    for item in data_list
                    if item["Question"]
                    != "Hello, you are embarking on a new project. You need to produce the HP1alpha protein in E. coli. Which of these plasmids will you choose?"  # noqa: E501
                ]
                num_excluded = len(data_list) - len(data_list_filtered)
                if num_excluded > 0:
                    logger.info(f"Excluded {num_excluded} samples from {split} split.")
                assert num_excluded < 2, "we expect to remove max one item"

                self.dataset[split] = data_list_filtered

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed system-style preamble (independent of ``item``)."""
        system_prompt_text = (
            "Here are some example questions from experts. "
            "An explanation is given before the final answer. "
            "Answer the final question yourself, giving your reasoning beforehand."
        )
        return system_prompt_text

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question followed by one labelled choice per line."""
        choices, _ = self._get_possible_completions_marked(item)
        prompt = f"Question: {item['Question'].strip()}\n"
        prompt += "\n".join(choices) + "\n"
        return prompt

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue text immediately followed by the ground-truth completion."""
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the correct answer key with a leading space, e.g. ``" (A)"``."""
        choices, correct_answer_position = self._get_possible_completions_marked(item)
        # Each choice is rendered as "(<key>) <text>"; the first three characters are the
        # parenthesised key (assuming single-letter keys).
        answer_key = choices[correct_answer_position][:3]
        return f" {answer_key}"

    def _get_possible_completions_marked(self, item: dict[str, Any]) -> tuple[list[str], int]:
        """Return the labelled, shuffled choices and the index of the correct one.

        The order is pseudo-random but fully determined by the item's answer texts, so
        repeated calls for the same item yield the same ordering.
        """
        choices = [self._preprocess(item[f"Incorrect Answer {x}"]) for x in range(1, 4)]
        correct_answer = self._preprocess(item["Correct Answer"])
        # we want to be random, but always the same for the same input
        # so we hash the string, which always gives the same seed
        hash_object = hashlib.sha256(f"{choices} {correct_answer}".encode())
        self.rnd_choice_shuffle.seed(int(hash_object.hexdigest(), 16))
        self.rnd_choice_shuffle.shuffle(choices)
        correct_answer_position = self.rnd_choice_shuffle.randint(0, 3)
        choices.insert(correct_answer_position, correct_answer)
        choices = [f"({self.keys[i]}) {choice}" for i, choice in enumerate(choices)]
        return choices, correct_answer_position

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every answer key as a completion candidate, e.g. ``" (A)"``."""
        return [f" ({x})" for x in self.keys]

    @staticmethod
    def _preprocess(text: str | None) -> str:
        """Normalise an answer text; ``None`` becomes a single space.

        Strips surrounding whitespace, turns " [title]" into ". ", removes any remaining
        bracketed spans and collapses double spaces left behind by the removals.
        """
        if text is None:
            return " "
        text = text.strip()
        text = text.replace(" [title]", ". ")
        text = re.sub(r"\[.*?\]", "", text)
        # Collapse double spaces (the replace shown in the source was a single-space no-op).
        text = text.replace("  ", " ")
        return text
118
+
119
+
120
class GPQA_COT(GPQA):
    """Chain-of-thought variant of GPQA scored on the letter extracted from a free-form completion."""

    NAME = "GPQA_COT"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    PERTURBATION_UNMODIFIABLE_WORDS = [
        "Question",
        "Therefore",
        "the",
        "answer",
        "is",
        "ANSWER_LETTER",
    ] + get_n_letters(4)
    ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "Fewshot is not supported for GPQA_COT"
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["Question:"]
        self.keys = get_n_letters(4)
        self.num_to_letter = {str(position): key for position, key in enumerate(self.keys, start=1)}
        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)

    def _extract_answer(self, completion: str) -> str:
        """Return the letter of the first "Therefore, the answer is (X)" phrase, else "[invalid]"."""
        found = self.ANS_RE.search(completion)
        if found is None:
            return "[invalid]"
        return found.group(1)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at each stop sequence in turn, then extract the answer letter."""
        truncated = completion_text
        for stop in self.stop_sequences:
            if stop in truncated:
                truncated = truncated.split(stop)[0]
        return self._extract_answer(truncated)

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return ""

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the reasoning prompt: instructions, question, labelled choices, reminder."""
        # using the reasoning prompt from "Figure 44 of Tülu 3 paper: https://arxiv.org/pdf/2411.15124"
        choices, _ = self._get_possible_completions_marked(item)
        header = (
            "Answer the following multiple-choice question by giving the correct answer letter in parentheses. "
            "Provide CONCISE reasoning for the answer, and make sure to finish the response with "
            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
        )
        question_part = f"\n\nQuestion: {item['Question'].strip()}\n"
        choices_part = "\n".join(choices)
        reminder = (
            "\n\nAnswer the above question and REMEMBER to finish your response with the exact phrase "
            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
        )
        return header + question_part + choices_part + reminder

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return ""

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the bare letter of the correct choice."""
        choices, correct_answer_position = self._get_possible_completions_marked(item)
        correct_choice = choices[correct_answer_position]
        # choices look like "(X) text", so index 1 selects the letter
        return correct_choice[1]
@@ -0,0 +1,148 @@
1
+ import re
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
5
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
6
+
7
# Captures the numeric final answer that follows the "#### " marker.
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")

# Predefined fewshot examples as (question, answer) pairs; every answer ends with "#### <number>".
_FEWSHOT_QA_PAIRS = (
    (
        "There are 15 trees in the grove. Grove workers will plant trees in the grove today. "
        "After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
        "There are 15 trees originally. Then there were 21 trees after some more were planted. "
        "So there must have been 21 - 15 = 6.\n#### 6",
    ),
    (
        "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
        "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5.\n#### 5",
    ),
    (
        "Leah had 32 chocolates and her sister had 42. "
        "If they ate 35, how many pieces do they have left in total?",
        "Originally, Leah had 32 chocolates. Her sister had 42. "
        "So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39.\n#### 39",
    ),
    (
        "Jason had 20 lollipops. He gave Denny some lollipops. "
        "Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
        "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. "
        "So he gave Denny 20 - 12 = 8.\n#### 8",
    ),
    (
        "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
        "How many toys does he have now?",
        "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, "
        "then that is 4 more toys. 5 + 4 = 9.\n#### 9",
    ),
    (
        "There were nine computers in the server room. "
        "Five more computers were installed each day, from monday to thursday. "
        "How many computers are now in the server room?",
        "There were originally 9 computers. For each of 4 days, 5 more computers were added. "
        "So 5 * 4 = 20 computers were added. 9 + 20 is 29.\n#### 29",
    ),
    (
        "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. "
        "On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
        "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. "
        "After losing 2 more, he had 35 - 2 = 33 golf balls.\n#### 33",
    ),
    (
        "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
        "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. "
        "So she has 23 - 15 dollars left. 23 - 15 is 8.\n#### 8",
    ),
)

FEWSHOT_ITEMS = [{"question": question, "answer": answer} for question, answer in _FEWSHOT_QA_PAIRS]
86
+
87
+
88
class GSM8K(BaseTask[str]):
    """GSM8K dataset: https://huggingface.co/datasets/openai/gsm8k"""

    NAME = "GSM8K"
    DATASET_PATH = "gsm8k"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["main"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)

        # until: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
        self.stop_sequences: list[str] = ["Question:"]
        self.max_tokens = 1600

    def _extract_answer(self, completion: str) -> str:
        """Return the number after the first "#### " marker without commas, or "[invalid]"."""
        found = ANS_RE.search(completion)
        if found is None:
            return "[invalid]"
        number_text = found.group(1).strip()
        return number_text.replace(",", "")

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at each stop sequence in turn, then extract the final answer."""
        truncated = completion_text
        for stop in self.stop_sequences:
            if stop in truncated:
                truncated = truncated.split(stop)[0]
        return self._extract_answer(truncated)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["question"]
        return f"Question: {question}\nAnswer:"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        # The leading space separates the worked solution from the "Answer:" cue.
        answer = item["answer"]
        return f" {answer}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # The reference solution carries the same "#### <number>" marker as expected completions.
        reference_solution = item["answer"]
        return self._extract_answer(reference_solution)
131
+
132
+
133
class GSM8KLlamaVersion(GSM8K):
    """GSM8K variant that takes its few-shot examples from the predefined ``FEWSHOT_ITEMS`` list."""

    NAME = "GSM8K Llama Version"
    FEWSHOT_SPLIT = ""  # Changed to empty string since we're using predefined examples

    def __init__(self, num_fewshot: int = 0) -> None:
        max_fewshot = len(FEWSHOT_ITEMS)
        assert num_fewshot <= max_fewshot, f"Fewshot larger than {max_fewshot} is not supported for GSM8K"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Remove the bracketed computations (<<...>>) from the question
        cleaned_question = re.sub(r"<<.*?>>", "", item["question"])
        return f"Question: {cleaned_question}\nAnswer:"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Override to use predefined fewshot examples instead of sampling from dataset"""
        requested = self.num_fewshot
        return FEWSHOT_ITEMS[:requested]
@@ -0,0 +1,44 @@
1
+ import re
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
5
+ AccuracyLoglikelihood,
6
+ AccuracyNormLoglikelihood,
7
+ )
8
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
9
+
10
+
11
class HELLASWAG(BaseTask[str]):
    """Hellaswag dataset: https://huggingface.co/datasets/Rowan/hellaswag
    available data set sections: train, validation, test"""

    NAME = "HellaSwag"
    DATASET_PATH = "Rowan/hellaswag"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    @staticmethod
    def _preprocess(prompt: str) -> str:
        """Clean dataset text.

        Strips surrounding whitespace, turns " [title]" into ". ", removes any remaining
        bracketed spans and collapses double spaces left behind by the removals.
        """
        prompt = prompt.strip()
        prompt = prompt.replace(" [title]", ". ")
        # remove bracketed text
        prompt = re.sub(r"\[.*?\]", "", prompt)
        # Collapse double spaces (the replace shown in the source was a single-space no-op).
        prompt = prompt.replace("  ", " ")
        return prompt

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return "<activity label>: <context>", with the second context part capitalised."""
        subject = self._preprocess(item["activity_label"])
        question = self._preprocess(item["ctx_a"] + " " + item["ctx_b"].capitalize()).strip()
        return f"{subject}: {question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the cleaned correct ending with a leading space.

        An empty label falls back to index 0.
        """
        ground_truth_index = int(item["label"] if item["label"] != "" else 0)
        choices = [self._preprocess(ending) for ending in item["endings"]]
        return f" {choices[ground_truth_index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every cleaned ending, each with a leading space."""
        return [f" {self._preprocess(ending)}" for ending in item["endings"]]
@@ -0,0 +1,52 @@
1
+ import re
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
5
+ AccuracyLoglikelihood,
6
+ AccuracyNormLoglikelihood,
7
+ )
8
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
9
+
10
+
11
class HELLASWAG_DE(BaseTask[str]):
    """Hellaswag dataset: https://huggingface.co/datasets/LeoLM/HellaSwag_de
    available data set sections: train (1k rows), validation (10k rows)"""

    NAME = "HellaSwag German"
    DATASET_PATH = "LeoLM/HellaSwag_de"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.DEU

    @staticmethod
    def _preprocess(prompt: str) -> str:
        """Clean dataset text.

        Strips surrounding whitespace, turns " [title]" into ". ", removes any remaining
        bracketed spans and collapses double spaces left behind by the removals.
        """
        prompt = prompt.strip()
        prompt = prompt.replace(" [title]", ". ")
        # remove bracketed text
        prompt = re.sub(r"\[.*?\]", "", prompt)
        # Collapse double spaces (the replace shown in the source was a single-space no-op).
        prompt = prompt.replace("  ", " ")
        return prompt

    def _load_dataset(self, subject: str) -> None:
        """Load the dataset, keeping only items whose German endings match the originals in count."""
        super()._load_dataset(subject)
        new_dataset = {}
        for split, items in self.dataset.items():
            # in the valid split, only 10035 out of 10042 items are well translated
            new_dataset[split] = [item for item in items if len(item["endings_de"]) == len(item["endings"])]
        self.dataset = new_dataset

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return "<German activity label>: <German context>"."""
        subject = self._preprocess(item["activity_label_de"])
        question = self._preprocess(item["ctx_de"]).strip()
        return f"{subject}: {question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the cleaned correct German ending with a leading space.

        An empty label falls back to index 0.
        """
        ground_truth_index = int(item["label"] if item["label"] != "" else 0)
        choices = [self._preprocess(ending) for ending in item["endings_de"]]
        return f" {choices[ground_truth_index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every cleaned German ending, each with a leading space."""
        return [f" {self._preprocess(ending)}" for ending in item["endings_de"]]
@@ -0,0 +1,97 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.metrics.completion.code_assertion import CodeCompletionAssertion
4
+ from eval_framework.shared.types import BaseMetricContext
5
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample
6
+
7
# Template of the program built for each completion: the task prompt, the model completion and
# the dataset's test code are concatenated, then `check(<entry_point>)` is invoked.
# The program prints "True" on success, or the exception followed by "False" on failure.
# The bodies of `try`/`except` must stay indented, otherwise the formatted program is not valid Python.
CODE_TO_EXECUTE = """
{start_of_code}
{completion_text}
{test_code}
try:
    check({entry_point})
    print(True)
except Exception as e:
    print(e)
    print(False)
"""
18
+
19
+
20
class HumanEvalMetricContext(BaseMetricContext):
    # Per-sample data carried from the dataset item to post-processing (see HumanEval._get_context).

    # Test code of the item (the item's "test" field).
    test: str
    # Name passed to `check(...)` when the test is invoked (the item's "entry_point" field).
    entry_point: str
    # Original task prompt (the item's "prompt" field), reused as the start of the executed code.
    prompt: str
24
+
25
+
26
class HumanEval(BaseTask[str]):
    """HumanEval dataset: https://huggingface.co/datasets/openai/openai_humaneval/"""

    NAME = "Human Eval"
    DATASET_PATH = "openai/openai_humaneval"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeCompletionAssertion]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        # Generation stops when the model closes the code fence.
        self.stop_sequences: list[str] = ["```"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Open a python code fence and let the model continue the function stub.
        function_stub = item["prompt"].lstrip()
        return f"```python\n{function_stub}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return "Success"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        return item["canonical_solution"]

    def _get_context(self, item: dict[str, Any]) -> HumanEvalMetricContext:
        """Bundle the fields needed later to assemble the executable test program."""
        return HumanEvalMetricContext(test=item["test"], entry_point=item["entry_point"], prompt=item["prompt"])

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Truncate the completion at the stop sequences and embed it into ``CODE_TO_EXECUTE``.

        Requires ``sample`` to carry a ``HumanEvalMetricContext``.
        """
        assert sample is not None and sample.context is not None
        assert isinstance(sample.context, HumanEvalMetricContext), "Expected HumanEvalMetricContext"
        ctx = sample.context

        truncated = completion_text
        for stop in self.stop_sequences:
            if stop in truncated:
                truncated = truncated.split(stop)[0]

        return CODE_TO_EXECUTE.format(
            start_of_code=ctx.prompt,
            completion_text=truncated,
            test_code=ctx.test,
            entry_point=ctx.entry_point,
        )
78
+
79
+
80
class HumanEvalInstruct(HumanEval):
    # See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
    NAME = "Human Eval Instruct"
    CUE_PREFIX = "Here is the completed function:\n```python\n"

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "Fewshot is not supported for Human Eval Instruct"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Ask for a solution and show the function stub inside an opened python fence."""
        function_stub = item["prompt"].lstrip()
        request = "Write a solution to the following problem and make sure that it passes the tests:"
        return f"{request}\n```python\n{function_stub}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # The cue repeats the function stub so the model continues directly with the body.
        function_stub = item["prompt"].lstrip()
        return f"{self.CUE_PREFIX}{function_stub}"
@@ -0,0 +1,78 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
4
+ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
5
+
6
+
7
class IFEval(BaseTask[str]):
    """IFEval: Instruction Following Eval (https://arxiv.org/pdf/2311.07911)."""

    NAME = "IFEval"
    DATASET_PATH = "google/IFEval"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [IFEvalMetric]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.ENG}

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        assert num_fewshot == 0, "IFEval does not support few-shot prompting."

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return item["prompt"]

    def _get_context(self, item: dict[str, Any]) -> IFEvalMetricContext:
        """Build the metric context for ``item`` and normalise its instruction kwargs.

        Integral float values in ``item["kwargs"]`` are converted to ``int`` and two known
        dataset entries (keys 142 and 1512) are patched. The normalised kwargs are written
        back to ``item["kwargs"]``; calling this method again on the same item is safe.

        Raises:
            AssertionError: if a required field is missing or a float kwarg is not integral.
        """
        assert "key" in item, "Expected 'key' in item"
        assert "instruction_id_list" in item, "Expected 'instruction_id_list' in item"
        assert "prompt" in item, "Expected 'prompt' in item"
        assert "kwargs" in item, "Expected 'kwargs' in item"

        new_kwargs = []
        for d in item["kwargs"]:
            # fixing undesired float fields in the dataset: only integral floats may be
            # converted, anything else would be silently truncated by int()
            assert all(v.is_integer() for v in d.values() if isinstance(v, float)), (
                "Expected float kwargs to hold integral values"
            )
            new_kwargs.append({k: v if not isinstance(v, float) else int(v) for k, v in d.items()})

        # fixing changes to the HF dataset done on Apr 10 2025
        if item["key"] == 142:
            new_kwargs[2]["relation"] = None
            new_kwargs[2]["frequency"] = None
            # Guarded so that a second call (after "keyword" was already renamed) does not fail.
            if "keyword" in new_kwargs[2]:
                new_kwargs[2]["keywords"] = new_kwargs[2].pop("keyword")
        if item["key"] == 1512:
            new_kwargs[0]["relation"] = None

        item["kwargs"] = new_kwargs

        return IFEvalMetricContext(
            key=item["key"],
            instruction_id_list=item["instruction_id_list"],
            prompt=item["prompt"],
            additional_kwargs=item["kwargs"],
        )

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # No reference completion is provided; the metric receives the context instead.
        return None

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        # Few-shot prompting is not supported (see __init__), so no examples are sampled.
        return []
61
+
62
+
63
class IFEvalFiSv(IFEval):
    """Machine translated versions of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval Finnish & Swedish"
    DATASET_PATH = "LumiOpen/ifeval_mt"
    # One subject per translated language; LANGUAGE maps each subject to its language.
    SUBJECTS = ["fi", "sv"]
    LANGUAGE = {"fi": Language.FIN, "sv": Language.SWE}
70
+
71
+
72
class IFEvalDe(IFEval):
    """German version of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval German"
    DATASET_PATH = "jzhang86/de_ifeval"
    # Single-subject dataset; the only subject maps to German.
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.DEU}