eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
import random
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
|
|
8
|
+
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
|
|
9
|
+
AccuracyLoglikelihood,
|
|
10
|
+
AccuracyNormLoglikelihood,
|
|
11
|
+
)
|
|
12
|
+
from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
|
|
13
|
+
from eval_framework.tasks.utils import get_n_letters
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GPQA(BaseTask[str]):
    """GPQA dataset: https://huggingface.co/datasets/Idavidrein/gpqa

    Multiple-choice task built from one correct and three incorrect answers per
    question. The answer options are shuffled deterministically per item and
    scored via log-likelihood over the lettered completions.
    """

    NAME = "GPQA"
    DATASET_PATH = "Idavidrein/gpqa"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["gpqa_extended"]  # ["gpqa_diamond", "gpqa_extended", "gpqa_main", "gpqa_experts"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up stop sequences, the option letters and the RNG used to shuffle options."""
        super().__init__(num_fewshot)
        self.stop_sequences = ["Question:"]
        self.keys = get_n_letters(4)
        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
        # Re-seeded per item in _get_possible_completions_marked.
        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the dataset for ``subject`` into ``self.dataset``.

        The sample split is shuffled with a fixed seed, and one known over-long
        question is removed from the sample and few-shot splits.
        """
        name = subject if subject != NO_SUBJECT else None

        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
        self.dataset = {}

        self.rnd = random.Random(RANDOM_SEED)

        excluded_question = "Hello, you are embarking on a new project. You need to produce the HP1alpha protein in E. coli. Which of these plasmids will you choose?"  # noqa: E501

        for split, data in hf_dataset.items():
            data_list = list(data)

            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)

            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                # Exclude the one GPQA sample whose prompt is too long (DNA sequence).
                data_list_filtered = [item for item in data_list if item["Question"] != excluded_question]
                num_removed = len(data_list) - len(data_list_filtered)
                if num_removed > 0:
                    logger.info(f"Excluded {num_removed} samples from {split} split.")
                assert num_removed < 2, "we expect to remove max one item"

                self.dataset[split] = data_list_filtered

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed system prompt; ``item`` is not used."""
        system_prompt_text = (
            "Here are some example questions from experts. "
            "An explanation is given before the final answer. "
            "Answer the final question yourself, giving your reasoning beforehand."
        )
        return system_prompt_text

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question followed by one lettered option per line."""
        choices, _ = self._get_possible_completions_marked(item)
        prompt = f"Question: {item['Question'].strip()}\n"
        prompt += "\n".join(choices) + "\n"
        return prompt

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue text immediately followed by the ground-truth answer."""
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the answer cue appended after the instruction."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the correct option marker, e.g. ``" (A)"``, with a leading space."""
        choices, correct_answer_position = self._get_possible_completions_marked(item)
        # The first three characters of a marked choice are "(<letter>)".
        answer_key = choices[correct_answer_position][:3]
        return f" {answer_key}"

    def _get_possible_completions_marked(self, item: dict[str, Any]) -> tuple[list[str], int]:
        """Return the four lettered options and the index of the correct one.

        The order is pseudo-random but reproducible: the RNG is re-seeded from a
        SHA-256 hash of the item's answers, so the same item always yields the
        same order.
        """
        choices = [self._preprocess(item[f"Incorrect Answer {x}"]) for x in range(1, 4)]
        correct_answer = self._preprocess(item["Correct Answer"])
        # We want to be random, but always the same for the same input,
        # so we hash the string, which always gives the same seed.
        hash_object = hashlib.sha256(f"{choices} {correct_answer}".encode())
        self.rnd_choice_shuffle.seed(int(hash_object.hexdigest(), 16))
        self.rnd_choice_shuffle.shuffle(choices)
        # randint is inclusive on both ends: 0..3 are the valid insert positions.
        correct_answer_position = self.rnd_choice_shuffle.randint(0, 3)
        choices.insert(correct_answer_position, correct_answer)
        choices = [f"({self.keys[i]}) {choice}" for i, choice in enumerate(choices)]
        return choices, correct_answer_position

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return the completions to score: one ``" (<letter>)"`` per option key."""
        return [f" ({x})" for x in self.keys]

    @staticmethod
    def _preprocess(text: str | None) -> str:
        """Normalise an answer string; ``None`` becomes a single space.

        Strips surrounding whitespace, removes bracketed spans and collapses
        the double spaces that the removal can leave behind.
        """
        if text is None:
            return " "
        text = text.strip()
        text = text.replace(" [title]", ". ")
        # NOTE(review): this also strips bracketed parts of chemical or
        # mathematical notation in answers - confirm that is intended here.
        text = re.sub("\\[.*?\\]", "", text)
        # Collapse double spaces; replacing " " with " " would be a no-op.
        text = text.replace("  ", " ")
        return text
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class GPQA_COT(GPQA):
    """Chain-of-thought variant of GPQA scored on the letter extracted from a free-form completion."""

    NAME = "GPQA_COT"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    PERTURBATION_UNMODIFIABLE_WORDS = [
        "Question",
        "Therefore",
        "the",
        "answer",
        "is",
        "ANSWER_LETTER",
    ] + get_n_letters(4)
    ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task; only zero-shot evaluation is accepted."""
        assert num_fewshot == 0, "Fewshot is not supported for GPQA_COT"
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["Question:"]
        self.keys = get_n_letters(4)
        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)

    def _extract_answer(self, completion: str) -> str:
        """Return the letter from the first answer phrase, or ``"[invalid]"`` if none is found."""
        found = self.ANS_RE.search(completion)
        if not found:
            return "[invalid]"
        return found.group(1)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at each stop sequence, then extract the answer letter."""
        for stop_sequence in self.stop_sequences:
            # partition keeps the text before the first occurrence (or all of it).
            completion_text = completion_text.partition(stop_sequence)[0]
        return self._extract_answer(completion_text)

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return an empty system prompt."""
        return ""

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the reasoning prompt wrapped around the question and its lettered options."""
        # using the reasoning prompt from "Figure 44 of Tülu 3 paper: https://arxiv.org/pdf/2411.15124"
        choices, _ = self._get_possible_completions_marked(item)
        header = (
            "Answer the following multiple-choice question by giving the correct answer letter in parentheses. "
            "Provide CONCISE reasoning for the answer, and make sure to finish the response with "
            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
        )
        question = f"\n\nQuestion: {item['Question'].strip()}\n"
        options = "\n".join(choices)
        footer = (
            "\n\nAnswer the above question and REMEMBER to finish your response with the exact phrase "
            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
        )
        return header + question + options + footer

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return an empty cue."""
        return ""

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the bare letter of the correct option."""
        choices, correct_answer_position = self._get_possible_completions_marked(item)
        # A marked choice starts with "(<letter>)", so index 1 selects the letter.
        return choices[correct_answer_position][1]
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
|
|
5
|
+
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
6
|
+
|
|
7
|
+
# Matches the final-answer marker "#### <number>"; the number may carry a
# leading minus sign, decimal points and thousands separators.
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")

# Predefined fewshot examples as (question, answer) pairs.
_FEWSHOT_QA_PAIRS = [
    (
        "There are 15 trees in the grove. Grove workers will plant trees in the grove today. "
        "After they are done, there will be 21 trees. "
        "How many trees did the grove workers plant today?",
        "There are 15 trees originally. Then there were 21 trees after some more were planted. "
        "So there must have been 21 - 15 = 6.\n#### 6",
    ),
    (
        "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
        "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5.\n#### 5",
    ),
    (
        "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
        "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. "
        "After eating 35, they had 74 - 35 = 39.\n#### 39",
    ),
    (
        "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. "
        "How many lollipops did Jason give to Denny?",
        "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. "
        "So he gave Denny 20 - 12 = 8.\n#### 8",
    ),
    (
        "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
        "How many toys does he have now?",
        "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. "
        "5 + 4 = 9.\n#### 9",
    ),
    (
        "There were nine computers in the server room. Five more computers were installed each day, "
        "from monday to thursday. "
        "How many computers are now in the server room?",
        "There were originally 9 computers. For each of 4 days, 5 more computers were "
        "added. So 5 * 4 = 20 computers were added. 9 + 20 is 29.\n#### 29",
    ),
    (
        "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. "
        "How many golf balls did he have at the end of wednesday?",
        "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. "
        "After losing 2 more, he had 35 - 2 = 33 golf balls.\n#### 33",
    ),
    (
        "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
        "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. "
        "So she has 23 - 15 dollars left. 23 - 15 is 8.\n#### 8",
    ),
]

# Same records as dicts with "question" and "answer" keys, in that order.
FEWSHOT_ITEMS = [{"question": question, "answer": answer} for question, answer in _FEWSHOT_QA_PAIRS]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class GSM8K(BaseTask[str]):
    """GSM8K dataset: https://huggingface.co/datasets/openai/gsm8k

    Completion task: the number following the ``####`` marker is extracted
    from both the reference answer and the model output before comparison.
    """

    NAME = "GSM8K"
    DATASET_PATH = "gsm8k"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["main"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure the stop sequence and the generation length limit."""
        super().__init__(num_fewshot)

        # until: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
        self.stop_sequences: list[str] = ["Question:"]
        self.max_tokens = 1600

    def _extract_answer(self, completion: str) -> str:
        """Return the number after the first ``####`` marker without commas, or ``"[invalid]"``."""
        found = ANS_RE.search(completion)
        if not found:
            return "[invalid]"
        return found.group(1).strip().replace(",", "")

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at each stop sequence, then extract the numeric answer."""
        for stop_sequence in self.stop_sequences:
            # partition keeps the text before the first occurrence (or all of it).
            completion_text = completion_text.partition(stop_sequence)[0]
        return self._extract_answer(completion_text)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question followed by the answer cue."""
        question = item["question"]
        return f"Question: {question}\nAnswer:"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the reference answer prefixed with a single space."""
        answer = item["answer"]
        return f" {answer}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the number extracted from the reference answer."""
        return self._extract_answer(item["answer"])
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class GSM8KLlamaVersion(GSM8K):
    """GSM8K variant that takes its few-shot examples from the predefined FEWSHOT_ITEMS list."""

    NAME = "GSM8K Llama Version"
    FEWSHOT_SPLIT = ""  # Changed to empty string since we're using predefined examples

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task; ``num_fewshot`` may not exceed the number of predefined examples."""
        assert num_fewshot <= len(FEWSHOT_ITEMS), f"Fewshot larger than {len(FEWSHOT_ITEMS)} is not supported for GSM8K"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question, with ``<<...>>`` spans removed, followed by the answer cue."""
        # Remove the bracketed computations from the question
        cleaned_question = re.sub(r"<<.*?>>", "", item["question"])
        return f"Question: {cleaned_question}\nAnswer:"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Override to use predefined fewshot examples instead of sampling from dataset"""
        shot_count = self.num_fewshot
        return FEWSHOT_ITEMS[:shot_count]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
|
|
5
|
+
AccuracyLoglikelihood,
|
|
6
|
+
AccuracyNormLoglikelihood,
|
|
7
|
+
)
|
|
8
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HELLASWAG(BaseTask[str]):
    """Hellaswag dataset: https://huggingface.co/datasets/Rowan/hellaswag
    available data set sections: train, validation, test"""

    NAME = "HellaSwag"
    DATASET_PATH = "Rowan/hellaswag"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    @staticmethod
    def _preprocess(prompt: str) -> str:
        """Strip whitespace, remove bracketed text and collapse leftover double spaces."""
        prompt = prompt.strip()
        prompt = prompt.replace(" [title]", ". ")
        # remove bracketed text
        prompt = re.sub("\\[.*?\\]", "", prompt)
        # Collapse double spaces; replacing " " with " " would be a no-op.
        prompt = prompt.replace("  ", " ")
        return prompt

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``"<activity label>: <context>"`` built from the item's context fields."""
        subject = self._preprocess(item["activity_label"])
        question = self._preprocess(item["ctx_a"] + " " + item["ctx_b"].capitalize()).strip()
        return f"{subject}: {question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the preprocessed correct ending, prefixed with a space."""
        # An empty label falls back to index 0.
        ground_truth_index = int(item["label"] if item["label"] != "" else 0)
        choices = [self._preprocess(ending) for ending in item["endings"]]
        return f" {choices[ground_truth_index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every preprocessed ending, each prefixed with a space."""
        return [f" {self._preprocess(ending)}" for ending in item["endings"]]
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
|
|
5
|
+
AccuracyLoglikelihood,
|
|
6
|
+
AccuracyNormLoglikelihood,
|
|
7
|
+
)
|
|
8
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HELLASWAG_DE(BaseTask[str]):
    """Hellaswag dataset: https://huggingface.co/datasets/LeoLM/HellaSwag_de
    available data set sections: train (1k rows), validation (10k rows)"""

    NAME = "HellaSwag German"
    DATASET_PATH = "LeoLM/HellaSwag_de"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.DEU

    @staticmethod
    def _preprocess(prompt: str) -> str:
        """Strip whitespace, remove bracketed text and collapse leftover double spaces."""
        prompt = prompt.strip()
        prompt = prompt.replace(" [title]", ". ")
        # remove bracketed text
        prompt = re.sub("\\[.*?\\]", "", prompt)
        # Collapse double spaces; replacing " " with " " would be a no-op.
        prompt = prompt.replace("  ", " ")
        return prompt

    def _load_dataset(self, subject: str) -> None:
        """Load the dataset, then keep only items with as many German endings as original ones."""
        super()._load_dataset(subject)
        new_dataset = {}
        for split, items in self.dataset.items():
            # in the valid split, only 10035 out of 10042 items are well translated
            new_dataset[split] = [item for item in items if len(item["endings_de"]) == len(item["endings"])]
        self.dataset = new_dataset

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``"<activity label>: <context>"`` built from the German fields."""
        subject = self._preprocess(item["activity_label_de"])
        question = self._preprocess(item["ctx_de"]).strip()
        return f"{subject}: {question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the preprocessed correct German ending, prefixed with a space."""
        # An empty label falls back to index 0.
        ground_truth_index = int(item["label"] if item["label"] != "" else 0)
        choices = [self._preprocess(ending) for ending in item["endings_de"]]
        return f" {choices[ground_truth_index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every preprocessed German ending, each prefixed with a space."""
        return [f" {self._preprocess(ending)}" for ending in item["endings_de"]]
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from eval_framework.metrics.completion.code_assertion import CodeCompletionAssertion
|
|
4
|
+
from eval_framework.shared.types import BaseMetricContext
|
|
5
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample
|
|
6
|
+
|
|
7
|
+
# Template of the program assembled for each sample: the dataset prompt, the
# model completion and the dataset test code, followed by a driver that calls
# check() on the entry point and prints True on success, or the exception
# followed by False on failure. The try/except bodies must stay indented,
# otherwise the formatted program is not valid Python.
CODE_TO_EXECUTE = """
{start_of_code}
{completion_text}
{test_code}
try:
    check({entry_point})
    print(True)
except Exception as e:
    print(e)
    print(False)
"""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class HumanEvalMetricContext(BaseMetricContext):
    """Per-sample fields needed to assemble the executable HumanEval test program."""

    # Test code for the sample; filled from the item's "test" field.
    test: str
    # Name passed to check(); filled from the item's "entry_point" field.
    entry_point: str
    # Code the completion continues; filled from the item's "prompt" field.
    prompt: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HumanEval(BaseTask[str]):
    """HumanEval dataset: https://huggingface.co/datasets/openai/openai_humaneval/

    The post-processed "completion" is a full program (prompt, completion,
    tests and driver) built from the CODE_TO_EXECUTE template.
    """

    NAME = "Human Eval"
    DATASET_PATH = "openai/openai_humaneval"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeCompletionAssertion]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Stop generation at the closing code fence."""
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["```"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the left-stripped dataset prompt inside an opened python code fence."""
        code_prompt = item["prompt"].lstrip()
        return f"```python\n{code_prompt}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the constant ``"Success"``; ``item`` is not used."""
        return "Success"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the dataset's canonical solution."""
        return item["canonical_solution"]

    def _get_context(self, item: dict[str, Any]) -> HumanEvalMetricContext:
        """Bundle the item's test code, entry point and prompt for post-processing."""
        return HumanEvalMetricContext(test=item["test"], entry_point=item["entry_point"], prompt=item["prompt"])

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Return the program to execute for this completion.

        The completion is cut at each stop sequence and inserted between the
        prompt and the test code from the sample context. ``sample`` and its
        context are required.
        """
        assert sample is not None and sample.context is not None
        assert isinstance(sample.context, HumanEvalMetricContext), "Expected HumanEvalMetricContext"
        ctx = sample.context

        for stop_sequence in self.stop_sequences:
            # partition keeps the text before the first occurrence (or all of it).
            completion_text = completion_text.partition(stop_sequence)[0]

        return CODE_TO_EXECUTE.format(
            start_of_code=ctx.prompt,
            completion_text=completion_text,
            test_code=ctx.test,
            entry_point=ctx.entry_point,
        )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class HumanEvalInstruct(HumanEval):
    """Instruction-style HumanEval: asks for a solution and cues the answer with a code fence."""

    # See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
    NAME = "Human Eval Instruct"
    CUE_PREFIX = "Here is the completed function:\n```python\n"

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task; only zero-shot evaluation is accepted."""
        assert num_fewshot == 0, "Fewshot is not supported for Human Eval Instruct"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the instruction sentence followed by the fenced, left-stripped prompt."""
        code_prompt = item["prompt"].lstrip()
        return (
            "Write a solution to the following problem and make sure that "
            f"it passes the tests:\n```python\n{code_prompt}"
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the cue prefix followed by the left-stripped prompt."""
        code_prompt = item["prompt"].lstrip()
        return f"{self.CUE_PREFIX}{code_prompt}"
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
|
|
4
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class IFEval(BaseTask[str]):
    """IFEval: Instruction Following Eval (https://arxiv.org/pdf/2311.07911)."""

    NAME = "IFEval"
    DATASET_PATH = "google/IFEval"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [IFEvalMetric]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.ENG}

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task; only zero-shot evaluation is accepted."""
        super().__init__(num_fewshot)
        assert num_fewshot == 0, "IFEval does not support few-shot prompting."

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the dataset prompt unchanged."""
        return item["prompt"]

    def _get_context(self, item: dict[str, Any]) -> IFEvalMetricContext:
        """Build the metric context for ``item`` after normalising its kwargs.

        Float values in the kwargs are converted to ``int``, and two known
        records are patched. The normalised kwargs are also written back to
        ``item["kwargs"]``, so calling this again on the same item is safe.

        Raises:
            AssertionError: if a required key is missing, or a float kwarg is
                not integer-valued (converting it would silently truncate).
        """
        assert "key" in item, "Expected 'key' in item"
        assert "instruction_id_list" in item, "Expected 'instruction_id_list' in item"
        assert "prompt" in item, "Expected 'prompt' in item"
        assert "kwargs" in item, "Expected 'kwargs' in item"

        new_kwargs = []
        for d in item["kwargs"]:
            # fixing undesired float fields in the dataset
            # is_integer() is False for NaN/inf and for fractional values, so
            # the int() conversion below can never drop information.
            assert all(v.is_integer() for v in d.values() if isinstance(v, float)), (
                "Expected float kwargs to be integer-valued"
            )
            new_kwargs.append({k: int(v) if isinstance(v, float) else v for k, v in d.items()})

        # fixing changes to the HF dataset done on Apr 10 2025
        if item["key"] == 142:
            new_kwargs[2]["relation"] = None
            new_kwargs[2]["frequency"] = None
            # Rename only while the old field exists: item["kwargs"] is
            # overwritten below, so a second call sees the renamed field.
            if "keyword" in new_kwargs[2]:
                new_kwargs[2]["keywords"] = new_kwargs[2].pop("keyword")
        if item["key"] == 1512:
            new_kwargs[0]["relation"] = None

        item["kwargs"] = new_kwargs

        return IFEvalMetricContext(
            key=item["key"],
            instruction_id_list=item["instruction_id_list"],
            prompt=item["prompt"],
            additional_kwargs=item["kwargs"],
        )

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return ``None``; no reference answer is produced for this task."""
        return None

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return no few-shot examples."""
        return []
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class IFEvalFiSv(IFEval):
    """Machine translated versions of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval Finnish & Swedish"
    DATASET_PATH = "LumiOpen/ifeval_mt"
    # One subject per language; LANGUAGE maps each subject to its Language member.
    SUBJECTS = ["fi", "sv"]
    LANGUAGE = dict(fi=Language.FIN, sv=Language.SWE)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class IFEvalDe(IFEval):
    """German version of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval German"
    DATASET_PATH = "jzhang86/de_ifeval"
    # Single-subject dataset: the only subject maps to German.
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.DEU}
|