eval-framework 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +177 -0
- eval_framework/context/eval.py +121 -0
- eval_framework/context/local.py +78 -0
- eval_framework/evaluation_generator.py +234 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +432 -0
- eval_framework/llm/base.py +180 -0
- eval_framework/llm/huggingface.py +418 -0
- eval_framework/llm/mistral.py +88 -0
- eval_framework/llm/models.py +28 -0
- eval_framework/llm/openai.py +400 -0
- eval_framework/llm/vllm.py +554 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +166 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/aidanbench.py +28 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +179 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +34 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/llm/utils.py +20 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/base.py +50 -0
- eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
- eval_framework/metrics/loglikelihood/dcs.py +43 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
- eval_framework/metrics/loglikelihood/ternary.py +42 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +351 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +88 -0
- eval_framework/result_processors/hf_uploader.py +75 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/result_processors/wandb_uploader.py +137 -0
- eval_framework/run.py +369 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +392 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/aidanbench.py +211 -0
- eval_framework/tasks/benchmarks/arc.py +70 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +64 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +133 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +201 -0
- eval_framework/tasks/benchmarks/gsm8k.py +150 -0
- eval_framework/tasks/benchmarks/hellaswag.py +69 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +215 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +85 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +64 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +110 -0
- eval_framework/tasks/benchmarks/sphyr.py +79 -0
- eval_framework/tasks/benchmarks/squad.py +211 -0
- eval_framework/tasks/benchmarks/struct_eval.py +116 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
- eval_framework/tasks/benchmarks/winogender.py +64 -0
- eval_framework/tasks/benchmarks/winogrande.py +69 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +136 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +81 -0
- eval_framework/tasks/task_names.py +324 -0
- eval_framework/tasks/utils.py +584 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/file_ops.py +245 -0
- eval_framework/utils/generate_task_docs.py +244 -0
- eval_framework/utils/helpers.py +32 -0
- eval_framework/utils/logging.py +62 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework/utils/tqdm_handler.py +14 -0
- eval_framework-0.2.7.dist-info/METADATA +548 -0
- eval_framework-0.2.7.dist-info/RECORD +170 -0
- eval_framework-0.2.7.dist-info/WHEEL +4 -0
- eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +537 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
|
|
5
|
+
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
|
|
6
|
+
|
|
7
|
+
# Matches the GSM8K final-answer marker, e.g. "#### 42" or "#### 1,234".
ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")

# Predefined fewshot examples, stored as (question, answer) pairs and expanded
# into the dict shape the task framework expects below.
_FEWSHOT_QA: list[tuple[str, str]] = [
    (
        "There are 15 trees in the grove. Grove workers will plant trees in the grove today. "
        "After they are done, there will be 21 trees. "
        "How many trees did the grove workers plant today?",
        "There are 15 trees originally. Then there were 21 trees after some more were planted. "
        "So there must have been 21 - 15 = 6.\n#### 6",
    ),
    (
        "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
        "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5.\n#### 5",
    ),
    (
        "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
        "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. "
        "After eating 35, they had 74 - 35 = 39.\n#### 39",
    ),
    (
        "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. "
        "How many lollipops did Jason give to Denny?",
        "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. "
        "So he gave Denny 20 - 12 = 8.\n#### 8",
    ),
    (
        "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
        "How many toys does he have now?",
        "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. "
        "5 + 4 = 9.\n#### 9",
    ),
    (
        "There were nine computers in the server room. Five more computers were installed each day, "
        "from monday to thursday. "
        "How many computers are now in the server room?",
        "There were originally 9 computers. For each of 4 days, 5 more computers were "
        "added. So 5 * 4 = 20 computers were added. 9 + 20 is 29.\n#### 29",
    ),
    (
        "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. "
        "How many golf balls did he have at the end of wednesday?",
        "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. "
        "After losing 2 more, he had 35 - 2 = 33 golf balls.\n#### 33",
    ),
    (
        "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
        "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. "
        "So she has 23 - 15 dollars left. 23 - 15 is 8.\n#### 8",
    ),
]

# Predefined fewshot examples
FEWSHOT_ITEMS = [{"question": q, "answer": a} for q, a in _FEWSHOT_QA]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class GSM8KEvalHarness(BaseTask[str]):
    """GSM8K dataset: https://huggingface.co/datasets/openai/gsm8k

    This version uses samples from the train split as fewshot examples.
    """

    NAME = "GSM8KEvalHarness"
    DATASET_PATH = "gsm8k"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["main"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)

        # until: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
        self.stop_sequences: list[str] = ["Question:"]
        self.max_tokens = 1600

    def _extract_answer(self, completion: str) -> str:
        """Return the number after the '#### ' marker, or '[invalid]' if absent."""
        found = ANS_RE.search(completion)
        if found is None:
            return "[invalid]"
        # Strip whitespace and thousands separators, e.g. "1,234" -> "1234".
        return found.group(1).strip().replace(",", "")

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Truncate at the first stop sequence, then extract the final answer."""
        for stop in self.stop_sequences:
            # partition()[0] equals the text unchanged when `stop` is absent.
            completion_text = completion_text.partition(stop)[0]
        return self._extract_answer(completion_text)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"Question: {item['question']}\nAnswer:"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        return f" {item['answer']}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return self._extract_answer(item["answer"])
|
+
|
|
134
|
+
|
|
135
|
+
class GSM8K(GSM8KEvalHarness):
    """GSM8K with a fixed, predefined set of chain-of-thought fewshot examples."""

    NAME = "GSM8K"
    FEWSHOT_SPLIT = ""  # unused: fewshot examples come from FEWSHOT_ITEMS instead

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot <= len(FEWSHOT_ITEMS), f"Fewshot larger than {len(FEWSHOT_ITEMS)} is not supported for GSM8K"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Remove the bracketed computations from the question
        cleaned = re.sub(r"<<.*?>>", "", item["question"])
        return f"Question: {cleaned}\nAnswer:"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Override to use predefined fewshot examples instead of sampling from dataset"""
        return FEWSHOT_ITEMS[: self.num_fewshot]
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
|
|
5
|
+
AccuracyLoglikelihood,
|
|
6
|
+
AccuracyNormLoglikelihood,
|
|
7
|
+
)
|
|
8
|
+
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
|
|
9
|
+
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
|
|
10
|
+
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
|
|
11
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HELLASWAG(BaseTask[str]):
    """Hellaswag dataset: https://huggingface.co/datasets/Rowan/hellaswag
    available data set sections: train, validation, test"""

    NAME = "HellaSwag"
    DATASET_PATH = "Rowan/hellaswag"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    @staticmethod
    def _preprocess(prompt: str) -> str:
        """Strip WikiHow-style bracket markup from a prompt and tidy whitespace."""
        prompt = prompt.strip()
        prompt = prompt.replace(" [title]", ". ")
        prompt = re.sub(r"\[.*?\]", "", prompt)
        # Fix: collapse double spaces left behind by the removals above. The
        # previous code used `prompt.replace(" ", " ")`, which replaces a
        # single space with itself and was therefore a no-op.
        prompt = prompt.replace("  ", " ")
        return prompt

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        subject = self._preprocess(item["activity_label"])
        question = self._preprocess(item["ctx_a"] + " " + item["ctx_b"].capitalize()).strip()
        return f"{subject}: {question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # NOTE(review): an empty label presumably only occurs in unlabeled
        # splits; it is mapped to index 0 here — confirm against the dataset.
        ground_truth_index = int(item["label"] if item["label"] != "" else 0)
        choices = [self._preprocess(ending) for ending in item["endings"]]
        return f" {choices[ground_truth_index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Leading space so each choice concatenates cleanly after the cue.
        return [f" {self._preprocess(ending)}" for ending in item["endings"]]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class HELLASWAG_IDK(HELLASWAG):
    """HellaSwag variant that offers an explicit 'I do not know' abstention option."""

    NAME = "HellaSwag_IDK"
    METRICS = [
        AccuracyLoglikelihood,
        AccuracyNormLoglikelihood,
        ConfidenceWeightedAccuracy,
        DistributionalCorrectnessScore,
        TernaryScore,
    ]

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        # System-style preamble explaining the abstention scoring scheme.
        return (
            "Complete the sentence only if you are confident, since mistakes may be penalised, while correct "
            "completions receive points. It is acceptable to answer with 'I do not know' if you are unsure, "
            "and you will receive 0 points."
        )

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Append the abstention choice after the regular endings.
        base = super()._get_possible_completions(item) or []
        return [*base, " I do not know."]
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
|
|
5
|
+
AccuracyLoglikelihood,
|
|
6
|
+
AccuracyNormLoglikelihood,
|
|
7
|
+
)
|
|
8
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HELLASWAG_DE(BaseTask[str]):
    """Hellaswag dataset: https://huggingface.co/datasets/LeoLM/HellaSwag_de
    available data set sections: train (1k rows), validation (10k rows)"""

    NAME = "HellaSwag German"
    DATASET_PATH = "LeoLM/HellaSwag_de"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.DEU

    @staticmethod
    def _preprocess(prompt: str) -> str:
        """Strip WikiHow-style bracket markup from a prompt and tidy whitespace."""
        prompt = prompt.strip()
        prompt = prompt.replace(" [title]", ". ")
        prompt = re.sub(r"\[.*?\]", "", prompt)
        # Fix: collapse double spaces left behind by the removals above. The
        # previous code used `prompt.replace(" ", " ")`, which replaces a
        # single space with itself and was therefore a no-op.
        prompt = prompt.replace("  ", " ")
        return prompt

    def _load_dataset(self, subject: str) -> None:
        """Load the dataset, then drop items whose German translation is incomplete."""
        super()._load_dataset(subject)
        new_dataset = {}
        for split, items in self.dataset.items():
            # in the valid split, only 10035 out of 10042 items are well translated
            new_dataset[split] = [item for item in items if len(item["endings_de"]) == len(item["endings"])]
        self.dataset = new_dataset

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        subject = self._preprocess(item["activity_label_de"])
        question = self._preprocess(item["ctx_de"]).strip()
        return f"{subject}: {question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # NOTE(review): an empty label presumably only occurs in unlabeled
        # splits; it is mapped to index 0 here — confirm against the dataset.
        ground_truth_index = int(item["label"] if item["label"] != "" else 0)
        choices = [self._preprocess(ending) for ending in item["endings_de"]]
        return f" {choices[ground_truth_index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Leading space so each choice concatenates cleanly after the cue.
        return [f" {self._preprocess(ending)}" for ending in item["endings_de"]]
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from eval_framework.metrics.completion.code_assertion import CodeCompletionAssertion
|
|
4
|
+
from eval_framework.shared.types import BaseMetricContext
|
|
5
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample
|
|
6
|
+
|
|
7
|
+
CODE_TO_EXECUTE = """
|
|
8
|
+
{start_of_code}
|
|
9
|
+
{completion_text}
|
|
10
|
+
{test_code}
|
|
11
|
+
try:
|
|
12
|
+
check({entry_point})
|
|
13
|
+
print(True)
|
|
14
|
+
except Exception as e:
|
|
15
|
+
print(e)
|
|
16
|
+
print(False)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class HumanEvalMetricContext(BaseMetricContext):
    # Per-sample data needed to assemble the executable test script.
    test: str  # the dataset's unit-test source (defines `check(...)`)
    entry_point: str  # name of the function under test
    prompt: str  # original function stub/prompt prepended to the completion
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HumanEval(BaseTask[str]):
    """HumanEval dataset: https://huggingface.co/datasets/openai/openai_humaneval/"""

    NAME = "Human Eval"
    DATASET_PATH = "openai/openai_humaneval"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeCompletionAssertion]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        # Generation halts at the closing markdown code fence.
        self.stop_sequences: list[str] = ["```"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Open a code fence so the completion stays inside ```python ... ```.
        return f"```python\n{item['prompt'].lstrip()}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # Correctness is decided by executing the tests, not by string matching.
        return "Success"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        return item["canonical_solution"]

    def _get_context(self, item: dict[str, Any]) -> HumanEvalMetricContext:
        return HumanEvalMetricContext(
            test=item["test"],
            entry_point=item["entry_point"],
            prompt=item["prompt"],
        )

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Truncate at the first stop sequence and splice into the test template."""
        assert sample is not None and sample.context is not None
        assert isinstance(sample.context, HumanEvalMetricContext), "Expected HumanEvalMetricContext"
        ctx = sample.context

        for stop in self.stop_sequences:
            # partition()[0] equals the text unchanged when `stop` is absent.
            completion_text = completion_text.partition(stop)[0]

        return CODE_TO_EXECUTE.format(
            start_of_code=ctx.prompt,
            completion_text=completion_text,
            test_code=ctx.test,
            entry_point=ctx.entry_point,
        )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class HumanEvalInstruct(HumanEval):
    # See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
    NAME = "Human Eval Instruct"
    CUE_PREFIX = "Here is the completed function:\n```python\n"

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "Fewshot is not supported for Human Eval Instruct"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Instruction-style phrasing instead of the raw stub continuation.
        stub = item["prompt"].lstrip()
        return (
            "Write a solution to the following problem and make sure that "
            f"it passes the tests:\n```python\n{stub}"
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # Prime the assistant to restate the stub inside a code fence.
        return self.CUE_PREFIX + item["prompt"].lstrip()
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
|
|
4
|
+
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class IFEval(BaseTask[str]):
    """IFEval: Instruction Following Eval (https://arxiv.org/pdf/2311.07911)."""

    NAME = "IFEval"
    DATASET_PATH = "google/IFEval"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [IFEvalMetric]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.ENG}

    def __init__(self, num_fewshot: int = 0) -> None:
        # Fail fast: validate before the base class does any work (previously
        # the assert ran only after super().__init__ had already executed).
        assert num_fewshot == 0, "IFEval does not support few-shot prompting."
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return item["prompt"]

    def _get_context(self, item: dict[str, Any]) -> IFEvalMetricContext:
        """Build the metric context, normalising the raw ``kwargs`` field."""
        assert "key" in item, "Expected 'key' in item"
        assert "instruction_id_list" in item, "Expected 'instruction_id_list' in item"
        assert "prompt" in item, "Expected 'prompt' in item"
        assert "kwargs" in item, "Expected 'kwargs' in item"

        new_kwargs = []
        for d in item["kwargs"]:
            # Fixing undesired float fields in the dataset: each float must be
            # (near-)integral before conversion to int. The original check
            # compared `v` with `float(v)`, which is always equal for a float
            # and so could never fail; compare against the nearest int instead.
            assert all(abs(v - round(v)) < 1e-5 for v in d.values() if isinstance(v, float))
            new_kwargs.append({k: round(v) if isinstance(v, float) else v for k, v in d.items()})

        # fixing changes to the HF dataset done on Apr 10 2025
        if item["key"] == 142:
            new_kwargs[2]["relation"] = None
            new_kwargs[2]["frequency"] = None
            new_kwargs[2]["keywords"] = new_kwargs[2].pop("keyword")
        if item["key"] == 1512:
            new_kwargs[0]["relation"] = None

        # Pass the normalised kwargs directly instead of writing them back
        # into `item` (the original mutated the dataset row as a side effect).
        return IFEvalMetricContext(
            key=item["key"],
            instruction_id_list=item["instruction_id_list"],
            prompt=item["prompt"],
            additional_kwargs=new_kwargs,
        )

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # IFEval is judged by programmatic instruction checks, not a gold answer.
        return None

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        # Few-shot is unsupported (see __init__), so never emit examples.
        return []
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class IFEvalFiSv(IFEval):
    """Machine translated versions of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval Finnish & Swedish"
    DATASET_PATH = "LumiOpen/ifeval_mt"
    # One subject per translation; each subject maps to its own language.
    SUBJECTS = ["fi", "sv"]
    LANGUAGE = {"fi": Language.FIN, "sv": Language.SWE}
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class IFEvalDe(IFEval):
    """German version of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval German"
    DATASET_PATH = "jzhang86/de_ifeval"
    # Single-subject dataset, entirely in German.
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.DEU}
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
|
|
4
|
+
AccuracyLoglikelihood,
|
|
5
|
+
AccuracyNormLoglikelihood,
|
|
6
|
+
)
|
|
7
|
+
from eval_framework.tasks.base import BaseTask, Language, ResponseType
|
|
8
|
+
from eval_framework.tasks.utils import get_n_letters
|
|
9
|
+
|
|
10
|
+
# The 44 languages covered by the include-base-44 dataset. Each entry serves
# both as a dataset subject name and, via subject_to_language(), as a key in
# the task's LANGUAGE mapping.
INCLUDE_SUBJECTS = [
    "Albanian",
    "Arabic",
    "Armenian",
    "Azerbaijani",
    "Basque",
    "Belarusian",
    "Bengali",
    "Bulgarian",
    "Chinese",
    "Croatian",
    "Dutch",
    "Estonian",
    "Finnish",
    "French",
    "Georgian",
    "German",
    "Greek",
    "Hebrew",
    "Hindi",
    "Hungarian",
    "Indonesian",
    "Italian",
    "Japanese",
    "Kazakh",
    "Korean",
    "Lithuanian",
    "Malay",
    "Malayalam",
    "Nepali",
    "North Macedonian",
    "Persian",
    "Polish",
    "Portuguese",
    "Russian",
    "Serbian",
    "Spanish",
    "Tagalog",
    "Tamil",
    "Telugu",
    "Turkish",
    "Ukrainian",
    "Urdu",
    "Uzbek",
    "Vietnamese",
]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def subject_to_language(subject: str) -> Language:
    """Map an INCLUDE subject name to its ``Language`` enum member.

    Most subjects resolve via ``Language(subject)``; the table below covers
    the members that cannot be looked up by their plain English name.
    """
    # These members are not declared on the base enum's stubs, hence the
    # attribute lookup via getattr (mirrors the original type: ignore usage).
    special_cases = {
        "Greek": "ELL",
        "Malay": "MSA",
        "Nepali": "NEP",
        "North Macedonian": "MKD",
        "Croatian": "HRV",
        "Serbian": "SRP",
    }
    member_name = special_cases.get(subject)
    if member_name is not None:
        return getattr(Language, member_name)
    return Language(subject)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class INCLUDE(BaseTask[str]):
    """INCLUDE dataset: https://huggingface.co/datasets/CohereLabs/include-base-44"""

    NAME = "INCLUDE"
    DATASET_PATH = "CohereLabs/include-base-44"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = INCLUDE_SUBJECTS
    LANGUAGE = {lang: subject_to_language(lang) for lang in INCLUDE_SUBJECTS}

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)

        # Multiple-choice option letters for the four answer slots.
        self.keys = get_n_letters(4)

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return f"The following are multiple choice questions (with answers) in {item['language']}."  # noqa: E501

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["question"].strip()
        options = [item["option_a"], item["option_b"], item["option_c"], item["option_d"]]
        choice_lines = "".join(f"{key}. {choice}\n" for key, choice in zip(self.keys, options))
        return f"Question: {question}\n{choice_lines}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # `answer` is the index of the correct option letter.
        return f" {self.keys[item['answer']]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Leading space so each letter concatenates cleanly after the cue.
        return [f" {key}" for key in self.keys]