eval-framework 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +177 -0
  5. eval_framework/context/eval.py +121 -0
  6. eval_framework/context/local.py +78 -0
  7. eval_framework/evaluation_generator.py +234 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +432 -0
  16. eval_framework/llm/base.py +180 -0
  17. eval_framework/llm/huggingface.py +418 -0
  18. eval_framework/llm/mistral.py +88 -0
  19. eval_framework/llm/models.py +28 -0
  20. eval_framework/llm/openai.py +400 -0
  21. eval_framework/llm/vllm.py +554 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +166 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/aidanbench.py +28 -0
  29. eval_framework/metrics/completion/bleu.py +76 -0
  30. eval_framework/metrics/completion/chrf.py +62 -0
  31. eval_framework/metrics/completion/code_assertion.py +44 -0
  32. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  33. eval_framework/metrics/completion/comet.py +56 -0
  34. eval_framework/metrics/completion/concordance_index.py +38 -0
  35. eval_framework/metrics/completion/csv_format.py +102 -0
  36. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  37. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  38. eval_framework/metrics/completion/f1.py +42 -0
  39. eval_framework/metrics/completion/format_checker.py +56 -0
  40. eval_framework/metrics/completion/grid_difference.py +77 -0
  41. eval_framework/metrics/completion/ifeval.py +73 -0
  42. eval_framework/metrics/completion/json_format.py +179 -0
  43. eval_framework/metrics/completion/language_checker.py +74 -0
  44. eval_framework/metrics/completion/length_control.py +83 -0
  45. eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
  46. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  47. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  48. eval_framework/metrics/completion/repetition.py +88 -0
  49. eval_framework/metrics/completion/rouge_1.py +35 -0
  50. eval_framework/metrics/completion/rouge_2.py +45 -0
  51. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  52. eval_framework/metrics/completion/rouge_l.py +52 -0
  53. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  54. eval_framework/metrics/completion/ter.py +67 -0
  55. eval_framework/metrics/completion/text_counter.py +182 -0
  56. eval_framework/metrics/efficiency/__init__.py +0 -0
  57. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  58. eval_framework/metrics/llm/__init__.py +0 -0
  59. eval_framework/metrics/llm/base.py +34 -0
  60. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  61. eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
  62. eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
  63. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  64. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  65. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  66. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  67. eval_framework/metrics/llm/graders/language.py +56 -0
  68. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  69. eval_framework/metrics/llm/graders/models.py +74 -0
  70. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  71. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  72. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  73. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  74. eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
  75. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  76. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  77. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  78. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  79. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  80. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
  81. eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
  82. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  83. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  84. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  85. eval_framework/metrics/llm/utils.py +20 -0
  86. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  87. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  88. eval_framework/metrics/loglikelihood/base.py +50 -0
  89. eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
  90. eval_framework/metrics/loglikelihood/dcs.py +43 -0
  91. eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
  92. eval_framework/metrics/loglikelihood/ternary.py +42 -0
  93. eval_framework/py.typed +0 -0
  94. eval_framework/response_generator.py +351 -0
  95. eval_framework/result_processors/__init__.py +0 -0
  96. eval_framework/result_processors/base.py +88 -0
  97. eval_framework/result_processors/hf_uploader.py +75 -0
  98. eval_framework/result_processors/result_processor.py +129 -0
  99. eval_framework/result_processors/wandb_uploader.py +137 -0
  100. eval_framework/run.py +369 -0
  101. eval_framework/run_direct.py +42 -0
  102. eval_framework/shared/types.py +227 -0
  103. eval_framework/tasks/__init__.py +6 -0
  104. eval_framework/tasks/base.py +392 -0
  105. eval_framework/tasks/benchmarks/__init__.py +0 -0
  106. eval_framework/tasks/benchmarks/aidanbench.py +211 -0
  107. eval_framework/tasks/benchmarks/arc.py +70 -0
  108. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  109. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  110. eval_framework/tasks/benchmarks/belebele.py +60 -0
  111. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  112. eval_framework/tasks/benchmarks/casehold.py +47 -0
  113. eval_framework/tasks/benchmarks/chembench.py +85 -0
  114. eval_framework/tasks/benchmarks/copa.py +64 -0
  115. eval_framework/tasks/benchmarks/duc.py +91 -0
  116. eval_framework/tasks/benchmarks/flores200.py +133 -0
  117. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  118. eval_framework/tasks/benchmarks/gpqa.py +201 -0
  119. eval_framework/tasks/benchmarks/gsm8k.py +150 -0
  120. eval_framework/tasks/benchmarks/hellaswag.py +69 -0
  121. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  122. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  123. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  124. eval_framework/tasks/benchmarks/include.py +119 -0
  125. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  126. eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
  127. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  128. eval_framework/tasks/benchmarks/mmlu.py +215 -0
  129. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  130. eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
  131. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  132. eval_framework/tasks/benchmarks/openbookqa.py +85 -0
  133. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  134. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  135. eval_framework/tasks/benchmarks/piqa.py +64 -0
  136. eval_framework/tasks/benchmarks/quality.py +56 -0
  137. eval_framework/tasks/benchmarks/sciq.py +110 -0
  138. eval_framework/tasks/benchmarks/sphyr.py +79 -0
  139. eval_framework/tasks/benchmarks/squad.py +211 -0
  140. eval_framework/tasks/benchmarks/struct_eval.py +116 -0
  141. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  142. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  143. eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
  144. eval_framework/tasks/benchmarks/winogender.py +64 -0
  145. eval_framework/tasks/benchmarks/winogrande.py +69 -0
  146. eval_framework/tasks/benchmarks/winox.py +57 -0
  147. eval_framework/tasks/benchmarks/wmt.py +160 -0
  148. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  149. eval_framework/tasks/eval_config.py +136 -0
  150. eval_framework/tasks/perturbation.py +83 -0
  151. eval_framework/tasks/registry.py +186 -0
  152. eval_framework/tasks/task_loader.py +81 -0
  153. eval_framework/tasks/task_names.py +324 -0
  154. eval_framework/tasks/utils.py +584 -0
  155. eval_framework/utils/constants.py +9 -0
  156. eval_framework/utils/file_ops.py +245 -0
  157. eval_framework/utils/generate_task_docs.py +244 -0
  158. eval_framework/utils/helpers.py +32 -0
  159. eval_framework/utils/logging.py +62 -0
  160. eval_framework/utils/packaging.py +52 -0
  161. eval_framework/utils/tqdm_handler.py +14 -0
  162. eval_framework-0.2.7.dist-info/METADATA +548 -0
  163. eval_framework-0.2.7.dist-info/RECORD +170 -0
  164. eval_framework-0.2.7.dist-info/WHEEL +4 -0
  165. eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
  166. template_formatting/README.md +83 -0
  167. template_formatting/__init__.py +0 -0
  168. template_formatting/formatter.py +537 -0
  169. template_formatting/mistral_formatter.py +159 -0
  170. template_formatting/py.typed +0 -0
@@ -0,0 +1,302 @@
1
+ import os
2
+ import re
3
+ from abc import ABC
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ from datasets import DownloadConfig, Features, Sequence, Value, load_dataset
8
+
9
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
10
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import AccuracyLoglikelihood
11
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
12
+
13
+
14
class InfiniteBench(BaseTask[str], ABC):
    """
    Common base for every InfiniteBench long-context task in this module.

    InfiniteBench: Extending Long Context Evaluation Beyond 100K Tokens
    https://github.com/OpenBMB/InfiniteBench
    """

    DATASET_PATH = "xinrongzhang2022/InfiniteBench"
    SUBJECTS = ["default"]
    LANGUAGE = Language.ENG
    PERTURBATION_UNMODIFIABLE_WORDS = None

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task; any non-zero ``num_fewshot`` trips the assertion below."""
        assert num_fewshot == 0, "Few-shots are not supported for long-context InfiniteBench tasks"
        super().__init__(num_fewshot)

    def _load_hf_dataset(self, **kwargs: Any) -> Any:
        """Load the dataset with an explicit column schema, retrying with a fallback cache.

        The first attempt uses the directory named by ``HF_DATASET_CACHE_DIR``
        (or the Hugging Face default under the home directory) together with a
        download config allowing 5 retries. If that attempt raises any
        ``Exception``, a second attempt is made against
        ``~/.cache/eval-framework`` without the download config.

        Args:
            **kwargs: Forwarded unchanged to ``load_dataset``.

        Returns:
            Whatever ``load_dataset`` returns.
        """
        primary_cache: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
        retrying_download = DownloadConfig(cache_dir=primary_cache, max_retries=5)
        schema = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )
        # Options shared by both attempts. They are unpacked next to **kwargs so
        # that a caller-supplied duplicate still raises TypeError, exactly as
        # explicit keyword arguments would.
        shared_options = {"trust_remote_code": True, "features": schema}
        try:
            return load_dataset(
                **kwargs, **shared_options, cache_dir=primary_cache, download_config=retrying_download
            )
        except Exception:
            # Best-effort retry against the framework's own cache directory.
            fallback_cache = f"{Path.home()}/.cache/eval-framework"
            return load_dataset(**kwargs, **shared_options, cache_dir=fallback_cache)
52
+
53
+
54
class InfiniteBenchLoglikelihood(InfiniteBench, ABC):
    """Shared behaviour for the multiple-choice (loglikelihood) InfiniteBench tasks."""

    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the item's context and its input, separated by a blank line."""
        context = item["context"]
        question = item["input"]
        return f"{context}\n\n{question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the first listed answer prefixed with a single space.

        The assertion checks that this answer is one of the item's options.
        """
        gold = item["answer"][0]
        assert gold in item["options"], f"Ground truth {item['answer']} is not in {item['options']}"
        return f" {gold}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every option prefixed with a single space, mirroring the ground-truth format."""
        candidates = item["options"]
        return [f" {option}" for option in candidates]
69
+
70
+
71
class InfiniteBenchCompletion(InfiniteBench, ABC):
    """Shared behaviour for the free-text (completion) InfiniteBench tasks."""

    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the item's context and its input, separated by a blank line."""
        context = item["context"]
        question = item["input"]
        return f"{context}\n\n{question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the item's ``answer`` field unchanged."""
        reference = item["answer"]
        return reference
82
+
83
+
84
class InfiniteBench_CodeDebug(InfiniteBenchLoglikelihood):
    """Multiple-choice task: find which function in a code repo contains a crashing error."""

    NAME = "InfiniteBench_CodeDebug"
    # Samples and few-shot examples are declared against the same split.
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "code_debug"
90
+
91
+
92
class InfiniteBench_EnMC(InfiniteBenchLoglikelihood):
    """Multiple-choice questions derived from the fake book."""

    NAME = "InfiniteBench_EnMC"
    # Samples and few-shot examples are declared against the same split.
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "longbook_choice_eng"
98
+
99
+
100
class InfiniteBench_CodeRun(InfiniteBenchCompletion):
    """Simulating execution of multiple simple, synthetic functions."""

    NAME = "InfiniteBench_CodeRun"
    SAMPLE_SPLIT = "code_run"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    # Compiled once instead of on every call. The captured token must contain
    # at least one digit, so a stray "." or "," directly after the marker
    # phrase can no longer be returned as the "number".
    _ANSWER_RE = re.compile(r"The return value is: (-?[.,]*[0-9][0-9.,]*)")

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 1.3

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the number following "The return value is: " from a completion.

        Args:
            completion_text: Raw text produced by the model.
            sample: Unused here; accepted for interface compatibility.

        Returns:
            The numeric token with any trailing "." or "," removed (so
            "... is: 42." yields "42"), or "[invalid]" when the marker phrase
            followed by a number is not found before the first stop sequence.
        """
        # Discard everything from the first occurrence of each stop sequence.
        for stop_sequence in self.stop_sequences:
            completion_text = completion_text.split(stop_sequence, 1)[0]

        match = self._ANSWER_RE.search(completion_text)
        if match is None:
            return "[invalid]"
        # Trailing sentence punctuation is not part of the value.
        return match.group(1).rstrip(".,")
124
+
125
+
126
class InfiniteBench_EnDia(InfiniteBenchCompletion):
    """Identification of talkers in partially anonymized scripts."""

    NAME = "InfiniteBench_EnDia"
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "longdialogue_qa_eng"

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 3.4

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return every reference answer lower-cased."""
        return [name.lower() for name in item["answer"]]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return context, a blank line, the input, and a trailing newline."""
        context = item["context"]
        question = item["input"]
        return f"{context}\n\n{question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue phrase for this task."""
        return "The character which is $$MASK$$ is:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at the first stop sequence and lower-case what remains."""
        truncated = completion_text
        for terminator in self.stop_sequences:
            # partition() keeps only the text before the first occurrence.
            truncated = truncated.partition(terminator)[0]
        return truncated.lower()
154
+
155
+
156
class InfiniteBench_EnQA(InfiniteBenchCompletion):
    """Free-form question answering based on the fake book."""

    NAME = "InfiniteBench_EnQA"
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "longbook_qa_eng"

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 4.8

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the context followed by the input on the next line."""
        context = item["context"]
        question = item["input"]
        return f"{context}\n{question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return every reference answer with double quotes removed and lower-cased."""
        return [reference.replace('"', "").lower() for reference in item["answer"]]

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at the first stop sequence and lower-case what remains."""
        truncated = completion_text
        for terminator in self.stop_sequences:
            # partition() keeps only the text before the first occurrence.
            truncated = truncated.partition(terminator)[0]
        return truncated.lower()
181
+
182
+
183
class InfiniteBench_MathFind(InfiniteBenchCompletion):
    """Finding special integers in a lengthy list."""

    NAME = "InfiniteBench_MathFind"
    SAMPLE_SPLIT = "math_find"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    # Compiled once instead of on every call. The token must contain at least
    # one digit: the previous pattern accepted a lone "," or ".", so
    # "Sure, the answer is 42" produced "," instead of "42".
    _ANSWER_RE = re.compile(r"-?[.,]*[0-9][0-9.,]*")

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 1.3

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the first number appearing in a completion.

        Args:
            completion_text: Raw text produced by the model.
            sample: Unused here; accepted for interface compatibility.

        Returns:
            The first numeric token (optionally negative, may contain "." and
            ",") with trailing "." or "," removed, or "[invalid]" when no
            digit appears before the first stop sequence.
        """
        # Discard everything from the first occurrence of each stop sequence.
        for stop_sequence in self.stop_sequences:
            completion_text = completion_text.split(stop_sequence, 1)[0]

        match = self._ANSWER_RE.search(completion_text)
        if match is None:
            return "[invalid]"
        # Trailing sentence punctuation ("... is 42.") is not part of the value.
        return match.group(0).rstrip(".,")
208
+
209
+
210
class InfiniteBench_RetrieveKV2(InfiniteBenchCompletion):
    """Finding the corresponding value from a dictionary and a key."""

    NAME = "InfiniteBench_RetrieveKV2"
    SAMPLE_SPLIT = "kv_retrieval"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    # Preferred pattern: a stand-alone 36-character token in the 8-4-4-4-12
    # lowercase-hex layout.
    # NOTE(review): assumes the 36-character answers are UUID-shaped — confirm
    # against the dataset. If they are not, the loose fallback below still applies.
    _UUID_RE = re.compile(
        r"(?<![0-9a-f\-])[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}(?![0-9a-f\-])"
    )
    # Legacy pattern, kept as a fallback. On its own it matches the first
    # a-f letter of ordinary words (e.g. the "e" in "The value is ...").
    _LOOSE_RE = re.compile(r"([0-9a-f\-]+)")

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 40  # Avg Output Tokens: 22.7 (all answers are 36 chars)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the context followed by the input on the next line."""
        return f"{item['context']}\n{item['input']}"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the retrieved value from a completion.

        Args:
            completion_text: Raw text produced by the model.
            sample: Unused here; accepted for interface compatibility.

        Returns:
            The first stand-alone UUID-shaped token if one exists; otherwise
            the first run of lowercase hex characters and dashes (the previous
            behaviour); otherwise "[invalid]".
        """
        # Discard everything from the first occurrence of each stop sequence.
        for stop_sequence in self.stop_sequences:
            completion_text = completion_text.split(stop_sequence, 1)[0]

        # A well-formed value wins even when preceded by words such as
        # "The value is", whose letters the loose pattern would match first.
        strict = self._UUID_RE.search(completion_text)
        if strict is not None:
            return strict.group(0)

        loose = self._LOOSE_RE.search(completion_text)
        if loose is not None:
            return loose.group(1).strip()
        return "[invalid]"
237
+
238
+
239
class InfiniteBench_RetrieveNumber(InfiniteBenchCompletion):
    """Locating repeated hidden numbers in a noisy long context."""

    NAME = "InfiniteBench_RetrieveNumber"
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "number_string"

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 12  # Avg Output Tokens: 4.0 (all answers are 10 digits integers)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return context, the input on the next line, and a trailing newline."""
        context = item["context"]
        question = item["input"]
        return f"{context}\n{question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue phrase for this task."""
        return "The sequence of digits is:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Return the first run of digits before any stop sequence, or "[invalid]" if there is none."""
        truncated = completion_text
        for terminator in self.stop_sequences:
            # partition() keeps only the text before the first occurrence.
            truncated = truncated.partition(terminator)[0]

        digits = re.search(r"([0-9]+)", truncated)
        return digits.group(1).strip() if digits else "[invalid]"
270
+
271
+
272
class InfiniteBench_RetrievePassKey1(InfiniteBenchCompletion):
    """Retrieving hidden keys in a noisy long context."""

    NAME = "InfiniteBench_RetrievePassKey1"
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "passkey"

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and set its generation limits."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 8  # Avg Output Tokens: 2.0 (all answers are 5 digits integers)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return context, the input on the next line, and a trailing newline."""
        context = item["context"]
        question = item["input"]
        return f"{context}\n{question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue phrase for this task."""
        return "The pass key is:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Return the first run of digits before any stop sequence, or "[invalid]" if there is none."""
        truncated = completion_text
        for terminator in self.stop_sequences:
            # partition() keeps only the text before the first occurrence.
            truncated = truncated.partition(terminator)[0]

        digits = re.search(r"([0-9]+)", truncated)
        return digits.group(1).strip() if digits else "[invalid]"