eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
development/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This file makes the 'development' directory a Python package.
|
|
@@ -0,0 +1,628 @@
|
|
|
1
|
+
"""Normalizes various coding dataset formats from SandboxFusion.
|
|
2
|
+
|
|
3
|
+
This script converts datasets into a unified OpenAI-compatible JSONL format.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import ast
|
|
7
|
+
import hashlib
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
from typing import ( # Removed Callable, Union. Added List, TypedDict
|
|
11
|
+
Any,
|
|
12
|
+
Dict,
|
|
13
|
+
List,
|
|
14
|
+
Optional,
|
|
15
|
+
TypedDict,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from transformers import AutoTokenizer # For Repobench-P
|
|
19
|
+
|
|
20
|
+
# Define the root path to the SandboxFusion sample datasets
|
|
21
|
+
# (Relative to the reward-kit project root)
|
|
22
|
+
SANDBOX_SAMPLES_DIR = "./SandboxFusion/sandbox/tests/datasets/samples/"
|
|
23
|
+
|
|
24
|
+
# List of Python-specific dataset .jsonl files
|
|
25
|
+
PYTHON_SPECIFIC_JSONL_FILES = [
|
|
26
|
+
"code_eval_shadow_humaneval_python.jsonl",
|
|
27
|
+
"code_eval_mbpp.jsonl",
|
|
28
|
+
"code_eval_mhpp.jsonl",
|
|
29
|
+
"code_eval_ncb_python_en.jsonl",
|
|
30
|
+
"code_eval_ncb_python_zh.jsonl",
|
|
31
|
+
"code_eval_repobench_c_python_sampled.jsonl", # RepoBench-C, Python subset
|
|
32
|
+
"code_eval_repobench_p_python_sampled.jsonl", # RepoBench-P, Python subset
|
|
33
|
+
"code_eval_cruxeval.jsonl", # Python by default
|
|
34
|
+
"code_eval_cruxeval_x.jsonl", # Multilingual, filter for Python
|
|
35
|
+
"code_eval_aider_benchmark_v1.jsonl",
|
|
36
|
+
"code_eval_bigcodebench.jsonl",
|
|
37
|
+
"code_eval_EvoEval.jsonl",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
# List of multilingual dataset .jsonl files that need filtering for Python
|
|
41
|
+
MULTILINGUAL_JSONL_FILES_TO_FILTER = [
|
|
42
|
+
"code_eval_mbxp_v1_en.jsonl",
|
|
43
|
+
"code_eval_humanevalds_v1_en.jsonl",
|
|
44
|
+
"code_eval_humanevalds_v2_en.jsonl",
|
|
45
|
+
"code_eval_mbxp_v2_en.jsonl",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
ALL_SOURCE_JSONL_FILES = (
|
|
49
|
+
PYTHON_SPECIFIC_JSONL_FILES + MULTILINGUAL_JSONL_FILES_TO_FILTER
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Output file path
|
|
53
|
+
OUTPUT_JSONL_FILE = "./development/CODING_DATASET.jsonl"
|
|
54
|
+
|
|
55
|
+
# --- Helper for Repobench-P ---
|
|
56
|
+
# Global tokenizer instance for Repobench-P to avoid reloading it for each problem
|
|
57
|
+
# Note: This assumes "assets/tokenizer/gpt2" is accessible relative to the execution path.
|
|
58
|
+
# Eagerly load the gpt2 tokenizer at import time. If loading fails, all
# Repobench-P token counting degrades to a whitespace-split approximation
# (see count_tokens_for_repobench_p / decode_tokens_for_repobench_p).
try:
    repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
except OSError:
    # OSError is what AutoTokenizer.from_pretrained raises for missing
    # local assets or failed downloads.
    print(
        "Warning: Could not load gpt2 tokenizer for Repobench-P. "
        "Falling back to basic split for token counting."
    )
    repobench_p_tokenizer = None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def count_tokens_for_repobench_p(text: str) -> int:
    """Return the token count of *text* for Repobench-P prompt budgeting.

    Uses the module-level gpt2 tokenizer when it loaded successfully;
    otherwise approximates the count with a whitespace split.
    """
    if repobench_p_tokenizer is None:
        # Crude approximation used only when the tokenizer failed to load.
        return len(text.split())
    return len(repobench_p_tokenizer.encode(text))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def decode_tokens_for_repobench_p(tokens: list) -> str:
    """Turn a sequence of token ids back into text for Repobench-P.

    When the gpt2 tokenizer is unavailable, falls back to joining the
    raw ids with spaces (lossy, but keeps the pipeline running).
    """
    if repobench_p_tokenizer is None:
        return " ".join(str(token) for token in tokens)
    return repobench_p_tokenizer.decode(tokens)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def comment_repobench_p_snippet(code: str, language: str):
    """Prefix every line of *code* with a comment marker for *language*.

    Only Python ("# " prefix) is handled; snippets in any other language
    are returned unchanged.
    """
    if language != "python":
        # Focus is on Python; other languages pass through untouched.
        return code
    commented_lines = [f"# {source_line}" for source_line in code.split("\n")]
    return "\n".join(commented_lines)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# --- End Helper for Repobench-P ---
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ContextInfo(TypedDict):
    """Metadata for one Repobench-P context snippet, used for prompt packing."""

    # Fully formatted snippet: path header plus the commented-out code.
    text: str
    # Token count of `text`, as computed by count_tokens_for_repobench_p.
    tokens: int
    # Index of the snippet in the problem's original "context" list.
    original_index: int
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def extract_python_docstring(code_string: str) -> Optional[str]:
    """Return the docstring of the first top-level def/class in *code_string*.

    Returns None when the source does not parse, when no function or
    class appears at the top level, or when the first one found carries
    no docstring.
    """
    try:
        tree = ast.parse(code_string.strip())
    except SyntaxError:
        return None
    for node in tree.body:
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            continue
        # Stop at the first definition, mirroring how HumanEval-style
        # prompts place the target function first.
        doc = ast.get_docstring(node)
        return doc.strip() if doc else None
    return None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def format_aider_prompt(problem_json: dict) -> str:
    """Build the user prompt for an Aider-benchmark problem.

    Appends an instruction asking the model to answer inside a fenced
    ```python code block.
    """
    task_text = problem_json.get("content", "")
    instruction_suffix = (
        "\n\nPlease generate the code in the following format:\n"
        "```python\n# Your code response here\n```"
    )
    return task_text + instruction_suffix
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def format_mbpp_prompt(problem_json: dict) -> str:
    """Build the user prompt for MBPP / MBXP style problems.

    The test assertions come from the top-level "test_list" key (MBPP)
    or, failing that, from labels["test_list"] (MBXP).
    """
    task_text = problem_json.get("content", "")
    tests = problem_json.get("test_list", [])  # MBPP specific
    labels = problem_json.get("labels")
    if not tests and isinstance(labels, dict):
        # MBXP stores the test list inside the labels dict instead.
        tests = labels.get("test_list", [])
    joined_tests = "\n".join(tests)
    return (
        f"You are an expert Python programmer, and here is your task: {task_text} "
        f"Your code should pass these tests:\n\n{joined_tests}"
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def format_repobench_p_prompt(problem_json: dict, lang: str = "python") -> str:
    """Format the prompt for Repobench-P style problems.

    Simplified port of Repobench-P's _generate_single_prompt: the prompt
    is the selected cross-file context snippets (commented out), then
    the gold snippet (if it fits), then the tail of the current file —
    so the completion point sits at the very end of the prompt.

    Args:
        problem_json: Record with "code" (current file), "file_path",
            "import_statement", "context" (list of {"path", "snippet"})
            and optionally "gold_snippet_index".
        lang: Source language; only "python" gets full handling.

    Returns:
        The assembled prompt string, trimmed to the token budget.
    """
    # Using the gpt2 tokenizer as a stand-in for the original assets.
    max_prompt_length_tokens = 8000  # overall prompt budget
    current_file_max_tokens = 1600  # budget for the current file's code

    code_context = problem_json.get("code", "")
    file_path = problem_json.get("file_path", "unknown_file.py")
    import_statement = problem_json.get("import_statement", "")

    # Current file: keep only the last 60 lines, headed by path + imports.
    code_snippet = "\n".join(code_context.split("\n")[-60:])
    if lang == "python":
        code_snippet = f"# Path: {file_path}\n{import_statement}\n{code_snippet}"
    # (Add other lang handling if needed)

    code_tokens = count_tokens_for_repobench_p(code_snippet)
    if code_tokens > current_file_max_tokens:
        if repobench_p_tokenizer:
            # Keep the last current_file_max_tokens tokens (suffix matters
            # most for completion).
            encoded_tokens = repobench_p_tokenizer.encode(code_snippet)[
                -current_file_max_tokens:
            ]
            code_snippet = decode_tokens_for_repobench_p(encoded_tokens)
        else:
            # Rough character-based trim when the tokenizer is unavailable
            # (~4 chars per token heuristic).
            code_snippet = code_snippet[-int(current_file_max_tokens * 4) :]

    current_prompt_tokens = count_tokens_for_repobench_p(code_snippet)
    final_prompt_parts: List[str] = [code_snippet]  # current code stays last

    # Format each cross-file context snippet with a path header.
    contexts_info: List[ContextInfo] = []
    raw_contexts = problem_json.get("context", [])
    if isinstance(raw_contexts, list):
        for i, ctx_item in enumerate(raw_contexts):
            if not isinstance(ctx_item, dict):
                continue
            snippet_path = ctx_item.get("path", "unknown_context_file.py")
            snippet_content = ctx_item.get("snippet", "")

            commented_snippet = comment_repobench_p_snippet(snippet_content, lang)

            if lang == "python":
                formatted_snippet = f"# Path: {snippet_path}\n{commented_snippet}\n"
            else:
                formatted_snippet = f"// Path: {snippet_path}\n{commented_snippet}\n"

            contexts_info.append(
                {
                    "text": formatted_snippet,
                    "tokens": count_tokens_for_repobench_p(formatted_snippet),
                    "original_index": i,
                }
            )

    # The gold snippet (if any, and if it fits) goes directly before the
    # current file's code.
    gold_snippet_idx = problem_json.get("gold_snippet_index", -1)
    if isinstance(gold_snippet_idx, int) and 0 <= gold_snippet_idx < len(contexts_info):
        gold_snippet_info = next(
            (c for c in contexts_info if c["original_index"] == gold_snippet_idx), None
        )
        if gold_snippet_info and (
            current_prompt_tokens + gold_snippet_info["tokens"]
            <= max_prompt_length_tokens
        ):
            final_prompt_parts.insert(0, gold_snippet_info["text"])
            current_prompt_tokens += gold_snippet_info["tokens"]
            contexts_info = [
                c for c in contexts_info if c["original_index"] != gold_snippet_idx
            ]  # remove from further processing

    # Remaining contexts are added in md5-of-text order until the budget
    # is exhausted.
    contexts_info.sort(
        key=lambda x: hashlib.md5(str(x["text"]).encode("utf8")).hexdigest()
    )

    for ctx_info in contexts_info:
        if current_prompt_tokens + ctx_info["tokens"] <= max_prompt_length_tokens:
            final_prompt_parts.insert(0, ctx_info["text"])
            current_prompt_tokens += ctx_info["tokens"]
        else:
            break  # token limit reached

    # BUG FIX: parts were inserted at index 0, so the list is ALREADY in
    # contexts -> gold -> current-code order. The original additionally
    # reversed it, which put the current file's code first and the
    # contexts after the completion point. Join as-is instead.
    return "".join(final_prompt_parts)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def format_cruxeval_output_prompt(problem_json: dict) -> str:
    """Format the prompt for CruxEval output prediction tasks.

    Builds the "direct output prompt" (cruxeval.py style): two solved
    few-shot examples, then the actual problem's code and an assertion
    for the model to complete inside an opened [ANSWER] tag.

    Args:
        problem_json: Record with "code" (the function under test) and
            "input" (string representation of the argument passed to f).

    Returns:
        The full few-shot prompt string.
    """
    code = problem_json.get("code", "")
    test_input = problem_json.get("input", "")  # This is the input to f()

    # test_input is expected to already be a valid Python-literal string
    # representation, so it is interpolated verbatim below.

    return (
        "You are given a Python function and an assertion containing an input to "
        "the function. Complete the assertion with a literal (no unsimplified "
        "expressions, no function calls) containing the output when executing the "
        "provided code on the given input, even if the function is incorrect or "
        "incomplete. Do NOT output any extra information. Provide the full assertion "
        "with the correct output in [ANSWER] and [/ANSWER] tags, "
        "following the examples.\n\n"
        "[PYTHON]\n"
        "def f(n):\n"
        " return n\n"
        "assert f(17) == ??\n"
        "[/PYTHON]\n"
        "[ANSWER]\n"
        "assert f(17) == 17\n"
        "[/ANSWER]\n\n"
        "[PYTHON]\n"
        "def f(s):\n"
        ' return s + "a"\n'
        'assert f("x9j") == ??\n'
        "[/PYTHON]\n"
        "[ANSWER]\n"
        'assert f("x9j") == "x9ja"\n'
        "[/ANSWER]\n\n"
        # BUG FIX: the original wrote {{test_input}}, which an f-string
        # renders as the literal text "{test_input}"; the fetched value
        # was never interpolated. Use {test_input} so the actual input
        # appears in the assertion.
        f"[PYTHON]\n{code}\nassert f({test_input}) == ??\n[/PYTHON]\n[ANSWER]\n"
    )
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def format_cruxeval_output_assistant(problem_json: dict) -> str:
    """Render the gold assertion for a CruxEval output-prediction task.

    Both "input" and "output" are expected to already be string
    representations of Python literals, so they are interpolated
    verbatim into the assertion.
    """
    fn_input = problem_json.get("input", "")
    fn_output = problem_json.get("output", "")  # the value f() returns
    return "assert f({}) == {}".format(fn_input, fn_output)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def normalize_problem_to_openai_format(
    problem_json: dict, filename: str, is_multilingual_file: bool
) -> Optional[Dict[str, Any]]:
    """Normalize a problem from various dataset formats to the OpenAI messages format.

    Handles dataset-specific prompt engineering and language filtering.

    Args:
        problem_json: One parsed JSONL record from a source dataset.
        filename: Source file name; substring checks on it select the
            dataset-specific branch below (aider / mbpp / mhpp / ncb /
            repobench / cruxeval / humaneval-family / mbxp / fallback).
        is_multilingual_file: True when the source file mixes languages
            and non-Python records must be dropped.

    Returns:
        A dict with a two-message "messages" list (user then assistant),
        or None when the record is skipped (missing or non-string
        content, wrong language, placeholder solution, or any error —
        every failure path prints a warning rather than raising).
    """
    problem_id_str = str(problem_json.get("id", "N/A"))
    try:
        # Robust key finding from the original script: the first key
        # present with a string value wins, in priority order.
        user_content_keys = [
            "content",
            "prompt",
            "problem",
            "text",
            "code",
        ]  # Added "code" as a fallback
        assistant_content_keys = [
            "canonical_solution",
            "solution",
            "code",
            "completion",
            "next_line",
            "output",
        ]  # Added "next_line", "output"

        raw_user_content = None
        primary_user_key_was_wrong_type = False
        for key_idx, key in enumerate(user_content_keys):
            if key in problem_json:
                if isinstance(problem_json[key], str):
                    raw_user_content = problem_json[key]
                    break
                elif (
                    key_idx == 0 and key == user_content_keys[0]
                ):  # Only log if primary 'content' is wrong type
                    primary_user_key_was_wrong_type = True

        raw_assistant_content = None
        for key_idx, key in enumerate(assistant_content_keys):
            if key in problem_json:
                if isinstance(
                    problem_json[key], (str, int, float, bool, list, dict)
                ):  # Allow more types for raw_assistant
                    raw_assistant_content = problem_json[key]
                    if isinstance(
                        raw_assistant_content, (int, float, bool, list, dict)
                    ):  # Convert non-strings for now
                        raw_assistant_content = str(raw_assistant_content)
                    break
                elif key_idx == 0 and key == assistant_content_keys[0]:
                    pass

        if raw_user_content is None:
            if primary_user_key_was_wrong_type:
                print(
                    f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                    f"primary user key '{user_content_keys[0]}' not a string."
                )
            else:
                print(
                    f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                    f"missing user content (keys: {user_content_keys})."
                )
            return None

        # "labels" may arrive as an embedded JSON string or as a dict;
        # normalize it to a dict (empty on absence).
        labels_data = problem_json.get("labels")
        labels = {}
        if isinstance(labels_data, str):
            try:
                labels = json.loads(labels_data)
            except json.JSONDecodeError:
                print(
                    f"Warning: Skipping ID {problem_id_str} in (unknown) "
                    "- malformed JSON in labels."
                )
                return None
        elif isinstance(labels_data, dict):
            labels = labels_data

        # Determine the record's language: labels first, then the file
        # name, then (for cruxeval_x) the id prefix like "python_...".
        programming_language = labels.get(
            "programming_language", "python" if "python" in filename else None
        )
        if (
            not programming_language
            and "cruxeval_x" in filename
            and isinstance(problem_json.get("id"), str)
        ):
            lang_part = problem_json["id"].split("_")[0]
            if lang_part in ["python", "py"]:
                programming_language = "python"

        # Multilingual sources only contribute their Python records.
        if is_multilingual_file or "cruxeval_x" in filename:
            if programming_language != "python":
                return None

        final_user_content = raw_user_content
        final_assistant_content = (
            str(raw_assistant_content) if raw_assistant_content is not None else ""
        )

        # Dataset-specific formatting. Branch order matters: e.g. "mbpp"
        # must exclude "mbxp", and repobench_c/_p are distinguished here.
        if "aider_benchmark" in filename:
            final_user_content = format_aider_prompt(problem_json)
        elif "mbpp" in filename and "mbxp" not in filename:
            final_user_content = format_mbpp_prompt(problem_json)
            test_setup_code = labels.get("test_setup_code", "")
            if (
                test_setup_code
                and isinstance(test_setup_code, str)
                and test_setup_code not in final_assistant_content
            ):
                # Prepend setup (e.g. imports) the solution relies on.
                final_assistant_content = (
                    test_setup_code.strip() + "\n\n" + final_assistant_content
                )
        elif "mhpp" in filename:
            # MHPP: trim the prompt at the closing docstring quotes and
            # append the first test line as an inline example.
            original_content_for_mhpp = problem_json.get("content", "")
            first_line_of_test = ""
            if problem_json.get("test") and isinstance(problem_json["test"], str):
                first_line_of_test = problem_json["test"].split("\n")[0]
            prompt_stub = original_content_for_mhpp
            if '"""' in prompt_stub:
                prompt_stub = prompt_stub[: prompt_stub.rfind('"""')]
            final_user_content = f'{prompt_stub}\n e.g. {first_line_of_test} """'
            # If the solution is only a body (no def/class), glue it to
            # the original header so it is runnable stand-alone code.
            if not (
                "def " in final_assistant_content.strip()
                or "class " in final_assistant_content.strip()
            ):
                if original_content_for_mhpp.rstrip().endswith(":"):
                    final_assistant_content = (
                        original_content_for_mhpp.rstrip()
                        + "\n"
                        + final_assistant_content
                    )
                elif original_content_for_mhpp.endswith("\n"):
                    final_assistant_content = (
                        original_content_for_mhpp + final_assistant_content
                    )
                else:
                    final_assistant_content = (
                        original_content_for_mhpp + "\n" + final_assistant_content
                    )
        elif "ncb_python" in filename:
            final_user_content = problem_json.get("content", raw_user_content)
            final_assistant_content = problem_json.get(
                "canonical_solution", raw_assistant_content
            )
        elif "repobench_c" in filename:
            final_user_content = problem_json.get("prompt", raw_user_content)
            final_assistant_content = problem_json.get(
                "next_line", raw_assistant_content
            )
        elif "repobench_p" in filename:
            final_user_content = format_repobench_p_prompt(problem_json, lang="python")
            final_assistant_content = problem_json.get(
                "next_line", raw_assistant_content
            )
        elif "cruxeval" in filename:
            final_user_content = format_cruxeval_output_prompt(problem_json)
            final_assistant_content = format_cruxeval_output_assistant(problem_json)
        elif (
            "humaneval" in filename
            or "evoeval" in filename
            or "bigcodebench" in filename
            or (
                is_multilingual_file
                and (
                    "humanevalds" in filename
                    or labels.get("task_id", "").startswith("humanevalds")
                )
            )
        ):
            # HumanEval family: prefer the docstring as the user prompt
            # and, if the solution is only a body, prepend the original
            # function header (same glue logic as the MHPP branch).
            extracted_docstring = extract_python_docstring(raw_user_content)
            if extracted_docstring:
                final_user_content = extracted_docstring
                if not (
                    "def " in final_assistant_content.strip()
                    or "class " in final_assistant_content.strip()
                ):
                    if raw_user_content.rstrip().endswith(":"):
                        final_assistant_content = (
                            raw_user_content.rstrip() + "\n" + final_assistant_content
                        )
                    elif raw_user_content.endswith("\n"):
                        final_assistant_content = (
                            raw_user_content + final_assistant_content
                        )
                    else:
                        final_assistant_content = (
                            raw_user_content + "\n" + final_assistant_content
                        )
            else:
                final_user_content = raw_user_content
        elif is_multilingual_file and (
            "mbxp" in filename or labels.get("task_id", "").startswith("mbxp")
        ):
            final_user_content = format_mbpp_prompt(problem_json)
            test_setup_code = labels.get("test_setup_code", "")
            if (
                test_setup_code
                and isinstance(test_setup_code, str)
                and test_setup_code not in final_assistant_content
            ):
                final_assistant_content = (
                    test_setup_code.strip() + "\n\n" + final_assistant_content
                )
        else:
            # Generic fallback: same docstring-extraction treatment as
            # the HumanEval family above.
            extracted_docstring = extract_python_docstring(raw_user_content)
            if extracted_docstring:
                final_user_content = extracted_docstring
                if not (
                    "def " in final_assistant_content.strip()
                    or "class " in final_assistant_content.strip()
                ):
                    if raw_user_content.rstrip().endswith(":"):
                        final_assistant_content = (
                            raw_user_content.rstrip() + "\n" + final_assistant_content
                        )
                    elif raw_user_content.endswith("\n"):
                        final_assistant_content = (
                            raw_user_content + final_assistant_content
                        )
                    else:
                        final_assistant_content = (
                            raw_user_content + "\n" + final_assistant_content
                        )

        # Final validation: both sides must be non-empty strings and the
        # solution must not be the known placeholder.
        if not isinstance(final_user_content, str) or not isinstance(
            final_assistant_content, str
        ):
            print(
                f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                f"invalid content types (user: {type(final_user_content)}, "
                f"assistant: {type(final_assistant_content)})."
            )
            return None
        if not final_user_content.strip() or not final_assistant_content.strip():
            print(
                f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                "empty processed content."
            )
            return None
        if final_assistant_content.strip() == "import sys; sys.exit(0)":
            print(
                f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                "placeholder solution."
            )
            return None

        return {
            "messages": [
                {"role": "user", "content": final_user_content.strip()},
                {"role": "assistant", "content": final_assistant_content.strip()},
            ]
        }
    except Exception as e:
        # Broad catch is deliberate: a single bad record must not abort
        # the whole normalization run. The traceback is still printed.
        print(
            f"Warning: Skipping ID {problem_id_str} in (unknown) - "
            f"error ({type(e).__name__}: {e})."
        )
        import traceback

        traceback.print_exc()
        return None
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
def main():
    """Process SandboxFusion datasets and normalize them to OpenAI JSONL format.

    This function iterates through specified dataset files, normalizes each problem
    to an OpenAI-compatible format, and writes the results to an output JSONL file.
    It handles language filtering for multilingual datasets and logs errors or
    skipped problems.
    """
    # NOTE(review): several log messages below print the literal text
    # "(unknown)" where a file name would be expected — presumably a
    # scrubbed placeholder; confirm against the original script.
    output_dir = os.path.dirname(OUTPUT_JSONL_FILE)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Run-wide tallies, aggregated across all source files.
    processed_count = 0
    skipped_count = 0
    file_error_count = 0

    print(
        f"Starting dataset normalization. Output will be written to {OUTPUT_JSONL_FILE}"
    )

    with open(OUTPUT_JSONL_FILE, "w", encoding="utf-8") as outfile:
        for filename_idx, filename in enumerate(ALL_SOURCE_JSONL_FILES):
            filepath = os.path.join(SANDBOX_SAMPLES_DIR, filename)
            is_multilingual = filename in MULTILINGUAL_JSONL_FILES_TO_FILTER

            # A missing source file is logged and counted, not fatal.
            if not os.path.exists(filepath):
                print(f"Warning: File not found, skipping: {filepath}")
                file_error_count += 1
                continue

            print(
                f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: "
                f"(unknown)..."
            )
            # Per-file tallies, folded into the run totals on success.
            lines_in_file = 0
            processed_in_file = 0
            skipped_in_file = 0
            try:
                with open(filepath, "r", encoding="utf-8") as infile:
                    for line_number, line in enumerate(infile, 1):
                        lines_in_file += 1
                        stripped_line = line.strip()
                        if not stripped_line:
                            continue
                        # Malformed lines are skipped individually so one
                        # bad record does not sink the file.
                        try:
                            problem_data = json.loads(stripped_line)
                        except json.JSONDecodeError:
                            print(
                                f"Warning: Malformed JSON on line {line_number} "
                                f"in {filepath}. Skipping line."
                            )
                            skipped_in_file += 1
                            continue

                        normalized_problem = normalize_problem_to_openai_format(
                            problem_data, filename, is_multilingual
                        )
                        if normalized_problem:
                            outfile.write(json.dumps(normalized_problem) + "\n")
                            processed_in_file += 1
                        else:
                            skipped_in_file += 1
                print(
                    f"Finished (unknown). Lines: {lines_in_file}, "
                    f"Processed: {processed_in_file}, Skipped: {skipped_in_file}"
                )
                processed_count += processed_in_file
                skipped_count += skipped_in_file
            except Exception as e:
                # Note: per-file counters gathered before the error are
                # not folded into the run totals on this path.
                print(
                    f"Error processing file {filepath}: {type(e).__name__}: {e}. "
                    "Skipping rest of file."
                )
                import traceback

                traceback.print_exc()
                file_error_count += 1

    print("\nDataset normalization complete.")
    print(f"Total problems processed and written: {processed_count}")
    print(f"Total problems/lines skipped: {skipped_count}")
    print(f"Total files with errors or not found: {file_error_count}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This file makes the 'utils' directory a Python sub-package of 'development'.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import secrets
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def generate_api_key(length: int = 32) -> str:
    """
    Generate a cryptographically strong random string for use as an API key.

    Args:
        length: Number of random bytes to draw. The returned hex string
            is twice this long; the default of 32 bytes yields a
            64-character hex string.

    Returns:
        A hex-encoded secure random string.
    """
    # secrets (not random) is the right source for security tokens.
    random_bytes = secrets.token_bytes(length)
    return random_bytes.hex()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
if __name__ == "__main__":
|
|
21
|
+
parser = argparse.ArgumentParser(description="Generate a secure API key.")
|
|
22
|
+
parser.add_argument(
|
|
23
|
+
"--length",
|
|
24
|
+
type=int,
|
|
25
|
+
default=32,
|
|
26
|
+
help="Length of the key in bytes (default: 32, produces a 64-char hex string).",
|
|
27
|
+
)
|
|
28
|
+
args = parser.parse_args()
|
|
29
|
+
|
|
30
|
+
api_key = generate_api_key(args.length)
|
|
31
|
+
print(api_key)
|