eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1 @@
1
+ # This file makes the 'development' directory a Python package.
@@ -0,0 +1,628 @@
1
+ """Normalizes various coding dataset formats from SandboxFusion.
2
+
3
+ This script converts datasets into a unified OpenAI-compatible JSONL format.
4
+ """
5
+
6
+ import ast
7
+ import hashlib
8
+ import json
9
+ import os
10
+ from typing import ( # Removed Callable, Union. Added List, TypedDict
11
+ Any,
12
+ Dict,
13
+ List,
14
+ Optional,
15
+ TypedDict,
16
+ )
17
+
18
+ from transformers import AutoTokenizer # For Repobench-P
19
+
20
# Define the root path to the SandboxFusion sample datasets
# (Relative to the reward-kit project root)
SANDBOX_SAMPLES_DIR = "./SandboxFusion/sandbox/tests/datasets/samples/"

# List of Python-specific dataset .jsonl files
PYTHON_SPECIFIC_JSONL_FILES = [
    "code_eval_shadow_humaneval_python.jsonl",
    "code_eval_mbpp.jsonl",
    "code_eval_mhpp.jsonl",
    "code_eval_ncb_python_en.jsonl",
    "code_eval_ncb_python_zh.jsonl",
    "code_eval_repobench_c_python_sampled.jsonl",  # RepoBench-C, Python subset
    "code_eval_repobench_p_python_sampled.jsonl",  # RepoBench-P, Python subset
    "code_eval_cruxeval.jsonl",  # Python by default
    "code_eval_cruxeval_x.jsonl",  # Multilingual, filter for Python
    "code_eval_aider_benchmark_v1.jsonl",
    "code_eval_bigcodebench.jsonl",
    "code_eval_EvoEval.jsonl",
]

# List of multilingual dataset .jsonl files that need filtering for Python
# (normalize_problem_to_openai_format drops non-Python rows from these).
MULTILINGUAL_JSONL_FILES_TO_FILTER = [
    "code_eval_mbxp_v1_en.jsonl",
    "code_eval_humanevalds_v1_en.jsonl",
    "code_eval_humanevalds_v2_en.jsonl",
    "code_eval_mbxp_v2_en.jsonl",
]

# Every source file processed by main(); Python-specific files first.
ALL_SOURCE_JSONL_FILES = (
    PYTHON_SPECIFIC_JSONL_FILES + MULTILINGUAL_JSONL_FILES_TO_FILTER
)

# Output file path
OUTPUT_JSONL_FILE = "./development/CODING_DATASET.jsonl"
54
+
55
# --- Helper for Repobench-P ---
# Global tokenizer instance for Repobench-P to avoid reloading it for each problem
# Note: This assumes "assets/tokenizer/gpt2" is accessible relative to the execution path.
try:
    # May download or read a cached model; OSError covers a missing/unreachable model.
    repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2")
except OSError:
    print(
        "Warning: Could not load gpt2 tokenizer for Repobench-P. "
        "Falling back to basic split for token counting."
    )
    # None is the sentinel the count/decode helpers below check for their fallbacks.
    repobench_p_tokenizer = None
66
+
67
+
68
def count_tokens_for_repobench_p(text: str) -> int:
    """Count tokens for Repobench-P, using the gpt2 tokenizer or a fallback."""
    if repobench_p_tokenizer is None:
        # Rough fallback: whitespace-separated word count.
        return len(text.split())
    return len(repobench_p_tokenizer.encode(text))
73
+
74
+
75
def decode_tokens_for_repobench_p(tokens: list) -> str:
    """Decode tokens for Repobench-P, using the gpt2 tokenizer or a fallback."""
    if repobench_p_tokenizer is None:
        # Rough fallback: stringify each token id and space-join them.
        return " ".join(str(tok) for tok in tokens)
    return repobench_p_tokenizer.decode(tokens)
80
+
81
+
82
def comment_repobench_p_snippet(code: str, language: str):
    """Comment out a code snippet based on its language for Repobench-P."""
    if language != "python":
        # Only Python is handled here; other languages pass through unchanged.
        return code
    return "\n".join(f"# {line}" for line in code.split("\n"))
88
+
89
+
90
+ # --- End Helper for Repobench-P ---
91
+
92
+
93
class ContextInfo(TypedDict):
    """Metadata for one cross-file context snippet of a Repobench-P prompt."""

    # Fully formatted snippet (path header + commented-out code).
    text: str
    # Token count of `text`, per count_tokens_for_repobench_p.
    tokens: int
    # Index of the snippet in the problem's original "context" list.
    original_index: int
97
+
98
+
99
def extract_python_docstring(code_string: str) -> Optional[str]:
    """Extract docstring from first func/class in Python code."""
    try:
        module = ast.parse(code_string.strip())
    except SyntaxError:
        # Not parseable Python; caller falls back to the raw content.
        return None
    definition_types = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    for top_node in module.body:
        if isinstance(top_node, definition_types):
            doc = ast.get_docstring(top_node)
            # Stop at the first definition even if it carries no docstring.
            return doc.strip() if doc else None
    return None
110
+
111
+
112
def format_aider_prompt(problem_json: dict) -> str:
    """Format the prompt for Aider benchmark style problems."""
    task_text = problem_json.get("content", "")
    # Fixed instruction suffix asking for a fenced Python code block.
    instructions = (
        "Please generate the code in the following format:\n"
        "```python\n# Your code response here\n```"
    )
    return f"{task_text}\n\n{instructions}"
119
+
120
+
121
def format_mbpp_prompt(problem_json: dict) -> str:
    """Format the prompt for MBPP and MBXP style problems."""
    task = problem_json.get("content", "")
    tests = problem_json.get("test_list", [])  # Top-level key (MBPP layout).
    if not tests:
        labels = problem_json.get("labels")
        if isinstance(labels, dict):
            # MBXP keeps its test list inside the "labels" mapping instead.
            tests = labels.get("test_list", [])
    joined_tests = "\n".join(tests)
    return (
        f"You are an expert Python programmer, and here is your task: {task} "
        f"Your code should pass these tests:\n\n{joined_tests}"
    )
133
+
134
+
135
def format_repobench_p_prompt(problem_json: dict, lang: str = "python") -> str:
    """Format the prompt for Repobench-P style problems.

    This is a simplified port of Repobench-P's _generate_single_prompt.

    Builds a prompt from (a) the tail of the current file's code and (b) as many
    commented-out cross-file context snippets as fit within the token budget,
    preferring the gold snippet when one is specified.
    """
    # This is complex and may need the actual tokenizer assets to be perfectly replicated.
    # Using gpt2 tokenizer as a stand-in. Max prompt length default from parser.
    max_prompt_length_tokens = 8000
    # Max prompt tokens
    current_file_max_tokens = 1600  # Max tokens for current file code

    code_context = problem_json.get("code", "")
    file_path = problem_json.get("file_path", "unknown_file.py")
    import_statement = problem_json.get("import_statement", "")

    # Prepare current file's code
    code_snippet = "\n".join(code_context.split("\n")[-60:])  # Last 60 lines
    if lang == "python":
        code_snippet = f"# Path: {file_path}\n{import_statement}\n{code_snippet}"
    # (Add other lang handling if needed)

    code_tokens = count_tokens_for_repobench_p(code_snippet)
    if code_tokens > current_file_max_tokens:
        # This truncation needs to be done carefully with actual tokens
        # For simplicity, we're using a rough character-based trim if tokenizer failed.
        if repobench_p_tokenizer:
            # Keep the last current_file_max_tokens tokens (closest to the cursor).
            encoded_tokens = repobench_p_tokenizer.encode(code_snippet)[
                -current_file_max_tokens:
            ]
            code_snippet = decode_tokens_for_repobench_p(encoded_tokens)
        else:  # Fallback if tokenizer is not available
            # ~4 chars per token is the usual rough heuristic for GPT-2-style BPE.
            code_snippet = code_snippet[
                -int(current_file_max_tokens * 4) :
            ]  # Approx char length

    current_prompt_tokens = count_tokens_for_repobench_p(code_snippet)
    final_prompt_parts: List[str] = [code_snippet]  # Current code is the last part

    # Prepare context snippets
    contexts_info: List[ContextInfo] = []
    raw_contexts = problem_json.get("context", [])
    if isinstance(raw_contexts, list):
        for i, ctx_item in enumerate(raw_contexts):
            if not isinstance(ctx_item, dict):
                # Malformed entries are silently skipped.
                continue
            snippet_path = ctx_item.get("path", "unknown_context_file.py")
            snippet_content = ctx_item.get("snippet", "")

            commented_snippet = comment_repobench_p_snippet(snippet_content, lang)

            if lang == "python":
                formatted_snippet = f"# Path: {snippet_path}\n{commented_snippet}\n"
            # (Add other lang handling)
            else:
                formatted_snippet = f"// Path: {snippet_path}\n{commented_snippet}\n"

            contexts_info.append(
                {
                    "text": formatted_snippet,
                    "tokens": count_tokens_for_repobench_p(formatted_snippet),
                    "original_index": i,
                }
            )

    # Add gold snippet first if specified and exists
    gold_snippet_idx = problem_json.get("gold_snippet_index", -1)
    if isinstance(gold_snippet_idx, int) and 0 <= gold_snippet_idx < len(contexts_info):
        gold_snippet_info = next(
            (c for c in contexts_info if c["original_index"] == gold_snippet_idx), None
        )
        if gold_snippet_info and (
            current_prompt_tokens + gold_snippet_info["tokens"]
            <= max_prompt_length_tokens
        ):
            final_prompt_parts.insert(0, gold_snippet_info["text"])  # Prepend
            current_prompt_tokens += gold_snippet_info["tokens"]
            contexts_info = [
                c for c in contexts_info if c["original_index"] != gold_snippet_idx
            ]  # Remove from further processing

    # Add other contexts sorted by md5 hash, until token limit
    # (md5 gives a deterministic but effectively shuffled order).
    contexts_info.sort(
        key=lambda x: hashlib.md5(str(x["text"]).encode("utf8")).hexdigest()
    )

    for ctx_info in contexts_info:
        if current_prompt_tokens + ctx_info["tokens"] <= max_prompt_length_tokens:
            final_prompt_parts.insert(0, ctx_info["text"])  # Prepend
            current_prompt_tokens += ctx_info["tokens"]
        else:
            break  # Token limit reached

    # NOTE(review): insert(0, ...) already places every context BEFORE the current
    # code, so reversed() here re-inverts that and emits the current file's code
    # first and contexts after it. If contexts are meant to precede the code (the
    # usual completion layout), this reversal looks like a double inversion —
    # confirm the intended ordering before changing it.
    return "".join(
        reversed(final_prompt_parts)
    )  # They were prepended, so reverse to get correct order
230
+
231
+
232
def format_cruxeval_output_prompt(problem_json: dict) -> str:
    """Format the prompt for CruxEval output prediction tasks.

    Builds the few-shot "direct output prompt" style from cruxeval.py: two
    worked examples, then the problem's code plus an assertion whose right-hand
    side the model must complete between [ANSWER] tags.

    Args:
        problem_json: Problem record. Reads "code" (the function under test)
            and "input" (already the string-literal representation of the
            argument passed to f()).

    Returns:
        The full prompt string, ending with an open "[ANSWER]\n" tag.
    """
    code = problem_json.get("code", "")
    test_input = problem_json.get("input", "")  # This is the input to f()

    # Ensure test_input is represented as a string literal if it's not already
    # The problem_json['input'] should already be in the correct string representation.

    return (
        "You are given a Python function and an assertion containing an input to "
        "the function. Complete the assertion with a literal (no unsimplified "
        "expressions, no function calls) containing the output when executing the "
        "provided code on the given input, even if the function is incorrect or "
        "incomplete. Do NOT output any extra information. Provide the full assertion "
        "with the correct output in [ANSWER] and [/ANSWER] tags, "
        "following the examples.\n\n"
        "[PYTHON]\n"
        "def f(n):\n"
        " return n\n"
        "assert f(17) == ??\n"
        "[/PYTHON]\n"
        "[ANSWER]\n"
        "assert f(17) == 17\n"
        "[/ANSWER]\n\n"
        "[PYTHON]\n"
        "def f(s):\n"
        ' return s + "a"\n'
        'assert f("x9j") == ??\n'
        "[/PYTHON]\n"
        "[ANSWER]\n"
        'assert f("x9j") == "x9ja"\n'
        "[/ANSWER]\n\n"
        # Bug fix: was {{test_input}}, which f-strings treat as an escaped
        # brace pair and so emitted the literal text "{test_input}" instead of
        # the problem's actual input value (test_input was fetched but unused).
        f"[PYTHON]\n{code}\nassert f({test_input}) == ??\n[/PYTHON]\n[ANSWER]\n"
    )
267
+
268
+
269
def format_cruxeval_output_assistant(problem_json: dict) -> str:
    """Format the assistant's response for CruxEval output prediction tasks."""
    # Both fields are expected to already be string-literal representations:
    # "input" is the argument passed to f(), "output" is the value f() returns.
    call_input = problem_json.get("input", "")
    result_repr = problem_json.get("output", "")
    return "assert f(" + call_input + ") == " + result_repr
276
+
277
+
278
def normalize_problem_to_openai_format(
    problem_json: dict, filename: str, is_multilingual_file: bool
) -> Optional[Dict[str, Any]]:
    """Normalize a problem from various dataset formats to the OpenAI messages format.

    Handles dataset-specific prompt engineering and language filtering.

    Args:
        problem_json: One decoded JSONL record from a source dataset file.
        filename: Source file name; substrings ("mbpp", "cruxeval", ...) select
            the dataset-specific formatting branch below.
        is_multilingual_file: True when the file mixes languages and non-Python
            rows must be dropped.

    Returns:
        A {"messages": [user, assistant]} dict, or None when the record is
        skipped (missing/invalid content, non-Python, or placeholder solution).

    NOTE(review): the warning messages below say 'in (unknown)' — this looks
    like a redacted/lost filename placeholder; consider interpolating
    `filename` so the warnings identify the source file.
    """
    problem_id_str = str(problem_json.get("id", "N/A"))
    try:
        # Robust key finding from the original script
        # Candidate keys are tried in order; first string-valued hit wins.
        user_content_keys = [
            "content",
            "prompt",
            "problem",
            "text",
            "code",
        ]  # Added "code" as a fallback
        assistant_content_keys = [
            "canonical_solution",
            "solution",
            "code",
            "completion",
            "next_line",
            "output",
        ]  # Added "next_line", "output"

        raw_user_content = None
        primary_user_key_was_wrong_type = False
        for key_idx, key in enumerate(user_content_keys):
            if key in problem_json:
                if isinstance(problem_json[key], str):
                    raw_user_content = problem_json[key]
                    break
                elif (
                    key_idx == 0 and key == user_content_keys[0]
                ):  # Only log if primary 'content' is wrong type
                    primary_user_key_was_wrong_type = True

        raw_assistant_content = None
        for key_idx, key in enumerate(assistant_content_keys):
            if key in problem_json:
                if isinstance(
                    problem_json[key], (str, int, float, bool, list, dict)
                ):  # Allow more types for raw_assistant
                    raw_assistant_content = problem_json[key]
                    if isinstance(
                        raw_assistant_content, (int, float, bool, list, dict)
                    ):  # Convert non-strings for now
                        raw_assistant_content = str(raw_assistant_content)
                    break
                elif key_idx == 0 and key == assistant_content_keys[0]:
                    pass

        if raw_user_content is None:
            if primary_user_key_was_wrong_type:
                print(
                    f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                    f"primary user key '{user_content_keys[0]}' not a string."
                )
            else:
                print(
                    f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                    f"missing user content (keys: {user_content_keys})."
                )
            return None

        # "labels" may arrive either as a JSON-encoded string or as a dict.
        labels_data = problem_json.get("labels")
        labels = {}
        if isinstance(labels_data, str):
            try:
                labels = json.loads(labels_data)
            except json.JSONDecodeError:
                print(
                    f"Warning: Skipping ID {problem_id_str} in (unknown) "
                    "- malformed JSON in labels."
                )
                return None
        elif isinstance(labels_data, dict):
            labels = labels_data

        # Infer language: explicit label, else "python" when the filename says so.
        programming_language = labels.get(
            "programming_language", "python" if "python" in filename else None
        )
        if (
            not programming_language
            and "cruxeval_x" in filename
            and isinstance(problem_json.get("id"), str)
        ):
            # cruxeval_x encodes the language as the id prefix (e.g. "python_0").
            lang_part = problem_json["id"].split("_")[0]
            if lang_part in ["python", "py"]:
                programming_language = "python"

        # Drop non-Python rows from multilingual sources.
        if is_multilingual_file or "cruxeval_x" in filename:
            if programming_language != "python":
                return None

        final_user_content = raw_user_content
        final_assistant_content = (
            str(raw_assistant_content) if raw_assistant_content is not None else ""
        )

        # Dataset-specific prompt/solution shaping, dispatched on filename substrings.
        if "aider_benchmark" in filename:
            final_user_content = format_aider_prompt(problem_json)
        elif "mbpp" in filename and "mbxp" not in filename:
            final_user_content = format_mbpp_prompt(problem_json)
            test_setup_code = labels.get("test_setup_code", "")
            if (
                test_setup_code
                and isinstance(test_setup_code, str)
                and test_setup_code not in final_assistant_content
            ):
                # Prepend setup code the solution depends on, unless already present.
                final_assistant_content = (
                    test_setup_code.strip() + "\n\n" + final_assistant_content
                )
        elif "mhpp" in filename:
            original_content_for_mhpp = problem_json.get("content", "")
            first_line_of_test = ""
            if problem_json.get("test") and isinstance(problem_json["test"], str):
                first_line_of_test = problem_json["test"].split("\n")[0]
            prompt_stub = original_content_for_mhpp
            if '"""' in prompt_stub:
                # Trim at the closing docstring quotes so the example can be appended.
                prompt_stub = prompt_stub[: prompt_stub.rfind('"""')]
            final_user_content = f'{prompt_stub}\n e.g. {first_line_of_test} """'
            if not (
                "def " in final_assistant_content.strip()
                or "class " in final_assistant_content.strip()
            ):
                # Solution is a bare body; re-attach the original header/stub.
                if original_content_for_mhpp.rstrip().endswith(":"):
                    final_assistant_content = (
                        original_content_for_mhpp.rstrip()
                        + "\n"
                        + final_assistant_content
                    )
                elif original_content_for_mhpp.endswith("\n"):
                    final_assistant_content = (
                        original_content_for_mhpp + final_assistant_content
                    )
                else:
                    final_assistant_content = (
                        original_content_for_mhpp + "\n" + final_assistant_content
                    )
        elif "ncb_python" in filename:
            final_user_content = problem_json.get("content", raw_user_content)
            final_assistant_content = problem_json.get(
                "canonical_solution", raw_assistant_content
            )
        elif "repobench_c" in filename:
            final_user_content = problem_json.get("prompt", raw_user_content)
            final_assistant_content = problem_json.get(
                "next_line", raw_assistant_content
            )
        elif "repobench_p" in filename:
            final_user_content = format_repobench_p_prompt(problem_json, lang="python")
            final_assistant_content = problem_json.get(
                "next_line", raw_assistant_content
            )
        elif "cruxeval" in filename:
            final_user_content = format_cruxeval_output_prompt(problem_json)
            final_assistant_content = format_cruxeval_output_assistant(problem_json)
        elif (
            "humaneval" in filename
            or "evoeval" in filename
            or "bigcodebench" in filename
            or (
                is_multilingual_file
                and (
                    "humanevalds" in filename
                    or labels.get("task_id", "").startswith("humanevalds")
                )
            )
        ):
            # HumanEval-style: use the stub's docstring as the user prompt when
            # one can be extracted, and complete bare-body solutions with the stub.
            extracted_docstring = extract_python_docstring(raw_user_content)
            if extracted_docstring:
                final_user_content = extracted_docstring
                if not (
                    "def " in final_assistant_content.strip()
                    or "class " in final_assistant_content.strip()
                ):
                    if raw_user_content.rstrip().endswith(":"):
                        final_assistant_content = (
                            raw_user_content.rstrip() + "\n" + final_assistant_content
                        )
                    elif raw_user_content.endswith("\n"):
                        final_assistant_content = (
                            raw_user_content + final_assistant_content
                        )
                    else:
                        final_assistant_content = (
                            raw_user_content + "\n" + final_assistant_content
                        )
            else:
                final_user_content = raw_user_content
        elif is_multilingual_file and (
            "mbxp" in filename or labels.get("task_id", "").startswith("mbxp")
        ):
            final_user_content = format_mbpp_prompt(problem_json)
            test_setup_code = labels.get("test_setup_code", "")
            if (
                test_setup_code
                and isinstance(test_setup_code, str)
                and test_setup_code not in final_assistant_content
            ):
                final_assistant_content = (
                    test_setup_code.strip() + "\n\n" + final_assistant_content
                )
        else:
            # Generic fallback: same docstring-extraction treatment as HumanEval.
            extracted_docstring = extract_python_docstring(raw_user_content)
            if extracted_docstring:
                final_user_content = extracted_docstring
                if not (
                    "def " in final_assistant_content.strip()
                    or "class " in final_assistant_content.strip()
                ):
                    if raw_user_content.rstrip().endswith(":"):
                        final_assistant_content = (
                            raw_user_content.rstrip() + "\n" + final_assistant_content
                        )
                    elif raw_user_content.endswith("\n"):
                        final_assistant_content = (
                            raw_user_content + final_assistant_content
                        )
                    else:
                        final_assistant_content = (
                            raw_user_content + "\n" + final_assistant_content
                        )

        # Final validation: both sides must be non-empty strings.
        if not isinstance(final_user_content, str) or not isinstance(
            final_assistant_content, str
        ):
            print(
                f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                f"invalid content types (user: {type(final_user_content)}, "
                f"assistant: {type(final_assistant_content)})."
            )
            return None
        if not final_user_content.strip() or not final_assistant_content.strip():
            print(
                f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                "empty processed content."
            )
            return None
        if final_assistant_content.strip() == "import sys; sys.exit(0)":
            print(
                f"Warning: Skipping ID {problem_id_str} in (unknown) - "
                "placeholder solution."
            )
            return None

        return {
            "messages": [
                {"role": "user", "content": final_user_content.strip()},
                {"role": "assistant", "content": final_assistant_content.strip()},
            ]
        }
    except Exception as e:
        # Catch-all so one malformed record never aborts the whole run.
        print(
            f"Warning: Skipping ID {problem_id_str} in (unknown) - "
            f"error ({type(e).__name__}: {e})."
        )
        import traceback

        traceback.print_exc()
        return None
541
+
542
+
543
def main():
    """Process SandboxFusion datasets and normalize them to OpenAI JSONL format.

    This function iterates through specified dataset files, normalizes each problem
    to an OpenAI-compatible format, and writes the results to an output JSONL file.
    It handles language filtering for multilingual datasets and logs errors or
    skipped problems.
    """
    output_dir = os.path.dirname(OUTPUT_JSONL_FILE)
    if output_dir:
        # exist_ok avoids a race between the existence check and the create.
        os.makedirs(output_dir, exist_ok=True)

    processed_count = 0
    skipped_count = 0
    file_error_count = 0

    print(
        f"Starting dataset normalization. Output will be written to {OUTPUT_JSONL_FILE}"
    )

    with open(OUTPUT_JSONL_FILE, "w", encoding="utf-8") as outfile:
        for filename_idx, filename in enumerate(ALL_SOURCE_JSONL_FILES):
            filepath = os.path.join(SANDBOX_SAMPLES_DIR, filename)
            is_multilingual = filename in MULTILINGUAL_JSONL_FILES_TO_FILTER

            if not os.path.exists(filepath):
                print(f"Warning: File not found, skipping: {filepath}")
                file_error_count += 1
                continue

            # Bug fix: the progress and per-file summary messages printed the
            # literal placeholder "(unknown)" instead of the file being processed.
            print(
                f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: "
                f"{filename}..."
            )
            lines_in_file = 0
            processed_in_file = 0
            skipped_in_file = 0
            try:
                with open(filepath, "r", encoding="utf-8") as infile:
                    for line_number, line in enumerate(infile, 1):
                        lines_in_file += 1
                        stripped_line = line.strip()
                        if not stripped_line:
                            # Blank lines are ignored, not counted as skipped.
                            continue
                        try:
                            problem_data = json.loads(stripped_line)
                        except json.JSONDecodeError:
                            print(
                                f"Warning: Malformed JSON on line {line_number} "
                                f"in {filepath}. Skipping line."
                            )
                            skipped_in_file += 1
                            continue

                        normalized_problem = normalize_problem_to_openai_format(
                            problem_data, filename, is_multilingual
                        )
                        if normalized_problem:
                            outfile.write(json.dumps(normalized_problem) + "\n")
                            processed_in_file += 1
                        else:
                            skipped_in_file += 1
                print(
                    f"Finished {filename}. Lines: {lines_in_file}, "
                    f"Processed: {processed_in_file}, Skipped: {skipped_in_file}"
                )
                processed_count += processed_in_file
                skipped_count += skipped_in_file
            except Exception as e:
                # A file-level failure (I/O, encoding, ...) skips the rest of
                # that file but never aborts the overall run.
                print(
                    f"Error processing file {filepath}: {type(e).__name__}: {e}. "
                    "Skipping rest of file."
                )
                import traceback

                traceback.print_exc()
                file_error_count += 1

    print("\nDataset normalization complete.")
    print(f"Total problems processed and written: {processed_count}")
    print(f"Total problems/lines skipped: {skipped_count}")
    print(f"Total files with errors or not found: {file_error_count}")


if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ # This file makes the 'utils' directory a Python sub-package of 'development'.
@@ -0,0 +1,31 @@
1
+ import argparse
2
+ import secrets
3
+
4
+
5
def generate_api_key(length: int = 32) -> str:
    """
    Generates a cryptographically strong random string to be used as an API key.

    Args:
        length: The desired length of the API key in bytes.
            The resulting hex string will be twice this length.
            Default is 32 bytes, resulting in a 64-character hex string.

    Returns:
        A hex-encoded secure random string.
    """
    # token_bytes(n).hex() is equivalent to secrets.token_hex(n):
    # n random bytes rendered as 2*n lowercase hex characters.
    random_bytes = secrets.token_bytes(length)
    return random_bytes.hex()
18
+
19
+
20
if __name__ == "__main__":
    # CLI entry point: generate one key and print it to stdout.
    parser = argparse.ArgumentParser(description="Generate a secure API key.")
    parser.add_argument(
        "--length",
        type=int,
        default=32,
        help="Length of the key in bytes (default: 32, produces a 64-char hex string).",
    )
    cli_args = parser.parse_args()

    print(generate_api_key(cli_args.length))