eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1620 @@
1
+ # mypy: ignore-errors
2
+ """
3
+ Code execution reward functions for evaluating code correctness.
4
+
5
+ This module provides functions to evaluate the correctness of code by:
6
+ 1. Extracting code blocks from messages
7
+ 2. Executing the code in a secure environment (local or E2B sandbox)
8
+ 3. Comparing the output with expected results
9
+
10
+ Available reward functions:
11
+ - local_code_execution_reward: Execute code locally and evaluate correctness
12
+ - e2b_code_execution_reward: Execute code in E2B sandbox and evaluate correctness
13
+ - fractional_code_reward: Execute code and return exact pass rate
14
+ """
15
+
16
+ import faulthandler
17
+ import json
18
+ import multiprocessing
19
+ import os
20
+ import platform
21
+ import re
22
+ import resource
23
+ import shlex # Added for robust splitting of arguments
24
+ import signal
25
+ import subprocess
26
+ import sys
27
+ import tempfile
28
+ import traceback
29
+ from io import StringIO
30
+ from multiprocessing.managers import DictProxy
31
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
32
+
33
+ # Try to import from e2b_code_interpreter first (preferred)
34
+ try:
35
+ from e2b_code_interpreter.sync import Sandbox # type: ignore # Use SyncSandbox
36
+
37
+ _HAS_E2B = True
38
+ _E2B_SOURCE = "e2b_code_interpreter"
39
+ except ImportError:
40
+ # Fallback to e2b
41
+ try:
42
+ # Assuming 'e2b' package's default Sandbox is synchronous.
43
+ # If 'e2b' also defaults to async, this part might need adjustment too.
44
+ from e2b import Sandbox # type: ignore
45
+
46
+ _HAS_E2B = True
47
+ _E2B_SOURCE = "e2b"
48
+ except ImportError:
49
+ _HAS_E2B = False
50
+ _E2B_SOURCE = "" # Use empty string instead of None
51
+
52
+ from ..models import EvaluateResult, Message, MetricResult
53
+ from ..reward_function import reward_function
54
+
55
+
56
+ def _target_func_for_execution(result_container, execute_func, args):
57
+ try:
58
+ result = execute_func(*args)
59
+ result_container.update(result)
60
+ except Exception as e:
61
+ error_traceback = traceback.format_exc()
62
+ result_container.update(
63
+ {
64
+ "success": False,
65
+ "output": None,
66
+ "error": f"Execution error: {str(e)}\n{error_traceback}",
67
+ }
68
+ )
69
+
70
+
71
def extract_code_blocks(text: str, language: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Collect fenced (triple-backtick) code blocks from a piece of text.

    Args:
        text: The text to scan for fenced code blocks
        language: Optional language tag to keep (case-insensitive). Blocks whose
            fence carries no tag are always kept.

    Returns:
        One dictionary per block with "language" (lower-cased tag, or "unknown"
        when the fence has none) and "code" (stripped content) keys. A
        "verbosity_cleaned_reason" key is added when reasoning/preamble text
        was removed from the block.
    """
    fence_regex = r"```(\w*)\n([\s\S]*?)\n```"

    # Chatter that sometimes ends up inside a fenced block and must not be
    # executed; re.DOTALL lets '.' span newlines.
    noise_regexes = [
        re.compile(r"<think>.*?</think>", re.DOTALL),
        re.compile(r"<reasoning>.*?</reasoning>", re.DOTALL),
        re.compile(r"Thinking:\s*.*?(?=\n\S)", re.DOTALL),  # "Thinking: ..." up to the next non-indented line
        re.compile(r"^\s*Here's the Python code.*?\n", re.MULTILINE | re.IGNORECASE),
        re.compile(r"^\s*Okay, here is the code:.*?\n", re.MULTILINE | re.IGNORECASE),
    ]

    wanted = language.lower() if language else None
    extracted: List[Dict[str, str]] = []

    for tag, raw_body in re.findall(fence_regex, text):
        # Only tagged blocks can be rejected by the language filter.
        if wanted and tag and wanted != tag.lower():
            continue

        block_language = tag.lower() if tag else "unknown"

        cleaned_body = raw_body
        for noise in noise_regexes:
            cleaned_body = noise.sub("", cleaned_body)

        entry = {"language": block_language, "code": cleaned_body.strip()}
        if cleaned_body != raw_body:
            entry["verbosity_cleaned_reason"] = f"Verbose content removed from '{block_language}' block."

        extracted.append(entry)

    return extracted
124
+
125
+
126
@reward_function
def local_code_execution_reward(
    messages: List[Message],
    ground_truth: Optional[str] = None,  # Expected program output (formerly expected_output_str)
    language: str = "python",
    timeout: int = 5,
    max_memory_mb: int = 100,  # Specific to local execution
    **kwargs,
) -> EvaluateResult:
    """
    Score an assistant response by running its first code block locally.

    The first fenced code block matching ``language`` is taken from the last
    message, run through ``execute_python_code`` or ``execute_javascript_code``,
    and the captured output is compared with ``ground_truth`` using
    ``compare_outputs``.

    Args:
        messages: List of conversation messages. The last one must be an
            assistant ``Message`` with non-None content.
        ground_truth: Expected output string. When it is None or empty, a
            successful run scores 1.0 without any comparison.
        language: Programming language of the code ("python", "javascript" or "js").
        timeout: Maximum execution time in seconds, forwarded to the executor.
        max_memory_mb: Maximum memory usage in megabytes (default: 100).
            NOTE(review): this function does not forward the value to the
            executors it calls.
        **kwargs: Additional keyword arguments (not used by this function).

    Returns:
        EvaluateResult with score and metrics.
    """
    last_message = messages[-1] if messages else None
    has_valid_response = (
        isinstance(last_message, Message)
        and last_message.role == "assistant"
        and last_message.content is not None
    )
    if not has_valid_response:
        return EvaluateResult(
            score=0.0,
            reason="Invalid or missing assistant response in messages.",
            metrics={
                "error": MetricResult(
                    score=0.0,
                    is_score_valid=False,
                    reason="Last message not a valid assistant response.",
                )
            },
        )

    expected_output_str = ground_truth
    code_blocks = extract_code_blocks(last_message.content, language)

    if not code_blocks:
        missing_reason = f"No {language} code blocks found in model's response."
        return EvaluateResult(
            score=0.0,
            reason=missing_reason,
            metrics={"error": MetricResult(score=0.0, reason=missing_reason, is_score_valid=False)},
        )

    # Only the first matching block is executed.
    code = code_blocks[0]["code"]

    metrics: Dict[str, MetricResult] = {
        "extracted_code": MetricResult(
            score=0.0,
            reason=f"Extracted code:\n```{language}\n{code}\n```",
            is_score_valid=True,
        )
    }
    if expected_output_str:
        metrics["expected_output"] = MetricResult(
            score=0.0,
            reason=f"Expected output:\n{expected_output_str}",
            is_score_valid=True,
        )

    language_key = language.lower()
    if language_key == "python":
        execution_result = execute_python_code(code, timeout)
    elif language_key in ["javascript", "js"]:
        execution_result = execute_javascript_code(code, timeout)
    else:
        unsupported_reason = f"Unsupported language: {language}"
        metrics["error"] = MetricResult(score=0.0, reason=unsupported_reason, is_score_valid=False)
        return EvaluateResult(score=0.0, reason=unsupported_reason, metrics=metrics)

    # Failure path first: the run itself did not succeed.
    if not execution_result["success"]:
        error = execution_result["error"]
        metrics["execution_result"] = MetricResult(
            score=0.0,
            reason=f"Code execution failed with error:\n{error}",
            is_score_valid=False,
        )
        return EvaluateResult(score=0.0, reason=f"Code execution failed: {error}", metrics=metrics)

    output = execution_result["output"]
    metrics["execution_result"] = MetricResult(
        score=1.0,
        reason=f"Code executed successfully with output:\n{output}",
        is_score_valid=True,
    )

    if not expected_output_str:
        return EvaluateResult(
            score=1.0,
            reason="Execution successful. No expected output to compare.",
            metrics=metrics,
        )

    similarity = compare_outputs(output, expected_output_str)
    metrics["output_match"] = MetricResult(
        score=similarity,
        reason=f"Output similarity: {similarity:.2f}\n\nExpected:\n{expected_output_str}\n\nActual:\n{output}",
        is_score_valid=similarity == 1.0,
    )
    return EvaluateResult(
        score=similarity,
        reason=f"Execution successful. Output similarity: {similarity:.2f}.",
        metrics=metrics,
    )
250
+
251
+
252
+ def _process_target_wrapper(execute_func: Callable, args: Tuple, result_container: DictProxy):
253
+ try:
254
+ result = execute_func(*args)
255
+ result_container.update(result)
256
+ except Exception as e:
257
+ error_traceback = traceback.format_exc()
258
+ result_container.update(
259
+ {
260
+ "success": False,
261
+ "output": None,
262
+ "error": f"Execution error: {str(e)}\n{error_traceback}",
263
+ }
264
+ )
265
+
266
+
267
def _execute_code_in_process(execute_func: Callable, args: Tuple, timeout: int = 5) -> Dict[str, Any]:
    """
    Run ``execute_func(*args)`` in a separate process, bounded by a timeout.

    The worker process reports its result through a manager-backed dict. If the
    worker is still alive after ``timeout + 0.5`` seconds it is terminated
    (and killed if it ignores termination) and a timeout result is returned.

    Args:
        execute_func: Function to execute the code
        args: Arguments to pass to the execute function
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with "success", "output" and "error" keys
    """
    # The context manager shuts the manager's server process down on exit, so
    # repeated calls do not accumulate helper processes.
    with multiprocessing.Manager() as manager:
        result_dict = manager.dict()

        process = multiprocessing.Process(target=_process_target_wrapper, args=(execute_func, args, result_dict))
        process.start()
        # Small grace period on top of the timeout the worker enforces itself.
        process.join(timeout=timeout + 0.5)

        if process.is_alive():
            process.terminate()
            process.join(0.5)
            if process.is_alive():
                process.kill()
                process.join(0.5)  # reap the killed worker
            return {
                "success": False,
                "output": None,
                "error": f"Timeout: execution timed out after {timeout} seconds",
            }

        if not result_dict:
            return {
                "success": False,
                "output": None,
                "error": "Execution failed without producing any output",
            }

        # Copy into a plain dict while the manager is still running.
        return dict(result_dict)
307
+
308
+
309
+ def _execute_python_in_subprocess(code: str, timeout: int) -> Dict[str, Any]:
310
+ """
311
+ Inner function to execute Python code in a subprocess.
312
+
313
+ Args:
314
+ code: Python code to execute
315
+ timeout: Maximum execution time in seconds
316
+
317
+ Returns:
318
+ Dictionary with execution results
319
+ """
320
+ try:
321
+ with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
322
+ temp_file_path = temp_file.name
323
+
324
+ safe_code = (
325
+ "import sys\n"
326
+ "import os\n"
327
+ "import signal\n"
328
+ "import resource\n"
329
+ "import platform\n\n"
330
+ "def _reliability_guard():\n"
331
+ " memory_limit = 100 * 1024 * 1024 # 100 MB\n"
332
+ " if platform.uname().system != 'Darwin':\n"
333
+ " resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))\n"
334
+ " resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, memory_limit))\n"
335
+ " resource.setrlimit(resource.RLIMIT_STACK, (memory_limit, memory_limit))\n"
336
+ " import builtins\n"
337
+ " builtins.exit = None\n"
338
+ " builtins.quit = None\n"
339
+ " os.environ['OMP_NUM_THREADS'] = '1'\n"
340
+ " os.system = None\n"
341
+ " os.popen = None\n"
342
+ " os.execl = None\n"
343
+ " os.execve = None\n"
344
+ " os.fork = None\n"
345
+ " os.remove = None\n"
346
+ " os.removedirs = None\n"
347
+ " os.rmdir = None\n"
348
+ " os.unlink = None\n"
349
+ " os.access = None\n"
350
+ "\n"
351
+ "_reliability_guard()\n\n" + code
352
+ )
353
+
354
+ temp_file.write(safe_code.encode("utf-8"))
355
+
356
+ def timeout_handler(signum, frame):
357
+ raise TimeoutError(f"Execution timed out after {timeout} seconds")
358
+
359
+ signal.signal(signal.SIGALRM, timeout_handler)
360
+ signal.alarm(timeout)
361
+
362
+ try:
363
+ process = subprocess.Popen(
364
+ [sys.executable, temp_file_path],
365
+ stdout=subprocess.PIPE,
366
+ stderr=subprocess.PIPE,
367
+ text=True,
368
+ preexec_fn=lambda: resource.setrlimit(resource.RLIMIT_CPU, (timeout, timeout + 1)),
369
+ )
370
+
371
+ stdout, stderr = process.communicate()
372
+ signal.alarm(0)
373
+
374
+ if process.returncode == 0:
375
+ return {
376
+ "success": True,
377
+ "output": stdout.strip(),
378
+ "error": None,
379
+ }
380
+ else:
381
+ return {
382
+ "success": False,
383
+ "output": None,
384
+ "error": stderr.strip(),
385
+ }
386
+ except TimeoutError as e:
387
+ return {"success": False, "output": None, "error": str(e)}
388
+ finally:
389
+ signal.alarm(0)
390
+ if os.path.exists(temp_file_path):
391
+ os.unlink(temp_file_path)
392
+ except Exception as e:
393
+ error_traceback = traceback.format_exc()
394
+ return {
395
+ "success": False,
396
+ "output": None,
397
+ "error": f"Setup error: {str(e)}\n{error_traceback}",
398
+ }
399
+
400
+
401
def execute_python_code(code: str, timeout: int = 5) -> Dict[str, Any]:
    """
    Execute Python code through the process-isolated runner.

    Delegates to ``_execute_code_in_process`` with
    ``_execute_python_in_subprocess`` as the worker function.

    Args:
        code: Python code to execute
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results
    """
    worker_args = (code, timeout)
    return _execute_code_in_process(_execute_python_in_subprocess, args=worker_args, timeout=timeout)
413
+
414
+
415
+ def _execute_javascript_in_subprocess(code: str, timeout: int) -> Dict[str, Any]:
416
+ """
417
+ Inner function to execute JavaScript code in a subprocess.
418
+
419
+ Args:
420
+ code: JavaScript code to execute
421
+ timeout: Maximum execution time in seconds
422
+
423
+ Returns:
424
+ Dictionary with execution results
425
+ """
426
+ try:
427
+ try:
428
+ subprocess.run(["node", "--version"], capture_output=True, check=True)
429
+ except (subprocess.SubprocessError, FileNotFoundError):
430
+ return {
431
+ "success": False,
432
+ "output": None,
433
+ "error": "Node.js is not installed or not found in PATH",
434
+ }
435
+
436
+ with tempfile.NamedTemporaryFile(suffix=".js", delete=False) as temp_file:
437
+ temp_file_path = temp_file.name
438
+
439
+ safe_code = (
440
+ "// Safety wrapper to prevent dangerous operations\n"
441
+ "process.on('uncaughtException', function(err) {\n"
442
+ " console.error('Uncaught exception:', err.message);\n"
443
+ " process.exit(1);\n"
444
+ "});\n\n"
445
+ "process.exit = function() { console.error('exit() is disabled'); };\n"
446
+ "process.kill = function() { console.error('kill() is disabled'); };\n"
447
+ "const fs = require('fs');\n"
448
+ "const originalFsReadFile = fs.readFileSync;\n"
449
+ "const originalFsWriteFile = fs.writeFileSync;\n"
450
+ "fs.readFileSync = function() { console.error('fs.readFileSync() is disabled'); return ''; };\n"
451
+ "fs.writeFileSync = function() { console.error('fs.writeFileSync() is disabled'); };\n"
452
+ "const originalRequire = require;\n"
453
+ "global.require = function(module) {\n"
454
+ " const safeModules = ['assert', 'buffer', 'crypto', 'events', 'path', 'querystring',\n"
455
+ " 'string_decoder', 'stream', 'timers', 'url', 'util', 'zlib'];\n"
456
+ " if (safeModules.includes(module)) {\n"
457
+ " return originalRequire(module);\n"
458
+ " } else {\n"
459
+ " console.error(`Requiring module '${module}' is not allowed for security reasons`);\n"
460
+ " return {};\n"
461
+ " }\n"
462
+ "};\n\n"
463
+ "try {\n"
464
+ " " + code.replace("\n", "\n ") + "\n"
465
+ "} catch (error) {\n"
466
+ " console.error('Code execution error:', error.message);\n"
467
+ " process.exitCode = 1;\n"
468
+ "}\n"
469
+ )
470
+
471
+ temp_file.write(safe_code.encode("utf-8"))
472
+
473
+ def timeout_handler(signum, frame):
474
+ raise TimeoutError(f"Execution timed out after {timeout} seconds")
475
+
476
+ signal.signal(signal.SIGALRM, timeout_handler)
477
+ signal.alarm(timeout)
478
+
479
+ try:
480
+ process = subprocess.Popen(
481
+ [
482
+ "node",
483
+ "--no-warnings",
484
+ "--max-old-space-size=100",
485
+ temp_file_path,
486
+ ],
487
+ stdout=subprocess.PIPE,
488
+ stderr=subprocess.PIPE,
489
+ text=True,
490
+ )
491
+
492
+ try:
493
+ stdout, stderr = process.communicate(timeout=timeout)
494
+ except subprocess.TimeoutExpired:
495
+ process.kill()
496
+ stdout, stderr = process.communicate()
497
+ signal.alarm(0)
498
+ return {
499
+ "success": False,
500
+ "output": None,
501
+ "error": f"JavaScript execution timed out after {timeout} seconds (subprocess.TimeoutExpired). Output: {stdout.strip()}, Error: {stderr.strip()}",
502
+ }
503
+
504
+ signal.alarm(0)
505
+
506
+ if process.returncode == 0:
507
+ return {
508
+ "success": True,
509
+ "output": stdout.strip(),
510
+ "error": None,
511
+ }
512
+ else:
513
+ return {
514
+ "success": False,
515
+ "output": None,
516
+ "error": stderr.strip() or f"JavaScript process exited with code {process.returncode}",
517
+ }
518
+ except TimeoutError as e:
519
+ process.kill()
520
+ _, _ = process.communicate()
521
+ return {
522
+ "success": False,
523
+ "output": None,
524
+ "error": f"JavaScript execution timed out after {timeout} seconds (signal.alarm): {str(e)}",
525
+ }
526
+ finally:
527
+ signal.alarm(0)
528
+ if os.path.exists(temp_file_path):
529
+ os.unlink(temp_file_path)
530
+
531
+ except Exception as e:
532
+ error_traceback = traceback.format_exc()
533
+ return {
534
+ "success": False,
535
+ "output": None,
536
+ "error": f"Setup error: {str(e)}\n{error_traceback}",
537
+ }
538
+
539
+
540
def execute_javascript_code(code: str, timeout: int = 5) -> Dict[str, Any]:
    """
    Execute JavaScript code through the process-isolated runner.

    Delegates to ``_execute_code_in_process`` with
    ``_execute_javascript_in_subprocess`` as the worker function.

    Args:
        code: JavaScript code to execute
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results
    """
    worker_args = (code, timeout)
    return _execute_code_in_process(_execute_javascript_in_subprocess, args=worker_args, timeout=timeout)
552
+
553
+
554
def compare_outputs(actual: str, expected: str) -> float:
    """
    Compare actual and expected outputs to calculate a similarity score.

    Strategies are tried in order; the first one that applies decides the score:

    1. Exact match after ``normalize_output`` -> 1.0.
    2. Both values numeric -> score based on the relative difference.
    3. Both values JSON lists -> 0.3 * length similarity + 0.7 * average
       best-match similarity of each expected item.
    4. Either raw value spans several non-blank lines -> 0.3 * line-count
       similarity + 0.7 * average positional line similarity.
    5. Otherwise -> ``string_similarity`` of the normalized values.

    Args:
        actual: Actual output from code execution
        expected: Expected output

    Returns:
        Similarity score between 0.0 and 1.0
    """
    actual_norm = normalize_output(actual)
    expected_norm = normalize_output(expected)

    if actual_norm == expected_norm:
        return 1.0

    if is_numeric(actual_norm) and is_numeric(expected_norm):
        try:
            actual_num = float(actual_norm)
            expected_num = float(expected_norm)

            # A relative difference is undefined for an expected value of zero.
            if expected_num == 0:
                return 1.0 if actual_num == 0 else 0.0

            rel_diff = abs(actual_num - expected_num) / abs(expected_num)
            if rel_diff <= 0.001:
                return 1.0
            elif rel_diff <= 0.01:
                return 0.9
            elif rel_diff <= 0.1:
                return 0.7
            else:
                return max(0.0, 1.0 - min(1.0, rel_diff))
        except (ValueError, TypeError):
            pass

    if (
        actual_norm.startswith("[")
        and actual_norm.endswith("]")
        and expected_norm.startswith("[")
        and expected_norm.endswith("]")
    ):
        try:
            actual_list = json.loads(actual_norm)
            expected_list = json.loads(expected_norm)

            if not actual_list and not expected_list:
                return 1.0

            if not isinstance(actual_list, list) or not isinstance(expected_list, list):
                raise ValueError("Not a list")

            len_similarity = 1.0 - min(
                1.0,
                abs(len(actual_list) - len(expected_list)) / max(1, max(len(actual_list), len(expected_list))),
            )

            items_similarity = 0.0
            if len(actual_list) > 0 and len(expected_list) > 0:
                total_similarity = 0.0
                for exp_item in expected_list:
                    # Order-insensitive: each expected item takes its best match.
                    best_match = 0.0
                    for act_item in actual_list:
                        item_similarity = compare_outputs(str(act_item), str(exp_item))
                        best_match = max(best_match, item_similarity)
                    total_similarity += best_match
                items_similarity = total_similarity / len(expected_list)
            return 0.3 * len_similarity + 0.7 * items_similarity
        except (ValueError, json.JSONDecodeError):
            # Not valid JSON lists after all; fall through to text comparison.
            pass

    def _significant_lines(raw: str) -> List[str]:
        # normalize_output() folds every whitespace run, newlines included, into
        # one space, so line structure has to be recovered from the raw text.
        # Blank lines are dropped because normalization ignores them as well.
        unified = raw.replace("\r\n", "\n").replace("\r", "\n")
        return [normalize_output(line) for line in unified.split("\n") if line.strip()]

    actual_lines = _significant_lines(actual)
    expected_lines = _significant_lines(expected)

    if len(actual_lines) > 1 or len(expected_lines) > 1:
        len_similarity = 1.0 - min(
            1.0,
            abs(len(actual_lines) - len(expected_lines)) / max(1, max(len(actual_lines), len(expected_lines))),
        )

        lines_similarity = 0.0
        # zip() stops at the shorter side; surplus lines are penalized through
        # len_similarity only.
        line_pairs = list(zip(actual_lines, expected_lines))
        if line_pairs:
            total_similarity = sum(string_similarity(act, exp) for act, exp in line_pairs)
            lines_similarity = total_similarity / len(line_pairs)
        return 0.3 * len_similarity + 0.7 * lines_similarity

    return string_similarity(actual_norm, expected_norm)
649
+
650
+
651
def string_similarity(s1: str, s2: str) -> float:
    """
    Score how alike two strings are using their longest common subsequence.

    Args:
        s1: First string
        s2: Second string

    Returns:
        1.0 when both strings are empty, 0.0 when exactly one is empty,
        otherwise the LCS length divided by the length of the longer string.
    """
    if not (s1 or s2):
        return 1.0
    if not (s1 and s2):
        return 0.0

    longer_length = max(len(s1), len(s2))
    return longest_common_subsequence_length(s1, s2) / longer_length
671
+
672
+
673
def longest_common_subsequence_length(s1: str, s2: str) -> int:
    """
    Calculate the length of the longest common subsequence.

    Uses the classic dynamic-programming recurrence but keeps only the previous
    and current rows, so memory grows with the shorter string instead of with
    the product of both lengths.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Length of longest common subsequence
    """
    if not s1 or not s2:
        return 0

    # The LCS length is symmetric; put the shorter string on the inner axis so
    # the rows stay as small as possible.
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    previous_row = [0] * (len(s2) + 1)
    for outer_char in s1:
        current_row = [0]
        for j, inner_char in enumerate(s2, start=1):
            if outer_char == inner_char:
                current_row.append(previous_row[j - 1] + 1)
            else:
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row

    return previous_row[-1]
695
+
696
+
697
def normalize_output(output: str) -> str:
    """
    Canonicalize an output string before comparison.

    Leading/trailing whitespace is removed, CRLF and CR line endings become
    LF, and every run of whitespace (newlines included) is then collapsed into
    a single space.

    Args:
        output: Output string to normalize

    Returns:
        Normalized output string
    """
    unix_newlines = output.strip().replace("\r\n", "\n").replace("\r", "\n")
    return re.sub(r"\s+", " ", unix_newlines)
711
+
712
+
713
def is_numeric(value: str) -> bool:
    """
    Tell whether ``float()`` accepts the given value.

    Args:
        value: String value to check

    Returns:
        True if ``float(value)`` succeeds, False if it raises ValueError or
        TypeError
    """
    try:
        float(value)
    except (ValueError, TypeError):
        return False
    return True
728
+
729
+
730
def noop(*args: Any, **kwargs: Any) -> Any:
    """Accept any arguments, do nothing, and return None."""
733
+
734
+
735
def execute_code_with_e2b(
    code: str,
    language: str = "python",
    timeout: int = 30,
    api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Execute code within an E2B sandbox.

    The code is written to ``/code/script.py`` or ``/code/script.js`` inside
    the sandbox and run with ``python3`` or ``node``. The language is validated
    before a sandbox is requested, so unsupported languages fail fast without
    allocating one.

    Args:
        code: Code to execute
        language: Programming language of the code ("python"/"py" or "javascript"/"js")
        timeout: Maximum execution time in seconds
        api_key: Optional E2B API key (if not provided, will use E2B_API_KEY env var)

    Returns:
        Dictionary with "success", "output" and "error" keys. Failures at any
        stage are reported through "error"; this function does not raise.
    """
    if not _HAS_E2B:
        return {
            "success": False,
            "output": None,
            "error": "E2B package not installed. Install with: pip install e2b",
        }

    try:
        if api_key is None and os.environ.get("E2B_API_KEY") is None:
            return {
                "success": False,
                "output": None,
                "error": "API key is required for E2B execution. Set it using the api_key parameter or E2B_API_KEY environment variable.",
            }

        # Resolve the script path and command up front: nothing here needs a
        # sandbox, and creating one only to reject the language is wasteful.
        language_key = language.lower()
        if language_key in ["python", "py"]:
            file_path = "/code/script.py"
            cmd = "python3 /code/script.py"
        elif language_key in ["javascript", "js"]:
            file_path = "/code/script.js"
            cmd = "node /code/script.js"
        else:
            return {
                "success": False,
                "output": None,
                "error": f"Unsupported language for E2B: {language}",
            }

        with Sandbox(api_key=api_key) as sandbox:
            stdout = []
            stderr = []

            def capture_stdout(output):
                # Stream events may be objects exposing ``line`` or plain values.
                if hasattr(output, "line"):
                    stdout.append(output.line)
                else:
                    stdout.append(str(output))

            def capture_stderr(output):
                if hasattr(output, "line"):
                    stderr.append(output.line)
                else:
                    stderr.append(str(output))

            # NOTE(review): presumably replaces the sandbox's exit callback with
            # a no-op; confirm against the installed E2B SDK version.
            sandbox.on_exit = lambda *args: None  # type: ignore[method-assign, assignment]

            try:
                # The filesystem attribute name differs between the two
                # supported packages, so probe for whichever is present.
                fs_handler = None
                if _E2B_SOURCE == "e2b_code_interpreter":
                    if hasattr(sandbox, "filesystem"):
                        fs_handler = sandbox.filesystem
                elif _E2B_SOURCE == "e2b":
                    if hasattr(sandbox, "_filesystem"):
                        fs_handler = sandbox._filesystem
                    elif hasattr(sandbox, "filesystem"):
                        fs_handler = sandbox.filesystem

                if not fs_handler:
                    return {
                        "success": False,
                        "output": None,
                        "error": "Could not access E2B sandbox filesystem handler.",
                    }

                try:
                    fs_handler.make_dir("/code")
                except Exception:
                    # Best effort: a real problem with the directory surfaces
                    # in the write below.
                    pass

                fs_handler.write(file_path, code)
            except Exception as e:
                return {
                    "success": False,
                    "output": None,
                    "error": f"Failed to write code to sandbox: {str(e)}",
                }

            try:
                result = sandbox.commands.run(
                    cmd,
                    on_stdout=capture_stdout,
                    on_stderr=capture_stderr,
                    timeout=timeout,
                )

                output = "\n".join(stdout)
                error_output = "\n".join(stderr)

                if result.exit_code == 0:
                    return {"success": True, "output": output, "error": None}
                return {
                    "success": False,
                    "output": None,
                    "error": f"Process exited with code {result.exit_code}: {error_output}",
                }

            except Exception as e:
                return {
                    "success": False,
                    "output": None,
                    "error": f"Execution error: {str(e)}",
                }

    except Exception as e:
        error_traceback = traceback.format_exc()
        return {
            "success": False,
            "output": None,
            "error": f"E2B setup error: {str(e)}\n{error_traceback}",
        }
864
+
865
+
866
@reward_function
def e2b_code_execution_reward(
    messages: List[Message],
    ground_truth: Optional[str] = None,
    language: str = "python",
    timeout: int = 30,
    api_key: Optional[str] = None,
    **kwargs,
) -> EvaluateResult:
    """
    Score an assistant's code by running it in an E2B sandbox and comparing its output.

    The first code block of the requested language found in the last message is
    executed via ``execute_code_with_e2b``.

    Args:
        messages: List of conversation messages. The last message must be an
            assistant message with content; it is searched for code blocks.
        ground_truth: Expected output string from code execution. This corresponds to
            the `expected_output_str` in the previous signature. When falsy, a
            successful execution alone yields a score of 1.0.
        language: Programming language of the code ("python", "javascript", etc.)
        timeout: Maximum execution time in seconds.
        api_key: Optional E2B API key (if not provided, will use E2B_API_KEY env var).
        **kwargs: Additional keyword arguments.

    Returns:
        EvaluateResult with score and metrics. The score is 0.0 on any failure,
        the output similarity when an expected output is given, and 1.0 otherwise.
    """

    def rejected(summary: str, detail: str) -> EvaluateResult:
        """Build a zero-score result carrying a single invalid "error" metric."""
        return EvaluateResult(
            score=0.0,
            reason=summary,
            metrics={"error": MetricResult(score=0.0, reason=detail, is_score_valid=False)},
        )

    if not _HAS_E2B:
        return rejected(
            "E2B package not installed.",
            "E2B package not installed. Install with: pip install e2b",
        )

    key_available = api_key is not None or os.environ.get("E2B_API_KEY") is not None
    if not key_available:
        return rejected(
            "E2B API key is required.",
            "E2B API key is required. Set the E2B_API_KEY environment variable or provide api_key parameter.",
        )

    last_message = messages[-1] if messages else None
    has_assistant_reply = (
        isinstance(last_message, Message)
        and last_message.role == "assistant"
        and last_message.content is not None
    )
    if not has_assistant_reply:
        return rejected(
            "Invalid or missing assistant response in messages.",
            "Last message not a valid assistant response.",
        )

    found_blocks = extract_code_blocks(last_message.content, language)
    if not found_blocks:
        missing_code = f"No {language} code blocks found in model's response."
        return rejected(missing_code, missing_code)

    # Only the first matching code block is evaluated.
    code = found_blocks[0]["code"]

    metrics: Dict[str, MetricResult] = {
        "extracted_code": MetricResult(
            score=0.0,
            reason=f"Extracted code:\n```{language}\n{code}\n```",
            is_score_valid=True,
        )
    }
    if ground_truth:
        metrics["expected_output"] = MetricResult(
            score=0.0,
            reason=f"Expected output:\n{ground_truth}",
            is_score_valid=True,
        )

    run = execute_code_with_e2b(code=code, language=language, timeout=timeout, api_key=api_key)

    if not run["success"]:
        error = run["error"]
        metrics["execution_result"] = MetricResult(
            score=0.0,
            reason=f"Code execution failed in E2B sandbox with error:\n{error}",
            is_score_valid=False,
        )
        return EvaluateResult(score=0.0, reason=f"E2B code execution failed: {error}", metrics=metrics)

    output = run["output"]
    metrics["execution_result"] = MetricResult(
        score=1.0,
        reason=f"Code executed successfully in E2B sandbox with output:\n{output}",
        is_score_valid=True,
    )

    if not ground_truth:
        return EvaluateResult(
            score=1.0,
            reason="E2B execution successful. No expected output to compare.",
            metrics=metrics,
        )

    similarity = compare_outputs(output, ground_truth)
    metrics["output_match"] = MetricResult(
        score=similarity,
        reason=f"Output similarity: {similarity:.2f}\n\nExpected:\n{ground_truth}\n\nActual:\n{output}",
        is_score_valid=similarity == 1.0,
    )
    return EvaluateResult(
        score=similarity,
        reason=f"E2B execution successful. Output similarity: {similarity:.2f}.",
        metrics=metrics,
    )
1007
+
1008
+
1009
@reward_function
def fractional_code_reward(
    messages: List[Message],
    ground_truth: Union[Optional[str], Optional[List[Dict[str, Any]]]],
    language: str = "python",
    timeout: int = 30,
    environment: str = "local",
    api_key: Optional[str] = None,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Execute code and return the exact pass rate as a score between 0 and 1.

    Unlike the binary code reward, this function returns the actual score representing
    how closely the code output matches the expected output or how many test cases pass.

    Args:
        messages: List of conversation messages. The last message is assumed to be the
            assistant's response containing the code.
        ground_truth: Expected output string from code execution, OR a list of test cases.
            If a string, it's direct output comparison.
            If a list of dicts, each dict is a test case with "input" and "expected_output";
            the list is forwarded to `_run_test_cases` (an empty list therefore yields the
            "No test cases provided" result instead of a perfect score).
            If None, a successful execution alone scores 1.0.
        language: Programming language of the code ("python"/"py", "javascript"/"js").
        timeout: Maximum execution time in seconds.
        environment: Environment to run the code in ("local" or "e2b").
        api_key: Optional E2B API key (if using e2b environment).
        **kwargs: Additional keyword arguments (e.g., function_to_call for _run_test_cases).

    Returns:
        EvaluateResult with score between 0 and 1 representing the exact pass rate.
    """

    def rejected(summary: str, detail: str) -> EvaluateResult:
        """Build a zero-score result carrying a single invalid "error" metric."""
        return EvaluateResult(
            score=0.0,
            reason=summary,
            metrics={"error": MetricResult(score=0.0, reason=detail, is_score_valid=False)},
        )

    if (
        not messages
        or not isinstance(messages[-1], Message)
        or messages[-1].role != "assistant"
        or messages[-1].content is None
    ):
        return rejected(
            "Invalid or missing assistant response in messages for fractional code reward.",
            "Last message not a valid assistant response.",
        )

    response_content = messages[-1].content

    expected_output_str_from_gt: Optional[str] = None
    test_cases_from_gt: Optional[List[Dict[str, Any]]] = None

    if isinstance(ground_truth, str):
        expected_output_str_from_gt = ground_truth
    elif isinstance(ground_truth, list):
        if not all(isinstance(item, dict) for item in ground_truth):
            return rejected(
                "Invalid ground_truth format: expected string or list of test case dicts.",
                "Invalid ground_truth format.",
            )
        test_cases_from_gt = ground_truth
    elif ground_truth is not None:
        return rejected(
            "Invalid ground_truth format: expected string, list of test case dicts, or None.",
            "Invalid ground_truth format.",
        )

    code_blocks = extract_code_blocks(response_content, language)

    if not code_blocks:
        return rejected(
            f"No {language} code blocks found in model's response for fractional code reward.",
            f"No {language} code blocks found in model's response.",
        )

    # Only the first matching code block is evaluated.
    code = code_blocks[0]["code"]

    # Dispatch on "a list was supplied" rather than on truthiness: an empty list
    # must not fall through to the plain-execution path, where merely running
    # without error would be rewarded with a full score.
    if test_cases_from_gt is not None:
        return _run_test_cases(
            code=code,
            language=language,
            test_cases=test_cases_from_gt,
            timeout=timeout,
            environment=environment,
            api_key=api_key,
            **kwargs,
        )

    metrics_strings: Dict[str, str] = {
        "extracted_code": f"Extracted code:\n```{language}\n{code}\n```",
    }
    if expected_output_str_from_gt:
        metrics_strings["expected_output"] = f"Expected output:\n{expected_output_str_from_gt}"

    execution_result: Dict[str, Any]
    if environment.lower() == "e2b":
        if not _HAS_E2B:
            return rejected(
                "E2B package not installed for fractional code reward.",
                "E2B package not installed. Install with: pip install e2b",
            )
        execution_result = execute_code_with_e2b(code=code, language=language, timeout=timeout, api_key=api_key)
    else:
        language_key = language.lower()
        # Accept the same aliases as the E2B path and _run_test_cases ("py", "js").
        if language_key in ("python", "py"):
            execution_result = execute_python_code(code, timeout)
        elif language_key in ("javascript", "js"):
            execution_result = execute_javascript_code(code, timeout)
        else:
            final_metrics_on_error: Dict[str, MetricResult] = {
                k: MetricResult(score=0.0, reason=v, is_score_valid=(k == "extracted_code"))
                for k, v in metrics_strings.items()
            }
            final_metrics_on_error["error"] = MetricResult(
                score=0.0,
                reason=f"Unsupported language: {language}",
                is_score_valid=False,
            )
            return EvaluateResult(
                score=0.0,
                reason=f"Unsupported language for fractional code reward: {language}",
                metrics=final_metrics_on_error,
            )

    # Promote the informational strings collected so far to MetricResult objects.
    metric_results: Dict[str, MetricResult] = {
        k: MetricResult(
            score=0.0,
            reason=v,
            is_score_valid=(
                k == "extracted_code" or (k == "expected_output" and expected_output_str_from_gt is not None)
            ),
        )
        for k, v in metrics_strings.items()
    }

    if not execution_result["success"]:
        error = execution_result["error"]
        metric_results["execution_result"] = MetricResult(
            score=0.0,
            reason=f"Code execution failed with error:\n{error}",
            is_score_valid=False,
        )
        final_reason = f"Fractional code execution failed: {error}"
        return EvaluateResult(score=0.0, reason=final_reason, metrics=metric_results)

    output = execution_result["output"]
    metric_results["execution_result"] = MetricResult(
        score=1.0,
        reason=f"Code executed successfully with output:\n{output}",
        is_score_valid=True,
    )

    if not expected_output_str_from_gt:
        final_reason = "Fractional code execution successful. No expected output string to compare."
        return EvaluateResult(score=1.0, reason=final_reason, metrics=metric_results)

    similarity = compare_outputs(output, expected_output_str_from_gt)
    match_reason = (
        f"Output similarity: {similarity:.2f}\n\nExpected:\n{expected_output_str_from_gt}\n\nActual:\n{output}"
    )
    metric_results["output_match"] = MetricResult(
        score=similarity, reason=match_reason, is_score_valid=similarity == 1.0
    )
    final_reason = f"Fractional code execution successful. Output similarity: {similarity:.2f}."
    return EvaluateResult(score=similarity, reason=final_reason, metrics=metric_results)
1205
+
1206
+
1207
def _run_test_cases(
    code: str,
    language: str,
    test_cases: List[Dict[str, Any]],
    timeout: int,
    environment: str,
    api_key: Optional[str] = None,
    function_to_call: Optional[str] = None,
    prompt_for_name_extraction: Optional[str] = None,  # Not used yet, but for future use
    **kwargs: Any,  # Keep kwargs for flexibility, though function_to_call is now explicit
) -> EvaluateResult:
    """
    Run code against multiple test cases and return the fraction of passing tests.
    Can optionally call a specific function if `function_to_call` is provided.

    For every test case a small driver script is generated around the user code:
      * with `function_to_call`, the test input is parsed into arguments and the
        named function is called, printing its result;
      * without it, the user code runs as a script with the test input made
        available as stdin (Python) or through a `readline()` helper (JavaScript).
    The normalized output is then compared with the normalized expected output.

    Args:
        code: The code to execute
        language: Programming language of the code ("python"/"py" or "javascript"/"js")
        test_cases: List of test cases; each dict is read for "input" and
            "expected_output" (both default to "")
        timeout: Maximum execution time in seconds
        environment: Environment to run the code in ("local" or "e2b")
        api_key: Optional E2B API key (if using e2b environment)
        function_to_call: Optional name of a function defined by `code` to call
            with the parsed test input instead of running `code` as a script
        prompt_for_name_extraction: Currently unused
        **kwargs: Accepted for flexibility and ignored

    Returns:
        EvaluateResult with score representing the fraction of passing tests
    """
    results = []
    passed = 0
    total = len(test_cases)

    if total == 0:
        return EvaluateResult(
            score=0.0,
            reason="No test cases provided",
            metrics={"error": MetricResult(score=0.0, reason="No test cases provided", is_score_valid=False)},
        )

    language_key = language.lower()
    is_python = language_key in ("python", "py")
    is_javascript = language_key in ("javascript", "js")

    if is_python:
        if function_to_call:

            def prepare_test_code(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
                """Build a Python script that calls `func_name` with arguments parsed from the input."""
                import ast
                import json

                def refine_evaluated_value(val: Any) -> Any:
                    """Convert string values that look like JSON containers or numbers."""
                    if isinstance(val, str):
                        stripped_val = val.strip()
                        if stripped_val.startswith(("[", "{")):
                            try:
                                return json.loads(stripped_val)
                            except json.JSONDecodeError:
                                return val
                        else:
                            try:
                                if "." in stripped_val or "e" in stripped_val.lower():
                                    return float(stripped_val)
                                else:
                                    return int(stripped_val)
                            except ValueError:
                                return val
                    return val

                parsed_args = []
                args_str_stripped = test_input_str.strip()

                if args_str_stripped:
                    # First try to read the whole input as ONE argument:
                    # JSON first, then a Python literal.
                    parsed_as_single_arg = False
                    try:
                        val_from_json = json.loads(args_str_stripped)
                        parsed_args.append(refine_evaluated_value(val_from_json))
                        parsed_as_single_arg = True
                    except json.JSONDecodeError:
                        try:
                            val_from_ast = ast.literal_eval(args_str_stripped)
                            parsed_args.append(refine_evaluated_value(val_from_ast))
                            parsed_as_single_arg = True
                        except (ValueError, SyntaxError):
                            pass

                    # Otherwise split shell-style and parse each part on its own.
                    if not parsed_as_single_arg:
                        try:
                            arg_parts = shlex.split(args_str_stripped)
                        except ValueError:
                            arg_parts = [args_str_stripped]

                        for part_str in arg_parts:
                            try:
                                val_from_part_ast = ast.literal_eval(part_str)
                                parsed_args.append(refine_evaluated_value(val_from_part_ast))
                            except (ValueError, SyntaxError):
                                parsed_args.append(refine_evaluated_value(part_str))

                args_repr = ", ".join(map(repr, parsed_args))

                return f"""import sys
import json
import traceback

{user_code}

try:
    result = {func_name}({args_repr})
    print(repr(result))
except Exception as e:
    import traceback
    print(f'Error calling function {func_name}: {{traceback.format_exc()}}', file=sys.stderr)
    import sys
    sys.exit(1)
"""

        else:

            def prepare_test_code(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
                """Build a Python script that runs the user code with the input as stdin."""
                if isinstance(test_input_str, str):
                    # ascii() yields a valid, ASCII-only Python literal for ANY
                    # string, including inputs that end with a quote, contain
                    # triple quotes, or contain non-BMP characters.
                    stdin_literal = ascii(test_input_str)
                else:
                    # Non-string inputs keep the legacy rendering (JSON text with
                    # its outer delimiters stripped) for backward compatibility.
                    legacy_body = json.dumps(test_input_str)[1:-1].replace("'''", "'\\''\\''\\''")
                    stdin_literal = f"'''{legacy_body}'''"
                return f"""import sys
from io import StringIO

original_stdout = sys.stdout
sys.stdout = captured_stdout = StringIO()
sys.stdin = StringIO({stdin_literal})

try:
    exec({repr(user_code)})
except Exception as e:
    import traceback
    print(f'Error executing script: {{traceback.format_exc()}}', file=sys.stderr)
    import sys
    sys.exit(1)

sys.stdout = original_stdout
print(captured_stdout.getvalue(), end='')
"""

    elif is_javascript:
        if function_to_call:

            def prepare_test_code(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
                """Build a JavaScript snippet that calls `func_name` with whitespace-separated arguments."""
                args_str = test_input_str.strip()
                parsed_args_js = []
                if args_str:
                    for arg in args_str.split():
                        if arg.isdigit() or (arg.startswith("-") and arg[1:].isdigit()):
                            # Integer literal: pass through unquoted.
                            parsed_args_js.append(arg)
                        elif "." in arg and all(
                            c.isdigit() or c == "." or (i == 0 and c == "-") for i, c in enumerate(arg)
                        ):
                            try:
                                float(arg)
                                parsed_args_js.append(arg)
                            except ValueError:
                                parsed_args_js.append(json.dumps(arg))
                        else:
                            # Everything else is passed as a JSON-quoted string.
                            parsed_args_js.append(json.dumps(arg))

                args_js_repr = ", ".join(parsed_args_js)
                return f"""{user_code}

try {{
    const result = {func_name}({args_js_repr});
    console.log(JSON.stringify(result));
}} catch (error) {{
    console.error(`Error calling function {func_name}:`, error);
    process.exitCode = 1;
}}
"""

        else:

            def prepare_test_code(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
                """Build a JavaScript snippet that runs the user code with a `readline()` input helper."""
                input_lines = test_input_str.strip().split("\n")
                input_setup = "const inputs = " + json.dumps(input_lines) + ";\n"
                input_setup += "let inputIndex = 0;\n"
                input_setup += "const readline = () => inputs[inputIndex++];\n"
                return f"""const originalLog = console.log;
let output = '';
console.log = function(...args) {{
    output += args.map(String).join(' ') + '\\n';
}};

{input_setup}

try {{
    {user_code}
}} catch (error) {{
    console.error('Error executing script:', error);
    process.exitCode = 1;
}}

console.log = originalLog;
process.stdout.write(output);
"""

    else:
        return EvaluateResult(
            score=0.0,
            reason=f"Unsupported language for test cases: {language}",
            metrics={
                "error": MetricResult(
                    score=0.0,
                    reason=f"Unsupported language for test cases: {language}",
                    is_score_valid=False,
                )
            },
        )

    # The environment does not change between test cases, so resolve it once.
    use_e2b = environment.lower() == "e2b"
    if use_e2b and not _HAS_E2B:
        return EvaluateResult(
            score=0.0,
            reason="E2B package not installed for test cases.",
            metrics={
                "error": MetricResult(
                    score=0.0,
                    reason="E2B package not installed. Install with: pip install e2b",
                    is_score_valid=False,
                )
            },
        )

    for i, test_case in enumerate(test_cases):
        test_input = test_case.get("input", "")
        expected = test_case.get("expected_output", "")

        test_code_prepared = prepare_test_code(code, test_input, function_to_call)

        if use_e2b:
            execution_result = execute_code_with_e2b(
                code=test_code_prepared,
                language=language,
                timeout=timeout,
                api_key=api_key,
            )
        elif is_python:
            execution_result = execute_python_code(test_code_prepared, timeout)
        else:
            # Only JavaScript can remain: other languages were rejected above.
            execution_result = execute_javascript_code(test_code_prepared, timeout)

        test_result = {
            "test_number": i + 1,
            "input": test_input,
            "expected_output": expected,
            "passed": False,
            "details": "",
        }

        if execution_result["success"]:
            output = execution_result["output"]
            normalized_output = normalize_output(output)
            normalized_expected = normalize_output(expected)

            # In Python function mode the driver prints repr(result), so the
            # repr of the expected value is accepted as an alternative match.
            expected_repr = repr(expected) if function_to_call and is_python else None
            normalized_expected_repr = normalize_output(expected_repr) if expected_repr else None

            is_pass = normalized_output == normalized_expected
            if not is_pass and normalized_expected_repr:
                is_pass = normalized_output == normalized_expected_repr

            test_result["passed"] = is_pass
            test_result["actual_output"] = output
            test_result["normalized_actual"] = normalized_output
            test_result["normalized_expected"] = normalized_expected
            test_result["details"] = f"Passed: {is_pass}"

            if test_result["passed"]:
                passed += 1
        else:
            test_result["error"] = execution_result["error"]
            test_result["details"] = f"Error: {execution_result['error']}"

        results.append(test_result)

    # total is guaranteed to be positive here (the empty case returned early).
    score = passed / total
    all_passed = score == 1.0

    final_metrics: Dict[str, MetricResult] = {
        "test_results": MetricResult(
            score=score,
            reason=json.dumps(results, indent=2),
            is_score_valid=all_passed,
        ),
        "pass_rate": MetricResult(
            score=score,
            reason=f"{passed}/{total} tests passed ({score:.2%})",
            is_score_valid=all_passed,
        ),
    }

    return EvaluateResult(score=score, reason=f"{passed}/{total} tests passed.", metrics=final_metrics)
1521
+
1522
+
1523
def reliability_guard(maximum_memory_bytes: Optional[int] = None) -> None:
    """
    Neutralize destructive functions so generated code cannot interfere
    with the test system.

    Replaces a fixed set of `builtins`, `os` and `shutil` functions with `noop`,
    registers stub modules for a few optional packages, disables the fault
    handler and, when requested, applies memory resource limits.

    Args:
        maximum_memory_bytes: Maximum memory allocation allowed in bytes (optional).
            Limits are only applied when this is given and the platform is not Darwin.

    Warning:
        This function is NOT a security sandbox. Untrusted code should not be
        blindly executed outside of a proper sandbox environment.
    """
    if maximum_memory_bytes is not None and platform.uname().system != "Darwin":
        memory_limit = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, memory_limit)
        resource.setrlimit(resource.RLIMIT_STACK, memory_limit)

    faulthandler.disable()

    import builtins

    for builtin_name in ("exit", "quit"):
        setattr(builtins, builtin_name, noop)

    os.environ["OMP_NUM_THREADS"] = "1"

    # Functions patched unconditionally on the os module.
    always_disabled_os = (
        "kill",
        "system",
        "putenv",
        "remove",
        "removedirs",
        "rmdir",
        "fchdir",
        "setuid",
        "fork",
        "forkpty",
        "killpg",
        "rename",
        "renames",
        "truncate",
        "replace",
        "unlink",
        "fchmod",
        "fchown",
        "chmod",
        "chown",
        "chroot",
    )
    for os_name in always_disabled_os:
        setattr(os, os_name, noop)

    # Functions patched only where the os module provides them.
    for optional_os_name in ("lchflags", "lchmod", "lchown"):
        if hasattr(os, optional_os_name):
            setattr(os, optional_os_name, noop)

    import shutil

    for shutil_name in ("rmtree", "move", "chown"):
        setattr(shutil, shutil_name, noop)

    class EmptyModule:
        """Stand-in module whose every attribute resolves to `noop`."""

        def __getattr__(self, name: str) -> Any:
            return noop

    # Modules that are already imported are left untouched.
    for mod_name in ["ipdb", "joblib", "psutil", "tkinter"]:
        sys.modules.setdefault(mod_name, EmptyModule())  # type: ignore
1601
+
1602
+
1603
class Capturing(list):
    """
    Context manager for capturing stdout output.

    On entry `sys.stdout` is replaced with an in-memory buffer; on exit the
    complete captured text is appended to this list as a single string and the
    previous `sys.stdout` is restored.
    """

    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make closing the buffer a no-op so captured code that calls
        # sys.stdout.close() cannot discard the output. The replacement is set
        # on the instance and is therefore called WITHOUT an implicit argument,
        # so it must accept zero positional arguments.
        self._stringio.close = lambda *args: None
        return self

    def __exit__(self, *args):
        self.append(self._stringio.getvalue())
        del self._stringio
        sys.stdout = self._stdout