eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1620 @@
|
|
|
1
|
+
# mypy: ignore-errors
|
|
2
|
+
"""
|
|
3
|
+
Code execution reward functions for evaluating code correctness.
|
|
4
|
+
|
|
5
|
+
This module provides functions to evaluate the correctness of code by:
|
|
6
|
+
1. Extracting code blocks from messages
|
|
7
|
+
2. Executing the code in a secure environment (local or E2B sandbox)
|
|
8
|
+
3. Comparing the output with expected results
|
|
9
|
+
|
|
10
|
+
Available reward functions:
|
|
11
|
+
- local_code_execution_reward: Execute code locally and evaluate correctness
|
|
12
|
+
- e2b_code_execution_reward: Execute code in E2B sandbox and evaluate correctness
|
|
13
|
+
- fractional_code_reward: Execute code and return exact pass rate
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import faulthandler
|
|
17
|
+
import json
|
|
18
|
+
import multiprocessing
|
|
19
|
+
import os
|
|
20
|
+
import platform
|
|
21
|
+
import re
|
|
22
|
+
import resource
|
|
23
|
+
import shlex # Added for robust splitting of arguments
|
|
24
|
+
import signal
|
|
25
|
+
import subprocess
|
|
26
|
+
import sys
|
|
27
|
+
import tempfile
|
|
28
|
+
import traceback
|
|
29
|
+
from io import StringIO
|
|
30
|
+
from multiprocessing.managers import DictProxy
|
|
31
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
32
|
+
|
|
33
|
+
# Try to import from e2b_code_interpreter first (preferred)
|
|
34
|
+
try:
|
|
35
|
+
from e2b_code_interpreter.sync import Sandbox # type: ignore # Use SyncSandbox
|
|
36
|
+
|
|
37
|
+
_HAS_E2B = True
|
|
38
|
+
_E2B_SOURCE = "e2b_code_interpreter"
|
|
39
|
+
except ImportError:
|
|
40
|
+
# Fallback to e2b
|
|
41
|
+
try:
|
|
42
|
+
# Assuming 'e2b' package's default Sandbox is synchronous.
|
|
43
|
+
# If 'e2b' also defaults to async, this part might need adjustment too.
|
|
44
|
+
from e2b import Sandbox # type: ignore
|
|
45
|
+
|
|
46
|
+
_HAS_E2B = True
|
|
47
|
+
_E2B_SOURCE = "e2b"
|
|
48
|
+
except ImportError:
|
|
49
|
+
_HAS_E2B = False
|
|
50
|
+
_E2B_SOURCE = "" # Use empty string instead of None
|
|
51
|
+
|
|
52
|
+
from ..models import EvaluateResult, Message, MetricResult
|
|
53
|
+
from ..reward_function import reward_function
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _target_func_for_execution(result_container, execute_func, args):
    """
    Run ``execute_func(*args)`` and merge its result into ``result_container``.

    Any ``Exception`` raised by the call (or by the merge itself) is caught and
    recorded in ``result_container`` as a failed result that carries the
    exception message followed by the formatted traceback.

    Args:
        result_container: Object exposing ``update`` that receives the result.
        execute_func: Callable returning a mapping to merge into the container.
        args: Positional arguments unpacked into ``execute_func``.
    """
    try:
        result_container.update(execute_func(*args))
    except Exception as exc:
        tb_text = traceback.format_exc()
        failure = {
            "success": False,
            "output": None,
            "error": f"Execution error: {str(exc)}\n{tb_text}",
        }
        result_container.update(failure)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract_code_blocks(text: str, language: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Collect fenced (triple-backtick) code blocks from ``text``.

    Args:
        text: The text to scan for fenced code blocks.
        language: Optional language filter (case-insensitive). Blocks whose
            fence names a different language are skipped; blocks without a
            language tag are always kept.

    Returns:
        One dictionary per kept block with "language" (lower-cased fence tag,
        or "unknown" when the fence has none) and "code" (the stripped body
        with verbose chatter removed). When chatter was removed, the entry
        also carries a "verbosity_cleaned_reason" key.
    """
    fence_re = re.compile(r"```(\w*)\n([\s\S]*?)\n```")

    # Chatter a model may leave inside a fence; each pattern is deleted in order.
    noise_res = (
        re.compile(r"<think>.*?</think>", re.DOTALL),
        re.compile(r"<reasoning>.*?</reasoning>", re.DOTALL),
        # "Thinking: ..." up to the next line that starts with a non-space char.
        re.compile(r"Thinking:\s*.*?(?=\n\S)", re.DOTALL),
        re.compile(r"^\s*Here's the Python code.*?\n", re.MULTILINE | re.IGNORECASE),
        re.compile(r"^\s*Okay, here is the code:.*?\n", re.MULTILINE | re.IGNORECASE),
    )

    wanted = language.lower() if language else None
    blocks: List[Dict[str, str]] = []

    for found in fence_re.finditer(text):
        tag, body = found.group(1), found.group(2)

        # Only a tagged fence can be rejected by the language filter.
        if wanted is not None and tag and tag.lower() != wanted:
            continue

        label = tag.lower() if tag else "unknown"

        scrubbed = body
        for noise_re in noise_res:
            scrubbed = noise_re.sub("", scrubbed)

        entry = {"language": label, "code": scrubbed.strip()}
        if scrubbed != body:
            entry["verbosity_cleaned_reason"] = f"Verbose content removed from '{label}' block."
        blocks.append(entry)

    return blocks
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@reward_function
def local_code_execution_reward(
    messages: List[Message],
    ground_truth: Optional[str] = None,  # This is the new expected_output_str
    language: str = "python",
    timeout: int = 5,
    max_memory_mb: int = 100,  # Specific to local execution
    **kwargs,
) -> EvaluateResult:
    """
    Score an assistant reply by running its first code block locally.

    The last message must be an assistant ``Message`` with content. The first
    fenced block matching ``language`` is executed through
    ``execute_python_code`` or ``execute_javascript_code``; when a non-empty
    ``ground_truth`` is given, the captured output is compared against it with
    ``compare_outputs`` and that similarity becomes the score.

    Args:
        messages: Conversation messages; the last one is expected to be the
            assistant response containing the code.
        ground_truth: Expected output of the code (formerly
            ``expected_output_str``). A falsy value skips the comparison.
        language: "python", or "javascript"/"js" (case-insensitive).
        timeout: Maximum execution time in seconds.
        max_memory_mb: Accepted for interface compatibility. NOTE: it is not
            forwarded to the executors by this function.
        **kwargs: Additional keyword arguments (unused here).

    Returns:
        EvaluateResult with the score, a reason string and per-step metrics.
    """
    last_message = messages[-1] if messages else None
    has_assistant_reply = (
        isinstance(last_message, Message)
        and last_message.role == "assistant"
        and last_message.content is not None
    )
    if not has_assistant_reply:
        return EvaluateResult(
            score=0.0,
            reason="Invalid or missing assistant response in messages.",
            metrics={
                "error": MetricResult(
                    score=0.0,
                    is_score_valid=False,
                    reason="Last message not a valid assistant response.",
                )
            },
        )

    expected_output_str = ground_truth

    code_blocks = extract_code_blocks(last_message.content, language)
    if not code_blocks:
        missing_reason = f"No {language} code blocks found in model's response."
        return EvaluateResult(
            score=0.0,
            reason=missing_reason,
            metrics={
                "error": MetricResult(
                    score=0.0,
                    reason=missing_reason,
                    is_score_valid=False,
                )
            },
        )

    # Only the first matching block is executed.
    code = code_blocks[0]["code"]

    metrics: Dict[str, MetricResult] = {
        "extracted_code": MetricResult(
            score=0.0,
            reason=f"Extracted code:\n```{language}\n{code}\n```",
            is_score_valid=True,
        )
    }
    if expected_output_str:
        metrics["expected_output"] = MetricResult(
            score=0.0,
            reason=f"Expected output:\n{expected_output_str}",
            is_score_valid=True,
        )

    language_key = language.lower()
    if language_key == "python":
        execution_result = execute_python_code(code, timeout)
    elif language_key in ["javascript", "js"]:
        execution_result = execute_javascript_code(code, timeout)
    else:
        unsupported_reason = f"Unsupported language: {language}"
        metrics["error"] = MetricResult(score=0.0, reason=unsupported_reason, is_score_valid=False)
        return EvaluateResult(score=0.0, reason=unsupported_reason, metrics=metrics)

    # Failure path first: nothing to compare when the code did not run cleanly.
    if not execution_result["success"]:
        error = execution_result["error"]
        metrics["execution_result"] = MetricResult(
            score=0.0,
            reason=f"Code execution failed with error:\n{error}",
            is_score_valid=False,
        )
        return EvaluateResult(score=0.0, reason=f"Code execution failed: {error}", metrics=metrics)

    output = execution_result["output"]
    metrics["execution_result"] = MetricResult(
        score=1.0,
        reason=f"Code executed successfully with output:\n{output}",
        is_score_valid=True,
    )

    if not expected_output_str:
        return EvaluateResult(
            score=1.0,
            reason="Execution successful. No expected output to compare.",
            metrics=metrics,
        )

    similarity = compare_outputs(output, expected_output_str)
    metrics["output_match"] = MetricResult(
        score=similarity,
        reason=f"Output similarity: {similarity:.2f}\n\nExpected:\n{expected_output_str}\n\nActual:\n{output}",
        is_score_valid=similarity == 1.0,
    )
    return EvaluateResult(
        score=similarity,
        reason=f"Execution successful. Output similarity: {similarity:.2f}.",
        metrics=metrics,
    )
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _process_target_wrapper(execute_func: Callable, args: Tuple, result_container: DictProxy):
    """
    Process entry point: run ``execute_func(*args)`` and publish the result.

    The returned mapping is merged into ``result_container``. If the call or
    the merge raises an ``Exception``, a failed result holding the message and
    the formatted traceback is stored instead.

    Args:
        execute_func: Callable that performs the execution and returns a mapping.
        args: Positional arguments unpacked into ``execute_func``.
        result_container: Shared dictionary that receives the result.
    """
    try:
        outcome = execute_func(*args)
        result_container.update(outcome)
    except Exception as exc:
        formatted_tb = traceback.format_exc()
        result_container.update(
            {"success": False, "output": None, "error": f"Execution error: {str(exc)}\n{formatted_tb}"}
        )
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _execute_code_in_process(execute_func: Callable, args: Tuple, timeout: int = 5) -> Dict[str, Any]:
    """
    Execute code in a separate process with a wall-clock timeout.

    ``execute_func(*args)`` is run by ``_process_target_wrapper`` in a child
    process, and its result is passed back through a manager-backed dict.

    Args:
        execute_func: Function to execute the code
        args: Arguments to pass to the execute function
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results ("success", "output", "error").
        A timeout or an empty result is reported as a failed result.
    """
    # Manager() starts a helper server process. The context manager shuts it
    # down on every exit path so repeated calls do not accumulate processes.
    with multiprocessing.Manager() as manager:
        result_dict = manager.dict()

        process = multiprocessing.Process(target=_process_target_wrapper, args=(execute_func, args, result_dict))
        process.start()
        # Small grace period on top of the timeout enforced by execute_func itself.
        process.join(timeout=timeout + 0.5)

        if process.is_alive():
            process.terminate()
            process.join(0.5)
            if process.is_alive():
                process.kill()
                # Reap the killed worker so it does not linger as a zombie.
                process.join(0.5)
            return {
                "success": False,
                "output": None,
                "error": f"Timeout: execution timed out after {timeout} seconds",
            }

        # Copy the proxy contents while the manager is still running.
        result = dict(result_dict)

    if not result:
        return {
            "success": False,
            "output": None,
            "error": "Execution failed without producing any output",
        }

    return result
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _execute_python_in_subprocess(code: str, timeout: int) -> Dict[str, Any]:
    """
    Inner function to execute Python code in a subprocess.

    The code is prefixed with a guard that sets memory limits (skipped on
    Darwin) and replaces several ``os``/``builtins`` functions with ``None``.
    The result is written to a temporary ``.py`` file and run with the current
    interpreter under a CPU-time limit.

    Args:
        code: Python code to execute
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results:
        - exit code 0: ``success`` True, ``output`` is the stripped stdout
        - other exit code: ``success`` False, ``error`` is the stripped stderr
          or, when stderr is empty, a message with the exit code
        - timeout: ``success`` False; the child is killed and reaped
        - any other exception: ``success`` False with a "Setup error" message
    """
    guard_prelude = (
        "import sys\n"
        "import os\n"
        "import signal\n"
        "import resource\n"
        "import platform\n\n"
        "def _reliability_guard():\n"
        "    memory_limit = 100 * 1024 * 1024  # 100 MB\n"
        "    if platform.uname().system != 'Darwin':\n"
        "        resource.setrlimit(resource.RLIMIT_AS, (memory_limit, memory_limit))\n"
        "        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, memory_limit))\n"
        "        resource.setrlimit(resource.RLIMIT_STACK, (memory_limit, memory_limit))\n"
        "    import builtins\n"
        "    builtins.exit = None\n"
        "    builtins.quit = None\n"
        "    os.environ['OMP_NUM_THREADS'] = '1'\n"
        "    os.system = None\n"
        "    os.popen = None\n"
        "    os.execl = None\n"
        "    os.execve = None\n"
        "    os.fork = None\n"
        "    os.remove = None\n"
        "    os.removedirs = None\n"
        "    os.rmdir = None\n"
        "    os.unlink = None\n"
        "    os.access = None\n"
        "\n"
        "_reliability_guard()\n\n"
    )

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write((guard_prelude + code).encode("utf-8"))

        process = subprocess.Popen(
            [sys.executable, temp_file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # CPU-time cap applied in the child just before exec.
            preexec_fn=lambda: resource.setrlimit(resource.RLIMIT_CPU, (timeout, timeout + 1)),
        )

        # communicate(timeout=...) replaces a SIGALRM handler: it installs no
        # process-wide signal handler and lets us kill the child on expiry
        # so it cannot keep running after we report the timeout.
        try:
            stdout, stderr = process.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            process.communicate()  # reap the child and drain its pipes
            return {
                "success": False,
                "output": None,
                "error": f"Execution timed out after {timeout} seconds",
            }

        if process.returncode == 0:
            return {
                "success": True,
                "output": stdout.strip(),
                "error": None,
            }

        # A child killed by a signal or exiting silently leaves stderr empty;
        # fall back to the exit code so the error is never blank.
        return {
            "success": False,
            "output": None,
            "error": stderr.strip() or f"Python process exited with code {process.returncode}",
        }
    except Exception as e:
        error_traceback = traceback.format_exc()
        return {
            "success": False,
            "output": None,
            "error": f"Setup error: {str(e)}\n{error_traceback}",
        }
    finally:
        # Runs on every path, including a failure while writing the file.
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def execute_python_code(code: str, timeout: int = 5) -> Dict[str, Any]:
    """
    Run Python code through the process-isolated executor.

    Delegates to ``_execute_code_in_process`` with
    ``_execute_python_in_subprocess`` as the worker function.

    Args:
        code: Python code to execute
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results
    """
    worker_args = (code, timeout)
    return _execute_code_in_process(
        _execute_python_in_subprocess,
        args=worker_args,
        timeout=timeout,
    )
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _execute_javascript_in_subprocess(code: str, timeout: int) -> Dict[str, Any]:
    """
    Inner function to execute JavaScript code in a subprocess.

    Checks that ``node`` is available, wraps the code in a small safety
    prelude, writes it to a temporary ``.js`` file and runs it with
    ``node --no-warnings --max-old-space-size=100``.

    Args:
        code: JavaScript code to execute
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results:
        - exit code 0: ``success`` True, ``output`` is the stripped stdout
        - other exit code: ``success`` False, ``error`` is the stripped stderr
          or a message with the exit code
        - timeout: ``success`` False; the child is killed and reaped
        - Node.js missing, or any other exception: ``success`` False
    """
    temp_file_path = None
    try:
        try:
            subprocess.run(["node", "--version"], capture_output=True, check=True)
        except (subprocess.SubprocessError, FileNotFoundError):
            return {
                "success": False,
                "output": None,
                "error": "Node.js is not installed or not found in PATH",
            }

        # NOTE(review): assigning `global.require` below appears not to
        # intercept `require(...)` calls made by the script itself, because a
        # CommonJS module receives `require` as a module-local binding.
        # Confirm before relying on the module allow-list as a security boundary.
        safe_code = (
            "// Safety wrapper to prevent dangerous operations\n"
            "process.on('uncaughtException', function(err) {\n"
            "    console.error('Uncaught exception:', err.message);\n"
            "    process.exit(1);\n"
            "});\n\n"
            "process.exit = function() { console.error('exit() is disabled'); };\n"
            "process.kill = function() { console.error('kill() is disabled'); };\n"
            "const fs = require('fs');\n"
            "const originalFsReadFile = fs.readFileSync;\n"
            "const originalFsWriteFile = fs.writeFileSync;\n"
            "fs.readFileSync = function() { console.error('fs.readFileSync() is disabled'); return ''; };\n"
            "fs.writeFileSync = function() { console.error('fs.writeFileSync() is disabled'); };\n"
            "const originalRequire = require;\n"
            "global.require = function(module) {\n"
            "    const safeModules = ['assert', 'buffer', 'crypto', 'events', 'path', 'querystring',\n"
            "                         'string_decoder', 'stream', 'timers', 'url', 'util', 'zlib'];\n"
            "    if (safeModules.includes(module)) {\n"
            "        return originalRequire(module);\n"
            "    } else {\n"
            "        console.error(`Requiring module '${module}' is not allowed for security reasons`);\n"
            "        return {};\n"
            "    }\n"
            "};\n\n"
            "try {\n"
            # The user code is inserted verbatim: re-indenting it would change
            # the contents of multi-line template literals.
            + code
            + "\n"
            "} catch (error) {\n"
            "    console.error('Code execution error:', error.message);\n"
            "    process.exitCode = 1;\n"
            "}\n"
        )

        with tempfile.NamedTemporaryFile(suffix=".js", delete=False) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(safe_code.encode("utf-8"))

        process = subprocess.Popen(
            [
                "node",
                "--no-warnings",
                "--max-old-space-size=100",
                temp_file_path,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        # A single timeout mechanism: communicate(timeout=...) needs no
        # process-wide SIGALRM handler and cannot fire before `process` exists.
        try:
            stdout, stderr = process.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            stdout, stderr = process.communicate()  # reap and collect partial output
            return {
                "success": False,
                "output": None,
                "error": f"JavaScript execution timed out after {timeout} seconds (subprocess.TimeoutExpired). Output: {stdout.strip()}, Error: {stderr.strip()}",
            }

        if process.returncode == 0:
            return {
                "success": True,
                "output": stdout.strip(),
                "error": None,
            }

        return {
            "success": False,
            "output": None,
            "error": stderr.strip() or f"JavaScript process exited with code {process.returncode}",
        }
    except Exception as e:
        error_traceback = traceback.format_exc()
        return {
            "success": False,
            "output": None,
            "error": f"Setup error: {str(e)}\n{error_traceback}",
        }
    finally:
        # Runs on every path, including a failure while writing the file.
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def execute_javascript_code(code: str, timeout: int = 5) -> Dict[str, Any]:
    """
    Run JavaScript code through the process-isolated executor.

    Delegates to ``_execute_code_in_process`` with
    ``_execute_javascript_in_subprocess`` as the worker function.

    Args:
        code: JavaScript code to execute
        timeout: Maximum execution time in seconds

    Returns:
        Dictionary with execution results
    """
    worker_args = (code, timeout)
    return _execute_code_in_process(
        _execute_javascript_in_subprocess,
        args=worker_args,
        timeout=timeout,
    )
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def compare_outputs(actual: str, expected: str) -> float:
    """
    Compare actual and expected outputs to calculate a similarity score.

    Strategies are tried in order:
    1. Exact match after ``normalize_output`` -> 1.0.
    2. Both numeric: tiered score based on the relative difference.
    3. Both look like JSON lists: 30% length similarity + 70% best-match
       item similarity (items compared recursively).
    4. Either raw input spans several lines: 30% line-count similarity +
       70% mean per-line ``string_similarity`` over the common prefix.
    5. Otherwise: ``string_similarity`` of the normalized strings.

    Args:
        actual: Actual output from code execution
        expected: Expected output

    Returns:
        Similarity score between 0.0 and 1.0
    """
    actual_norm = normalize_output(actual)
    expected_norm = normalize_output(expected)

    if actual_norm == expected_norm:
        return 1.0

    if is_numeric(actual_norm) and is_numeric(expected_norm):
        try:
            actual_num = float(actual_norm)
            expected_num = float(expected_norm)

            # Relative difference is undefined for an expected value of zero.
            if expected_num == 0:
                return 1.0 if actual_num == 0 else 0.0

            rel_diff = abs(actual_num - expected_num) / abs(expected_num)
            if rel_diff <= 0.001:
                return 1.0
            elif rel_diff <= 0.01:
                return 0.9
            elif rel_diff <= 0.1:
                return 0.7
            else:
                return max(0.0, 1.0 - min(1.0, rel_diff))
        except (ValueError, TypeError):
            # Not parseable after all; fall through to the textual strategies.
            pass

    if (
        actual_norm.startswith("[")
        and actual_norm.endswith("]")
        and expected_norm.startswith("[")
        and expected_norm.endswith("]")
    ):
        try:
            actual_list = json.loads(actual_norm)
            expected_list = json.loads(expected_norm)

            if not actual_list and not expected_list:
                return 1.0

            if not isinstance(actual_list, list) or not isinstance(expected_list, list):
                raise ValueError("Not a list")

            len_similarity = 1.0 - min(
                1.0,
                abs(len(actual_list) - len(expected_list)) / max(1, max(len(actual_list), len(expected_list))),
            )

            items_similarity = 0.0
            if len(actual_list) > 0 and len(expected_list) > 0:
                total_similarity = 0.0
                for exp_item in expected_list:
                    # Order-insensitive: each expected item takes its best match.
                    best_match = 0.0
                    for act_item in actual_list:
                        item_similarity = compare_outputs(str(act_item), str(exp_item))
                        best_match = max(best_match, item_similarity)
                    total_similarity += best_match
                items_similarity = total_similarity / len(expected_list)
            return 0.3 * len_similarity + 0.7 * items_similarity
        except (ValueError, json.JSONDecodeError):
            # Not valid JSON lists; fall through to the textual strategies.
            pass

    # normalize_output() collapses every run of whitespace, newlines included,
    # so the normalized strings never contain "\n". Line structure therefore
    # has to be taken from the raw inputs, normalizing each line on its own.
    raw_actual_lines = actual.strip().splitlines()
    raw_expected_lines = expected.strip().splitlines()
    if len(raw_actual_lines) > 1 or len(raw_expected_lines) > 1:
        actual_lines = [normalize_output(line) for line in raw_actual_lines]
        expected_lines = [normalize_output(line) for line in raw_expected_lines]

        len_similarity = 1.0 - min(
            1.0,
            abs(len(actual_lines) - len(expected_lines)) / max(1, len(actual_lines), len(expected_lines)),
        )

        lines_similarity = 0.0
        common_len = min(len(actual_lines), len(expected_lines))
        if common_len > 0:
            # zip() stops at the shorter list, i.e. after common_len pairs.
            total_similarity = sum(
                string_similarity(act_line, exp_line) for act_line, exp_line in zip(actual_lines, expected_lines)
            )
            lines_similarity = total_similarity / common_len
        return 0.3 * len_similarity + 0.7 * lines_similarity

    return string_similarity(actual_norm, expected_norm)
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
def string_similarity(s1: str, s2: str) -> float:
    """
    Score how alike two strings are using their longest common subsequence.

    Args:
        s1: First string
        s2: Second string

    Returns:
        1.0 when both strings are empty, 0.0 when exactly one of them is
        empty, otherwise the LCS length divided by the longer string's length
        (a value between 0.0 and 1.0).
    """
    if not (s1 or s2):
        # Two empty strings are considered identical.
        return 1.0
    if not (s1 and s2):
        # Exactly one side is empty: nothing in common.
        return 0.0

    longer_length = max(len(s1), len(s2))
    shared_length = longest_common_subsequence_length(s1, s2)
    return shared_length / longer_length
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def longest_common_subsequence_length(s1: str, s2: str) -> int:
    """
    Calculate the length of the longest common subsequence.

    Uses the classic dynamic-programming recurrence but keeps only the
    previous row of the table, so memory use is proportional to the shorter
    string instead of to len(s1) * len(s2).

    Args:
        s1: First string
        s2: Second string

    Returns:
        Length of longest common subsequence (0 if either string is empty)
    """
    # LCS length is symmetric, so put the shorter string on the inner axis
    # to keep the rolling rows as small as possible.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return 0

    previous_row = [0] * (len(s2) + 1)
    for outer_char in s1:
        current_row = [0]
        for j, inner_char in enumerate(s2, start=1):
            if outer_char == inner_char:
                # Extend the subsequence that ended just before both characters.
                current_row.append(previous_row[j - 1] + 1)
            else:
                # Drop one character from either string and keep the better result.
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row

    return previous_row[-1]
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def normalize_output(output: str) -> str:
    """
    Normalize output for comparison.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, any style of line ending) becomes one space.

    Args:
        output: Output string to normalize

    Returns:
        Normalized output string
    """
    trimmed = output.strip()
    # "\r\n" and "\r" are whitespace themselves, so the single \s+ collapse
    # also takes care of line-ending normalization.
    return re.sub(r"\s+", " ", trimmed)
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def is_numeric(value: str) -> bool:
    """
    Tell whether a value can be converted with float().

    Args:
        value: String value to check

    Returns:
        True if float(value) succeeds, False if it raises ValueError or
        TypeError
    """
    try:
        float(value)
    except (ValueError, TypeError):
        return False
    else:
        return True
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def noop(*args: Any, **kwargs: Any) -> Any:
    """Accept any arguments, do nothing, and return None."""
    del args, kwargs  # deliberately ignored
    return None
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def execute_code_with_e2b(
    code: str,
    language: str = "python",
    timeout: int = 30,
    api_key: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Execute code within an E2B sandbox.

    The code is written to /code/script.py or /code/script.js inside the
    sandbox and run with python3 or node. The language is validated before
    the sandbox is created, so an unsupported language never provisions a
    remote sandbox.

    Args:
        code: Code to execute
        language: Programming language of the code ("python"/"py" or
            "javascript"/"js")
        timeout: Maximum execution time in seconds
        api_key: Optional E2B API key (if not provided, will use E2B_API_KEY env var)

    Returns:
        Dictionary with keys "success" (bool), "output" (captured stdout
        joined with newlines on success, otherwise None) and "error"
        (failure message, otherwise None). Exceptions are reported through
        the "error" field rather than raised.
    """

    def failure(message: str) -> Dict[str, Any]:
        # Every failure path reports the same shape.
        return {"success": False, "output": None, "error": message}

    if not _HAS_E2B:
        return failure("E2B package not installed. Install with: pip install e2b")

    try:
        if api_key is None and os.environ.get("E2B_API_KEY") is None:
            return failure(
                "API key is required for E2B execution. Set it using the api_key parameter or E2B_API_KEY environment variable."
            )

        # Validate the language up front: rejecting it after entering the
        # Sandbox context would create a sandbox only to discard it.
        normalized_language = language.lower()
        if normalized_language in ("python", "py"):
            file_path, cmd = "/code/script.py", "python3 /code/script.py"
        elif normalized_language in ("javascript", "js"):
            file_path, cmd = "/code/script.js", "node /code/script.js"
        else:
            return failure(f"Unsupported language for E2B: {language}")

        with Sandbox(api_key=api_key) as sandbox:
            stdout_lines = []
            stderr_lines = []

            def make_collector(sink):
                # Stream callbacks may receive an object exposing `.line` or
                # something else entirely; fall back to str() for the latter.
                def collect(output):
                    sink.append(output.line if hasattr(output, "line") else str(output))

                return collect

            sandbox.on_exit = lambda *args: None  # type: ignore[method-assign, assignment]

            try:
                # The attribute exposing the filesystem differs between the
                # two supported SDK packages.
                if _E2B_SOURCE == "e2b_code_interpreter":
                    fs_handler = getattr(sandbox, "filesystem", None)
                elif _E2B_SOURCE == "e2b":
                    if hasattr(sandbox, "_filesystem"):
                        fs_handler = sandbox._filesystem
                    else:
                        fs_handler = getattr(sandbox, "filesystem", None)
                else:
                    fs_handler = None

                if not fs_handler:
                    return failure("Could not access E2B sandbox filesystem handler.")

                try:
                    fs_handler.make_dir("/code")
                except Exception:
                    # Best effort: a failure here is ignored on purpose
                    # (presumably the directory may already exist - TODO
                    # confirm); the write below surfaces real problems.
                    pass

                fs_handler.write(file_path, code)
            except Exception as e:
                return failure(f"Failed to write code to sandbox: {str(e)}")

            try:
                result = sandbox.commands.run(
                    cmd,
                    on_stdout=make_collector(stdout_lines),
                    on_stderr=make_collector(stderr_lines),
                    timeout=timeout,
                )

                if result.exit_code == 0:
                    return {"success": True, "output": "\n".join(stdout_lines), "error": None}

                error_output = "\n".join(stderr_lines)
                return failure(f"Process exited with code {result.exit_code}: {error_output}")
            except Exception as e:
                return failure(f"Execution error: {str(e)}")

    except Exception as e:
        error_traceback = traceback.format_exc()
        return failure(f"E2B setup error: {str(e)}\n{error_traceback}")
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
@reward_function
def e2b_code_execution_reward(
    messages: List[Message],
    ground_truth: Optional[str] = None,
    language: str = "python",
    timeout: int = 30,
    api_key: Optional[str] = None,
    **kwargs,
) -> EvaluateResult:
    """
    Score an assistant response by running its first code block in an E2B sandbox.

    Args:
        messages: List of conversation messages. The last message must be an
            assistant Message with non-None content; its first code block for
            `language` is executed.
        ground_truth: Expected output string from code execution. When it is
            empty or None, a successful run scores 1.0.
        language: Programming language of the code ("python", "javascript", etc.)
        timeout: Maximum execution time in seconds.
        api_key: Optional E2B API key (if not provided, will use E2B_API_KEY env var).
        **kwargs: Additional keyword arguments (unused here).

    Returns:
        EvaluateResult whose score is the output similarity to `ground_truth`
        (or 1.0 when there is nothing to compare) on success, and 0.0 on any
        precondition or execution failure.
    """
    if not _HAS_E2B:
        not_installed = MetricResult(
            score=0.0,
            reason="E2B package not installed. Install with: pip install e2b",
            is_score_valid=False,
        )
        return EvaluateResult(
            score=0.0,
            reason="E2B package not installed.",
            metrics={"error": not_installed},
        )

    if api_key is None and os.environ.get("E2B_API_KEY") is None:
        missing_key = MetricResult(
            score=0.0,
            reason="E2B API key is required. Set the E2B_API_KEY environment variable or provide api_key parameter.",
            is_score_valid=False,
        )
        return EvaluateResult(
            score=0.0,
            reason="E2B API key is required.",
            metrics={"error": missing_key},
        )

    metrics: Dict[str, MetricResult] = {}

    # An empty/None message list yields None here and fails the isinstance check.
    last_message = messages[-1] if messages else None
    if (
        not isinstance(last_message, Message)
        or last_message.role != "assistant"
        or last_message.content is None
    ):
        bad_message = MetricResult(
            score=0.0,
            is_score_valid=False,
            reason="Last message not a valid assistant response.",
        )
        return EvaluateResult(
            score=0.0,
            reason="Invalid or missing assistant response in messages.",
            metrics={"error": bad_message},
        )

    expected_output_str = ground_truth
    code_blocks = extract_code_blocks(last_message.content, language)

    if not code_blocks:
        no_code_reason = f"No {language} code blocks found in model's response."
        return EvaluateResult(
            score=0.0,
            reason=no_code_reason,
            metrics={"error": MetricResult(score=0.0, reason=no_code_reason, is_score_valid=False)},
        )

    # Only the first extracted block is executed.
    code = code_blocks[0]["code"]

    metrics["extracted_code"] = MetricResult(
        score=0.0,
        reason=f"Extracted code:\n```{language}\n{code}\n```",
        is_score_valid=True,
    )

    if expected_output_str:
        metrics["expected_output"] = MetricResult(
            score=0.0,
            reason=f"Expected output:\n{expected_output_str}",
            is_score_valid=True,
        )

    execution_result = execute_code_with_e2b(code=code, language=language, timeout=timeout, api_key=api_key)

    if not execution_result["success"]:
        error = execution_result["error"]
        metrics["execution_result"] = MetricResult(
            score=0.0,
            reason=f"Code execution failed in E2B sandbox with error:\n{error}",
            is_score_valid=False,
        )
        return EvaluateResult(score=0.0, reason=f"E2B code execution failed: {error}", metrics=metrics)

    output = execution_result["output"]
    metrics["execution_result"] = MetricResult(
        score=1.0,
        reason=f"Code executed successfully in E2B sandbox with output:\n{output}",
        is_score_valid=True,
    )

    if not expected_output_str:
        return EvaluateResult(
            score=1.0,
            reason="E2B execution successful. No expected output to compare.",
            metrics=metrics,
        )

    similarity = compare_outputs(output, expected_output_str)
    match_reason = f"Output similarity: {similarity:.2f}\n\nExpected:\n{expected_output_str}\n\nActual:\n{output}"
    metrics["output_match"] = MetricResult(
        score=similarity,
        reason=match_reason,
        is_score_valid=similarity == 1.0,
    )
    return EvaluateResult(
        score=similarity,
        reason=f"E2B execution successful. Output similarity: {similarity:.2f}.",
        metrics=metrics,
    )
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
@reward_function
def fractional_code_reward(
    messages: List[Message],
    ground_truth: Union[Optional[str], Optional[List[Dict[str, Any]]]],
    language: str = "python",
    timeout: int = 30,
    environment: str = "local",
    api_key: Optional[str] = None,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Execute code and return the exact pass rate as a score between 0 and 1.

    Unlike the binary code reward, this function returns the actual score representing
    how closely the code output matches the expected output or how many test cases pass.

    Args:
        messages: List of conversation messages. The last message is assumed to be the
            assistant's response containing the code.
        ground_truth: Expected output string from code execution, OR a list of test cases.
            If a string, it's direct output comparison.
            If a list of dicts, each dict is a test case with "input" and "expected_output".
            If None (or an empty list), a successful run scores 1.0.
        language: Programming language of the code ("python"/"py" or "javascript"/"js").
            The "py" alias is accepted for local execution too, matching the
            test-case runner and the E2B executor in this module.
        timeout: Maximum execution time in seconds.
        environment: Environment to run the code in ("local" or "e2b").
        api_key: Optional E2B API key (if using e2b environment).
        **kwargs: Additional keyword arguments (e.g., function_to_call for _run_test_cases).

    Returns:
        EvaluateResult with score between 0 and 1 representing the exact pass rate.
    """

    def error_result(reason: str, detail: str) -> EvaluateResult:
        # Shared shape of every early-exit failure: score 0 plus one "error" metric.
        return EvaluateResult(
            score=0.0,
            reason=reason,
            metrics={"error": MetricResult(score=0.0, reason=detail, is_score_valid=False)},
        )

    # An empty/None message list yields None here and fails the isinstance check.
    last_message = messages[-1] if messages else None
    if (
        not isinstance(last_message, Message)
        or last_message.role != "assistant"
        or last_message.content is None
    ):
        return error_result(
            "Invalid or missing assistant response in messages for fractional code reward.",
            "Last message not a valid assistant response.",
        )

    expected_output_str_from_gt: Optional[str] = None
    test_cases_from_gt: Optional[List[Dict[str, Any]]] = None

    if isinstance(ground_truth, str):
        expected_output_str_from_gt = ground_truth
    elif isinstance(ground_truth, list):
        if not all(isinstance(item, dict) for item in ground_truth):
            return error_result(
                "Invalid ground_truth format: expected string or list of test case dicts.",
                "Invalid ground_truth format.",
            )
        test_cases_from_gt = ground_truth
    elif ground_truth is not None:
        return error_result(
            "Invalid ground_truth format: expected string, list of test case dicts, or None.",
            "Invalid ground_truth format.",
        )

    code_blocks = extract_code_blocks(last_message.content, language)
    if not code_blocks:
        return error_result(
            f"No {language} code blocks found in model's response for fractional code reward.",
            f"No {language} code blocks found in model's response.",
        )

    # Only the first extracted block is executed.
    code = code_blocks[0]["code"]

    if test_cases_from_gt:
        # Test-case mode: the score is the fraction of passing cases.
        return _run_test_cases(
            code=code,
            language=language,
            test_cases=test_cases_from_gt,
            timeout=timeout,
            environment=environment,
            api_key=api_key,
            **kwargs,
        )

    # Informational metrics, kept as text until we know which validity flags apply.
    metrics_strings: Dict[str, str] = {
        "extracted_code": f"Extracted code:\n```{language}\n{code}\n```",
    }
    if expected_output_str_from_gt:
        metrics_strings["expected_output"] = f"Expected output:\n{expected_output_str_from_gt}"

    execution_result: Dict[str, Any]
    if environment.lower() == "e2b":
        if not _HAS_E2B:
            return error_result(
                "E2B package not installed for fractional code reward.",
                "E2B package not installed. Install with: pip install e2b",
            )
        execution_result = execute_code_with_e2b(code=code, language=language, timeout=timeout, api_key=api_key)
    elif language.lower() in ("python", "py"):
        execution_result = execute_python_code(code, timeout)
    elif language.lower() in ("javascript", "js"):
        execution_result = execute_javascript_code(code, timeout)
    else:
        final_metrics_on_error: Dict[str, MetricResult] = {
            name: MetricResult(score=0.0, reason=text, is_score_valid=(name == "extracted_code"))
            for name, text in metrics_strings.items()
        }
        final_metrics_on_error["error"] = MetricResult(
            score=0.0,
            reason=f"Unsupported language: {language}",
            is_score_valid=False,
        )
        return EvaluateResult(
            score=0.0,
            reason=f"Unsupported language for fractional code reward: {language}",
            metrics=final_metrics_on_error,
        )

    metric_results: Dict[str, MetricResult] = {
        name: MetricResult(
            score=0.0,
            reason=text,
            is_score_valid=(
                name == "extracted_code" or (name == "expected_output" and expected_output_str_from_gt is not None)
            ),
        )
        for name, text in metrics_strings.items()
    }

    if not execution_result["success"]:
        error = execution_result["error"]
        metric_results["execution_result"] = MetricResult(
            score=0.0,
            reason=f"Code execution failed with error:\n{error}",
            is_score_valid=False,
        )
        return EvaluateResult(
            score=0.0,
            reason=f"Fractional code execution failed: {error}",
            metrics=metric_results,
        )

    output = execution_result["output"]
    metric_results["execution_result"] = MetricResult(
        score=1.0,
        reason=f"Code executed successfully with output:\n{output}",
        is_score_valid=True,
    )

    if not expected_output_str_from_gt:
        return EvaluateResult(
            score=1.0,
            reason="Fractional code execution successful. No expected output string to compare.",
            metrics=metric_results,
        )

    similarity = compare_outputs(output, expected_output_str_from_gt)
    match_reason = (
        f"Output similarity: {similarity:.2f}\n\nExpected:\n{expected_output_str_from_gt}\n\nActual:\n{output}"
    )
    metric_results["output_match"] = MetricResult(
        score=similarity,
        reason=match_reason,
        is_score_valid=similarity == 1.0,
    )
    return EvaluateResult(
        score=similarity,
        reason=f"Fractional code execution successful. Output similarity: {similarity:.2f}.",
        metrics=metric_results,
    )
|
|
1205
|
+
|
|
1206
|
+
|
|
1207
|
+
def _coerce_test_text(value: Any) -> str:
    """Return a test-case field as text: str unchanged, None as "", anything else via str()."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def _build_python_call_harness(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
    """Build a Python script that runs user_code, calls func_name with args parsed from the input, and prints repr(result)."""
    import ast
    import json

    def refine_evaluated_value(val: Any) -> Any:
        # Strings that look like JSON containers or numbers are converted;
        # everything else is passed through untouched.
        if not isinstance(val, str):
            return val
        stripped_val = val.strip()
        if stripped_val.startswith(("[", "{")):
            try:
                return json.loads(stripped_val)
            except json.JSONDecodeError:
                return val
        try:
            if "." in stripped_val or "e" in stripped_val.lower():
                return float(stripped_val)
            return int(stripped_val)
        except ValueError:
            return val

    parsed_args = []
    args_str_stripped = test_input_str.strip()

    if args_str_stripped:
        # First try to read the whole input as ONE argument (JSON, then a
        # Python literal); only if both fail is it split shell-style.
        parsed_as_single_arg = False
        try:
            parsed_args.append(refine_evaluated_value(json.loads(args_str_stripped)))
            parsed_as_single_arg = True
        except json.JSONDecodeError:
            try:
                parsed_args.append(refine_evaluated_value(ast.literal_eval(args_str_stripped)))
                parsed_as_single_arg = True
            except (ValueError, SyntaxError):
                pass

        if not parsed_as_single_arg:
            try:
                arg_parts = shlex.split(args_str_stripped)
            except ValueError:
                # Unbalanced quotes etc.: treat the whole input as one token.
                arg_parts = [args_str_stripped]

            for part_str in arg_parts:
                try:
                    parsed_args.append(refine_evaluated_value(ast.literal_eval(part_str)))
                except (ValueError, SyntaxError):
                    parsed_args.append(refine_evaluated_value(part_str))

    args_repr = ", ".join(map(repr, parsed_args))

    return f"""import sys
import json
import traceback

{user_code}

try:
    result = {func_name}({args_repr})
    print(repr(result))
except Exception as e:
    import traceback
    print(f'Error calling function {func_name}: {{traceback.format_exc()}}', file=sys.stderr)
    import sys
    sys.exit(1)
"""


def _build_python_stdin_harness(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
    """Build a Python script that exec()s user_code with the test input on stdin and echoes the captured stdout."""
    # repr() yields a literal that round-trips ANY string exactly. The
    # previous json.dumps(...)[1:-1] inside '''...''' broke on inputs ending
    # with a quote, altered inputs containing ''' and split non-BMP
    # characters into lone surrogates.
    return f"""import sys
from io import StringIO

original_stdout = sys.stdout
sys.stdout = captured_stdout = StringIO()
sys.stdin = StringIO({test_input_str!r})

try:
    exec({user_code!r})
except Exception as e:
    import traceback
    print(f'Error executing script: {{traceback.format_exc()}}', file=sys.stderr)
    import sys
    sys.exit(1)

sys.stdout = original_stdout
print(captured_stdout.getvalue(), end='')
"""


def _build_js_call_harness(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
    """Build a JavaScript script that runs user_code, calls func_name with whitespace-separated args, and logs the JSON result."""
    parsed_args_js = []
    for arg in test_input_str.strip().split():
        is_integer = arg.isdigit() or (arg.startswith("-") and arg[1:].isdigit())
        looks_decimal = "." in arg and all(
            c.isdigit() or c == "." or (i == 0 and c == "-") for i, c in enumerate(arg)
        )
        if is_integer:
            parsed_args_js.append(arg)
        elif looks_decimal:
            try:
                float(arg)
                parsed_args_js.append(arg)
            except ValueError:
                # e.g. "1.2.3": digits and dots but not a number -> string.
                parsed_args_js.append(json.dumps(arg))
        else:
            # Anything non-numeric is passed as a JSON-quoted string literal.
            parsed_args_js.append(json.dumps(arg))

    args_js_repr = ", ".join(parsed_args_js)
    return f"""{user_code}

try {{
    const result = {func_name}({args_js_repr});
    console.log(JSON.stringify(result));
}} catch (error) {{
    console.error(`Error calling function {func_name}:`, error);
    process.exitCode = 1;
}}
"""


def _build_js_stdin_harness(user_code: str, test_input_str: str, func_name: Optional[str]) -> str:
    """Build a JavaScript script that runs user_code with a readline() stub over the input lines and writes captured console.log output."""
    input_lines = test_input_str.strip().split("\n")
    input_setup = "const inputs = " + json.dumps(input_lines) + ";\n"
    input_setup += "let inputIndex = 0;\n"
    input_setup += "const readline = () => inputs[inputIndex++];\n"
    return f"""const originalLog = console.log;
let output = '';
console.log = function(...args) {{
    output += args.map(String).join(' ') + '\\n';
}};

{input_setup}

try {{
    {user_code}
}} catch (error) {{
    console.error('Error executing script:', error);
    process.exitCode = 1;
}}

console.log = originalLog;
process.stdout.write(output);
"""


def _run_test_cases(
    code: str,
    language: str,
    test_cases: List[Dict[str, Any]],
    timeout: int,
    environment: str,
    api_key: Optional[str] = None,
    function_to_call: Optional[str] = None,
    prompt_for_name_extraction: Optional[str] = None,  # Not used yet, but for future use
    **kwargs: Any,  # Keep kwargs for flexibility, though function_to_call is now explicit
) -> EvaluateResult:
    """
    Run code against multiple test cases and return the fraction of passing tests.

    For each test case a small harness script is generated around `code`:
    with `function_to_call` set, the harness calls that function with
    arguments parsed from the test input and prints the result; otherwise the
    code runs as a script with the test input supplied on stdin (Python) or
    through a `readline()` stub (JavaScript). A case passes when the
    whitespace-normalized output equals the whitespace-normalized expected
    output; for Python function calls the repr() of the expected value is
    accepted as well, because the harness prints repr(result).

    Args:
        code: The code to execute
        language: Programming language of the code ("python"/"py" or "javascript"/"js")
        test_cases: List of dicts with "input" and "expected_output" entries.
            Non-string values are converted with str() (None becomes "")
            before being used.
        timeout: Maximum execution time in seconds (applied per test case)
        environment: Environment to run the code in ("e2b"; any other value runs locally)
        api_key: Optional E2B API key (if using e2b environment)
        function_to_call: Optional name of a function defined by `code` to call
        prompt_for_name_extraction: Currently unused
        **kwargs: Ignored

    Returns:
        EvaluateResult with score representing the fraction of passing tests,
        plus "test_results" (JSON details per case) and "pass_rate" metrics.
        Score is 0.0 with an "error" metric when there are no test cases, the
        language is unsupported, or E2B is requested but not installed.
    """

    def error_result(reason: str, detail: Optional[str] = None) -> EvaluateResult:
        # Early-exit failures share one shape; detail defaults to the reason.
        return EvaluateResult(
            score=0.0,
            reason=reason,
            metrics={
                "error": MetricResult(
                    score=0.0,
                    reason=reason if detail is None else detail,
                    is_score_valid=False,
                )
            },
        )

    total = len(test_cases)
    if total == 0:
        return error_result("No test cases provided")

    # Pick the harness builder and the local executor once, up front.
    normalized_language = language.lower()
    is_python = normalized_language in ("python", "py")
    if is_python:
        prepare_test_code = _build_python_call_harness if function_to_call else _build_python_stdin_harness
        run_locally = execute_python_code
    elif normalized_language in ("javascript", "js"):
        prepare_test_code = _build_js_call_harness if function_to_call else _build_js_stdin_harness
        run_locally = execute_javascript_code
    else:
        return error_result(f"Unsupported language for test cases: {language}")

    use_e2b = environment.lower() == "e2b"
    if use_e2b and not _HAS_E2B:
        # Checked once here rather than on every loop iteration.
        return error_result(
            "E2B package not installed for test cases.",
            "E2B package not installed. Install with: pip install e2b",
        )

    results = []
    passed = 0

    for test_number, test_case in enumerate(test_cases, start=1):
        test_input = test_case.get("input", "")
        expected = test_case.get("expected_output", "")

        test_code_prepared = prepare_test_code(code, _coerce_test_text(test_input), function_to_call)

        if use_e2b:
            execution_result = execute_code_with_e2b(
                code=test_code_prepared,
                language=language,
                timeout=timeout,
                api_key=api_key,
            )
        else:
            execution_result = run_locally(test_code_prepared, timeout)

        test_result = {
            "test_number": test_number,
            "input": test_input,
            "expected_output": expected,
            "passed": False,
            "details": "",
        }

        if execution_result["success"]:
            output = execution_result["output"]
            normalized_output = normalize_output(output)
            normalized_expected = normalize_output(_coerce_test_text(expected))

            is_pass = normalized_output == normalized_expected
            if not is_pass and function_to_call and is_python:
                # The Python call harness prints repr(result), so a string
                # result arrives quoted; accept the repr of the expectation.
                is_pass = normalized_output == normalize_output(repr(expected))

            test_result["passed"] = is_pass
            test_result["actual_output"] = output
            test_result["normalized_actual"] = normalized_output
            test_result["normalized_expected"] = normalized_expected
            test_result["details"] = f"Passed: {is_pass}"

            if is_pass:
                passed += 1
        else:
            test_result["error"] = execution_result["error"]
            test_result["details"] = f"Error: {execution_result['error']}"

        results.append(test_result)

    score = passed / total
    pass_rate = f"{passed}/{total} tests passed ({score:.2%})"

    final_metrics: Dict[str, MetricResult] = {
        "test_results": MetricResult(
            score=score,
            reason=json.dumps(results, indent=2),
            is_score_valid=score == 1.0,
        ),
        "pass_rate": MetricResult(
            score=score,
            reason=pass_rate,
            is_score_valid=score == 1.0,
        ),
    }

    return EvaluateResult(score=score, reason=f"{passed}/{total} tests passed.", metrics=final_metrics)
|
|
1521
|
+
|
|
1522
|
+
|
|
1523
|
+
def reliability_guard(maximum_memory_bytes: Optional[int] = None) -> None:
    """
    Neutralize destructive process-level APIs before running generated code.

    The guard optionally caps the process's memory via ``resource`` limits,
    turns off ``faulthandler``, and replaces a fixed set of functions on
    ``builtins``, ``os`` and ``shutil`` with the module-level ``noop``.
    It also registers stub entries in ``sys.modules`` for a few modules that
    are not already imported. All changes are process-wide and are not
    undone by this function.

    Args:
        maximum_memory_bytes: When given, used as both the soft and hard limit
            for the address space, data segment and stack. The limits are
            skipped on Darwin. ``None`` leaves resource limits untouched.

    Warning:
        This function is NOT a security sandbox. Untrusted code should not be
        blindly executed outside of a proper sandbox environment.
    """
    if maximum_memory_bytes is not None and platform.uname().system != "Darwin":
        limit_pair = (maximum_memory_bytes, maximum_memory_bytes)
        # Resolve each constant lazily so the calls happen in the same order.
        for limit_name in ("RLIMIT_AS", "RLIMIT_DATA", "RLIMIT_STACK"):
            resource.setrlimit(getattr(resource, limit_name), limit_pair)

    faulthandler.disable()

    import builtins

    for builtin_name in ("exit", "quit"):
        setattr(builtins, builtin_name, noop)

    # Must run before os.putenv is stubbed out below: os.environ forwards
    # assignments to putenv.
    os.environ["OMP_NUM_THREADS"] = "1"

    always_patched = (
        "kill",
        "system",
        "putenv",
        "remove",
        "removedirs",
        "rmdir",
        "fchdir",
        "setuid",
        "fork",
        "forkpty",
        "killpg",
        "rename",
        "renames",
        "truncate",
        "replace",
        "unlink",
        "fchmod",
        "fchown",
        "chmod",
        "chown",
        "chroot",
    )
    for os_name in always_patched:
        setattr(os, os_name, noop)

    # These only exist on some platforms; patch them only where present.
    for optional_name in ("lchflags", "lchmod", "lchown"):
        if hasattr(os, optional_name):
            setattr(os, optional_name, noop)

    import shutil

    for shutil_name in ("rmtree", "move", "chown"):
        setattr(shutil, shutil_name, noop)

    class EmptyModule:
        """Stand-in module object: every attribute lookup yields ``noop``."""

        def __getattr__(self, name: str) -> Any:
            return noop

    # Leave modules that are already imported alone; stub only the missing ones.
    for mod_name in ("ipdb", "joblib", "psutil", "tkinter"):
        if mod_name in sys.modules:
            continue
        sys.modules[mod_name] = EmptyModule()  # type: ignore
|
|
1601
|
+
|
|
1602
|
+
|
|
1603
|
+
class Capturing(list):
    """
    Context manager for capturing stdout output.

    On entry, ``sys.stdout`` is replaced with an in-memory ``StringIO``
    buffer. On exit, everything written to that buffer is appended to this
    list as a single string and the previous ``sys.stdout`` is restored.

    Example:
        with Capturing() as captured:
            print("hi")
        # captured == ["hi\\n"]
    """

    def __enter__(self):
        """Redirect ``sys.stdout`` to a fresh buffer and return this list."""
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make close() a no-op so executed code that closes stdout cannot
        # discard the captured text. The callable is stored on the instance,
        # so it is invoked without ``self`` and must accept zero arguments.
        self._stringio.close = lambda *args, **kwargs: None
        return self

    def __exit__(self, *args):
        """Store the captured text and restore the original ``sys.stdout``.

        Returns ``None``, so exceptions raised inside the ``with`` body
        propagate to the caller.
        """
        try:
            self.append(self._stringio.getvalue())
        finally:
            # Always put the real stdout back, even if reading the buffer fails.
            del self._stringio
            sys.stdout = self._stdout
|