human-eval-rust 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HumanEval_rust.jsonl +164 -0
- data/HumanEval_rust_extended.jsonl +2 -0
- data/example_rust_problem.jsonl +1 -0
- data/example_rust_samples.jsonl +4 -0
- human_eval/__init__.py +23 -0
- human_eval/data.py +74 -0
- human_eval/evaluate_functional_correctness.py +112 -0
- human_eval/evaluation.py +281 -0
- human_eval/execution.py +186 -0
- human_eval/logging_config.py +43 -0
- human_eval/resource_monitor.py +58 -0
- human_eval/rust_execution.py +802 -0
- human_eval/sandbox.py +586 -0
- human_eval_rust-2.1.0.dist-info/METADATA +488 -0
- human_eval_rust-2.1.0.dist-info/RECORD +19 -0
- human_eval_rust-2.1.0.dist-info/WHEEL +5 -0
- human_eval_rust-2.1.0.dist-info/entry_points.txt +2 -0
- human_eval_rust-2.1.0.dist-info/licenses/LICENSE +21 -0
- human_eval_rust-2.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,802 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Rust-specific execution module for HumanEval evaluation.
|
|
3
|
+
|
|
4
|
+
Handles compilation and test execution of Rust code completions with sandboxing support.
|
|
5
|
+
|
|
6
|
+
Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
|
|
7
|
+
Version: 2.1.0
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import multiprocessing
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
import shutil
|
|
14
|
+
import subprocess
|
|
15
|
+
import time
|
|
16
|
+
import unicodedata
|
|
17
|
+
|
|
18
|
+
# Use relative import to avoid circular dependency with execution.py
|
|
19
|
+
from .execution import TimeoutException, create_tempdir, reliability_guard, time_limit
|
|
20
|
+
|
|
21
|
+
# Try to import sandbox module (optional)
|
|
22
|
+
try:
    from .sandbox import SandboxError, run_binary_sandboxed, run_rustc_sandboxed

    SANDBOX_AVAILABLE = True
except ImportError:
    # The sandbox module could not be imported. Record that fact and bind
    # placeholder names so the rest of this module can still reference
    # SandboxError and the two run_* helpers without a NameError.
    SANDBOX_AVAILABLE = False
    # With this alias an `except SandboxError` clause catches any Exception.
    SandboxError = Exception

    # Define stub functions to satisfy type checker.
    # The call sites in this module are gated on SANDBOX_AVAILABLE, which is
    # False on this path, so the stubs are not expected to run; if one is
    # called anyway it fails loudly.
    def run_rustc_sandboxed(
        source_file: str,
        output_binary: str,
        command_args: list[str],
        timeout: float = 30.0,
        capture_output: bool = True,
        sandbox_mode: str | None = None,
    ) -> subprocess.CompletedProcess[str]:
        """Placeholder used when the sandbox module is missing; always raises RuntimeError."""
        raise RuntimeError("Sandbox not available")

    def run_binary_sandboxed(
        binary_path: str,
        timeout: float = 30.0,
        capture_output: bool = True,
        sandbox_mode: str | None = None,
    ) -> subprocess.CompletedProcess[str]:
        """Placeholder used when the sandbox module is missing; always raises RuntimeError."""
        raise RuntimeError("Sandbox not available")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
DISALLOWED_COMPLETION_PATTERNS = [
|
|
52
|
+
# Filesystem operations
|
|
53
|
+
"std::fs",
|
|
54
|
+
"std::path",
|
|
55
|
+
"std::io::write",
|
|
56
|
+
"std::io::read",
|
|
57
|
+
"std::io::copy",
|
|
58
|
+
"std::io::create",
|
|
59
|
+
"std::io::remove",
|
|
60
|
+
"std::io::rename",
|
|
61
|
+
"std::io::metadata",
|
|
62
|
+
"std::io::symlink",
|
|
63
|
+
"std::io::hard_link",
|
|
64
|
+
"std::io::canonicalize",
|
|
65
|
+
"std::io::read_dir",
|
|
66
|
+
"std::io::read_to_string",
|
|
67
|
+
"std::io::read_to_end",
|
|
68
|
+
"std::io::read_exact",
|
|
69
|
+
"std::io::write_all",
|
|
70
|
+
"std::io::write_fmt",
|
|
71
|
+
"std::io::flush",
|
|
72
|
+
"std::io::seek",
|
|
73
|
+
"std::io::set_permissions",
|
|
74
|
+
"std::io::remove_file",
|
|
75
|
+
"std::io::remove_dir",
|
|
76
|
+
"std::io::remove_dir_all",
|
|
77
|
+
"std::io::create_dir",
|
|
78
|
+
"std::io::create_dir_all",
|
|
79
|
+
"std::io::rename",
|
|
80
|
+
"std::io::copy",
|
|
81
|
+
"std::io::hard_link",
|
|
82
|
+
"std::io::symlink_metadata",
|
|
83
|
+
"std::io::read_link",
|
|
84
|
+
"std::io::canonicalize",
|
|
85
|
+
"std::io::File::create",
|
|
86
|
+
"std::io::File::open",
|
|
87
|
+
"std::io::File::create_new",
|
|
88
|
+
"std::io::File::read",
|
|
89
|
+
"std::io::File::write",
|
|
90
|
+
# Process and system operations
|
|
91
|
+
"std::process",
|
|
92
|
+
"std::process::Command",
|
|
93
|
+
"std::process::Command::new",
|
|
94
|
+
"std::process::Command::spawn",
|
|
95
|
+
"std::process::Command::output",
|
|
96
|
+
"std::process::Command::status",
|
|
97
|
+
"std::process::exit",
|
|
98
|
+
"std::process::abort",
|
|
99
|
+
"std::process::id",
|
|
100
|
+
"std::process::parent_id",
|
|
101
|
+
"command::new",
|
|
102
|
+
"command::spawn",
|
|
103
|
+
"command::output",
|
|
104
|
+
# Network operations
|
|
105
|
+
"std::net",
|
|
106
|
+
"std::net::TcpStream",
|
|
107
|
+
"std::net::TcpListener",
|
|
108
|
+
"std::net::UdpSocket",
|
|
109
|
+
"std::net::UnixStream",
|
|
110
|
+
"std::net::UnixListener",
|
|
111
|
+
"std::net::SocketAddr",
|
|
112
|
+
"std::net::IpAddr",
|
|
113
|
+
"std::net::Ipv4Addr",
|
|
114
|
+
"std::net::Ipv6Addr",
|
|
115
|
+
"std::net::ToSocketAddrs",
|
|
116
|
+
"std::net::lookup_host",
|
|
117
|
+
"reqwest",
|
|
118
|
+
"ureq",
|
|
119
|
+
"hyper",
|
|
120
|
+
"tokio::net",
|
|
121
|
+
"tokio::net::TcpStream",
|
|
122
|
+
"tokio::net::TcpListener",
|
|
123
|
+
"tokio::net::UdpSocket",
|
|
124
|
+
"tokio::net::UnixStream",
|
|
125
|
+
"tokio::net::UnixListener",
|
|
126
|
+
# Threading and concurrency
|
|
127
|
+
"std::thread",
|
|
128
|
+
"std::thread::spawn",
|
|
129
|
+
"std::thread::Builder",
|
|
130
|
+
"std::thread::Thread",
|
|
131
|
+
"std::thread::park",
|
|
132
|
+
"std::thread::yield_now",
|
|
133
|
+
"std::thread::sleep",
|
|
134
|
+
"std::thread::available_parallelism",
|
|
135
|
+
"std::sync::mpsc",
|
|
136
|
+
"std::sync::mpsc::channel",
|
|
137
|
+
"std::sync::mpsc::sync_channel",
|
|
138
|
+
"std::sync::Arc",
|
|
139
|
+
"std::sync::Mutex",
|
|
140
|
+
"std::sync::RwLock",
|
|
141
|
+
"std::sync::Condvar",
|
|
142
|
+
"std::sync::Barrier",
|
|
143
|
+
"std::sync::Once",
|
|
144
|
+
"std::sync::atomic",
|
|
145
|
+
"tokio::spawn",
|
|
146
|
+
"tokio::task",
|
|
147
|
+
"tokio::runtime",
|
|
148
|
+
# Unsafe code
|
|
149
|
+
"unsafe",
|
|
150
|
+
"unsafe fn",
|
|
151
|
+
"unsafe trait",
|
|
152
|
+
"unsafe impl",
|
|
153
|
+
"unsafe block",
|
|
154
|
+
"unsafe {}",
|
|
155
|
+
# Memory operations
|
|
156
|
+
"std::alloc",
|
|
157
|
+
"std::alloc::alloc",
|
|
158
|
+
"std::alloc::dealloc",
|
|
159
|
+
"std::alloc::realloc",
|
|
160
|
+
"std::alloc::Layout",
|
|
161
|
+
"std::ptr",
|
|
162
|
+
"std::ptr::null",
|
|
163
|
+
"std::ptr::null_mut",
|
|
164
|
+
"std::ptr::read",
|
|
165
|
+
"std::ptr::write",
|
|
166
|
+
"std::ptr::copy",
|
|
167
|
+
"std::ptr::copy_nonoverlapping",
|
|
168
|
+
"std::ptr::swap",
|
|
169
|
+
"std::ptr::replace",
|
|
170
|
+
"std::ptr::drop_in_place",
|
|
171
|
+
"std::mem",
|
|
172
|
+
"std::mem::forget",
|
|
173
|
+
"std::mem::transmute",
|
|
174
|
+
"std::mem::zeroed",
|
|
175
|
+
"std::mem::uninitialized",
|
|
176
|
+
"std::mem::replace",
|
|
177
|
+
"std::mem::swap",
|
|
178
|
+
"std::mem::take",
|
|
179
|
+
"std::mem::size_of",
|
|
180
|
+
"std::mem::align_of",
|
|
181
|
+
"std::mem::size_of_val",
|
|
182
|
+
"std::mem::align_of_val",
|
|
183
|
+
"std::mem::needs_drop",
|
|
184
|
+
"std::mem::drop",
|
|
185
|
+
"std::mem::forget",
|
|
186
|
+
"std::mem::transmute",
|
|
187
|
+
"std::mem::zeroed",
|
|
188
|
+
"std::mem::uninitialized",
|
|
189
|
+
"std::mem::MaybeUninit",
|
|
190
|
+
# Environment and system
|
|
191
|
+
"std::env",
|
|
192
|
+
"std::env::var",
|
|
193
|
+
"std::env::vars",
|
|
194
|
+
"std::env::set_var",
|
|
195
|
+
"std::env::remove_var",
|
|
196
|
+
"std::env::current_dir",
|
|
197
|
+
"std::env::set_current_dir",
|
|
198
|
+
"std::env::args",
|
|
199
|
+
"std::env::args_os",
|
|
200
|
+
"std::env::consts",
|
|
201
|
+
"std::env::home_dir",
|
|
202
|
+
"std::env::temp_dir",
|
|
203
|
+
# Time and system calls
|
|
204
|
+
"std::time::SystemTime",
|
|
205
|
+
"std::time::UNIX_EPOCH",
|
|
206
|
+
"std::time::Duration",
|
|
207
|
+
"std::time::Instant",
|
|
208
|
+
# External process execution
|
|
209
|
+
"std::os",
|
|
210
|
+
"std::os::unix",
|
|
211
|
+
"std::os::windows",
|
|
212
|
+
"std::os::linux",
|
|
213
|
+
"std::os::macos",
|
|
214
|
+
# FFI (Foreign Function Interface)
|
|
215
|
+
"extern",
|
|
216
|
+
'extern "C"',
|
|
217
|
+
'extern "system"',
|
|
218
|
+
"libc",
|
|
219
|
+
"winapi",
|
|
220
|
+
# Dynamic loading
|
|
221
|
+
"std::ffi",
|
|
222
|
+
"std::ffi::CString",
|
|
223
|
+
"std::ffi::CStr",
|
|
224
|
+
"std::ffi::OsString",
|
|
225
|
+
"std::ffi::OsStr",
|
|
226
|
+
"std::ffi::NulError",
|
|
227
|
+
# Signal handling
|
|
228
|
+
"std::signal",
|
|
229
|
+
"libc::signal",
|
|
230
|
+
# Other dangerous patterns
|
|
231
|
+
"std::panic",
|
|
232
|
+
"std::panic::panic",
|
|
233
|
+
"std::panic::panic_any",
|
|
234
|
+
"std::panic::set_hook",
|
|
235
|
+
"std::panic::take_hook",
|
|
236
|
+
"std::panic::catch_unwind",
|
|
237
|
+
"std::panic::resume_unwind",
|
|
238
|
+
"std::panic::AssertUnwindSafe",
|
|
239
|
+
# Compile-time code execution
|
|
240
|
+
"include!",
|
|
241
|
+
"include_str!",
|
|
242
|
+
"include_bytes!",
|
|
243
|
+
"env!",
|
|
244
|
+
"option_env!",
|
|
245
|
+
"concat!",
|
|
246
|
+
"file!",
|
|
247
|
+
"line!",
|
|
248
|
+
"column!",
|
|
249
|
+
"module_path!",
|
|
250
|
+
# Assembly
|
|
251
|
+
"asm!",
|
|
252
|
+
"global_asm!",
|
|
253
|
+
# FFI/Linking
|
|
254
|
+
"#[link",
|
|
255
|
+
"#[no_mangle]",
|
|
256
|
+
"#[export_name",
|
|
257
|
+
"build.rs",
|
|
258
|
+
# Proc macros
|
|
259
|
+
"proc_macro",
|
|
260
|
+
"#[derive(",
|
|
261
|
+
# Additional dangerous patterns
|
|
262
|
+
"std::intrinsics",
|
|
263
|
+
"core::intrinsics",
|
|
264
|
+
]
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _normalize_unicode(text: str) -> str:
|
|
268
|
+
"""Normalize Unicode to ASCII to prevent homoglyph attacks."""
|
|
269
|
+
|
|
270
|
+
return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _sanitize_rust_completion(completion: str) -> str | None:
|
|
274
|
+
"""Check for disallowed patterns with Unicode normalization."""
|
|
275
|
+
|
|
276
|
+
normalized = _normalize_unicode(completion.lower())
|
|
277
|
+
|
|
278
|
+
for pattern in DISALLOWED_COMPLETION_PATTERNS:
|
|
279
|
+
if pattern.lower() in normalized:
|
|
280
|
+
return f"disallowed usage of {pattern}"
|
|
281
|
+
|
|
282
|
+
if re.search(
|
|
283
|
+
r"r#*\".*?(unsafe|std::fs|std::process).*?\"#*",
|
|
284
|
+
completion,
|
|
285
|
+
re.IGNORECASE | re.DOTALL,
|
|
286
|
+
):
|
|
287
|
+
return "disallowed pattern in raw string"
|
|
288
|
+
|
|
289
|
+
return None
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
MAX_COMPLETION_LENGTH = 100_000
|
|
293
|
+
MAX_COMPLETION_LINES = 5_000
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _validate_completion(completion: str) -> str | None:
|
|
297
|
+
"""Validate completion content. Returns error message or None."""
|
|
298
|
+
|
|
299
|
+
if not completion:
|
|
300
|
+
return "empty completion"
|
|
301
|
+
|
|
302
|
+
if len(completion) > MAX_COMPLETION_LENGTH:
|
|
303
|
+
return f"completion too long ({len(completion)} > {MAX_COMPLETION_LENGTH})"
|
|
304
|
+
|
|
305
|
+
if completion.count("\n") > MAX_COMPLETION_LINES:
|
|
306
|
+
return f"too many lines (> {MAX_COMPLETION_LINES})"
|
|
307
|
+
|
|
308
|
+
if "\x00" in completion:
|
|
309
|
+
return "null byte in completion"
|
|
310
|
+
|
|
311
|
+
try:
|
|
312
|
+
completion.encode("utf-8")
|
|
313
|
+
except UnicodeEncodeError:
|
|
314
|
+
return "invalid UTF-8 encoding"
|
|
315
|
+
|
|
316
|
+
return None
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def _extract_function_body(completion: str, entry_point: str) -> str:
|
|
320
|
+
"""
|
|
321
|
+
Extract the function body from a completion, removing extra code like main() functions.
|
|
322
|
+
|
|
323
|
+
Args:
|
|
324
|
+
completion: Raw completion text from the model
|
|
325
|
+
entry_point: Name of the function we're looking for (e.g., "has_close_elements")
|
|
326
|
+
|
|
327
|
+
Returns:
|
|
328
|
+
Cleaned completion with only the target function body
|
|
329
|
+
"""
|
|
330
|
+
import re
|
|
331
|
+
|
|
332
|
+
# Step 1: Remove markdown code blocks
|
|
333
|
+
if "```rust" in completion:
|
|
334
|
+
# Extract content between ```rust and ```
|
|
335
|
+
rust_match = re.search(r"```rust\s*(.*?)\s*```", completion, re.DOTALL)
|
|
336
|
+
if rust_match:
|
|
337
|
+
completion = rust_match.group(1)
|
|
338
|
+
elif "```" in completion:
|
|
339
|
+
# Generic code block
|
|
340
|
+
code_match = re.search(r"```[^\n]*\s*(.*?)\s*```", completion, re.DOTALL)
|
|
341
|
+
if code_match:
|
|
342
|
+
completion = code_match.group(1)
|
|
343
|
+
|
|
344
|
+
completion = completion.strip()
|
|
345
|
+
|
|
346
|
+
# Step 2: Try to find the function that matches entry_point
|
|
347
|
+
# Pattern: fn entry_point(...) -> ... { ... }
|
|
348
|
+
# Note: Construct pattern carefully to avoid f-string bracket issues
|
|
349
|
+
# The pattern matches: fn entry_point(...) -> [anything except {] { ... }
|
|
350
|
+
not_brace_pattern = r"[^{]" # Match any character except opening brace
|
|
351
|
+
fn_pattern = rf"fn\s+{re.escape(entry_point)}\s*\([^)]*\)\s*(?:->{not_brace_pattern}*)?\s*\{{"
|
|
352
|
+
fn_match = re.search(fn_pattern, completion, re.MULTILINE | re.DOTALL)
|
|
353
|
+
|
|
354
|
+
if fn_match:
|
|
355
|
+
# Found the target function, extract from the opening brace
|
|
356
|
+
start_pos = fn_match.end() - 1 # Position of opening brace
|
|
357
|
+
brace_count = 0
|
|
358
|
+
end_pos = start_pos
|
|
359
|
+
|
|
360
|
+
# Find matching closing brace
|
|
361
|
+
for i in range(start_pos, len(completion)):
|
|
362
|
+
if completion[i] == "{":
|
|
363
|
+
brace_count += 1
|
|
364
|
+
elif completion[i] == "}":
|
|
365
|
+
brace_count -= 1
|
|
366
|
+
if brace_count == 0:
|
|
367
|
+
end_pos = i + 1
|
|
368
|
+
break
|
|
369
|
+
|
|
370
|
+
if brace_count == 0:
|
|
371
|
+
# Extract just the function body (without the function signature)
|
|
372
|
+
# The prompt already has the signature, we just need the body
|
|
373
|
+
function_body = completion[start_pos + 1 : end_pos - 1].strip()
|
|
374
|
+
return function_body
|
|
375
|
+
|
|
376
|
+
# Step 2b: If completion is just the function body (no signature), check if it starts with {
|
|
377
|
+
# This handles cases where the model generates just the body
|
|
378
|
+
if completion.strip().startswith("{"):
|
|
379
|
+
# Extract content between first { and matching }
|
|
380
|
+
brace_count = 0
|
|
381
|
+
start_pos = 0
|
|
382
|
+
end_pos = len(completion)
|
|
383
|
+
|
|
384
|
+
for i, char in enumerate(completion):
|
|
385
|
+
if char == "{":
|
|
386
|
+
if brace_count == 0:
|
|
387
|
+
start_pos = i + 1
|
|
388
|
+
brace_count += 1
|
|
389
|
+
elif char == "}":
|
|
390
|
+
brace_count -= 1
|
|
391
|
+
if brace_count == 0:
|
|
392
|
+
end_pos = i
|
|
393
|
+
return completion[start_pos:end_pos].strip()
|
|
394
|
+
|
|
395
|
+
# If we didn't find a matching brace, return everything after the first {
|
|
396
|
+
if brace_count > 0:
|
|
397
|
+
return completion[start_pos:].strip()
|
|
398
|
+
|
|
399
|
+
# Step 3: If we didn't find the target function, try to extract any function body
|
|
400
|
+
# and remove main() functions
|
|
401
|
+
lines = completion.split("\n")
|
|
402
|
+
cleaned_lines = []
|
|
403
|
+
in_main = False
|
|
404
|
+
brace_count = 0
|
|
405
|
+
|
|
406
|
+
i = 0
|
|
407
|
+
while i < len(lines):
|
|
408
|
+
line = lines[i]
|
|
409
|
+
stripped = line.strip()
|
|
410
|
+
|
|
411
|
+
# Skip standalone main() functions
|
|
412
|
+
if re.match(r"^fn\s+main\s*\([^)]*\)\s*(?:->[^{]*)?\s*\{", stripped):
|
|
413
|
+
in_main = True
|
|
414
|
+
brace_count = 1
|
|
415
|
+
# Skip until matching closing brace
|
|
416
|
+
i += 1
|
|
417
|
+
while i < len(lines) and brace_count > 0:
|
|
418
|
+
line = lines[i]
|
|
419
|
+
brace_count += line.count("{") - line.count("}")
|
|
420
|
+
i += 1
|
|
421
|
+
continue
|
|
422
|
+
|
|
423
|
+
# Skip lines that are part of a main() function we're skipping
|
|
424
|
+
if in_main:
|
|
425
|
+
brace_count += line.count("{") - line.count("}")
|
|
426
|
+
if brace_count <= 0:
|
|
427
|
+
in_main = False
|
|
428
|
+
i += 1
|
|
429
|
+
continue
|
|
430
|
+
|
|
431
|
+
# Keep other lines
|
|
432
|
+
cleaned_lines.append(line)
|
|
433
|
+
i += 1
|
|
434
|
+
|
|
435
|
+
result = "\n".join(cleaned_lines).strip()
|
|
436
|
+
|
|
437
|
+
# Step 4: Remove common extra patterns
|
|
438
|
+
# Remove "Example usage:" or "// Example usage:" blocks
|
|
439
|
+
result = re.sub(
|
|
440
|
+
r"(?i)(//\s*)?(example\s+usage|usage\s+example):.*", "", result, flags=re.DOTALL
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
# Remove standalone use statements that aren't needed (keep them if they're at the top)
|
|
444
|
+
# This is tricky, so we'll be conservative and only remove obviously wrong ones
|
|
445
|
+
result = re.sub(
|
|
446
|
+
r"^use\s+std::collections::Vec;?\s*$", "", result, flags=re.MULTILINE
|
|
447
|
+
) # Vec is in std::vec, not collections
|
|
448
|
+
|
|
449
|
+
return result.strip()
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def _check_rustc_available(sandbox_mode: str | None = None) -> tuple[bool, str | None]:
|
|
453
|
+
"""
|
|
454
|
+
Preflight check for rustc availability.
|
|
455
|
+
Returns (available, error_message).
|
|
456
|
+
"""
|
|
457
|
+
try:
|
|
458
|
+
# Check local rustc (for firejail, none, or any mode - firejail uses host rustc)
|
|
459
|
+
result = subprocess.run(
|
|
460
|
+
["rustc", "--version"],
|
|
461
|
+
capture_output=True,
|
|
462
|
+
text=True,
|
|
463
|
+
timeout=5.0,
|
|
464
|
+
)
|
|
465
|
+
if result.returncode == 0:
|
|
466
|
+
return True, None
|
|
467
|
+
return False, "rustc --version failed"
|
|
468
|
+
except FileNotFoundError:
|
|
469
|
+
return False, "rustc not found in PATH"
|
|
470
|
+
except subprocess.TimeoutExpired:
|
|
471
|
+
return False, "rustc version check timed out"
|
|
472
|
+
except Exception as e:
|
|
473
|
+
return False, f"rustc check error: {e}"
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def check_main_free(completion: str) -> bool:
    """Return True when ``completion`` has no ``fn main(``, False when it does.

    The search is a case-insensitive regular-expression scan over the whole
    text and allows any whitespace between ``fn``, ``main`` and ``(``.
    """
    found = re.search(r"fn\s+main\s*\(", completion, flags=re.IGNORECASE)
    return found is None
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def _run_clippy_check(source_path: str, timeout: float) -> tuple[bool, str]:
    """Run ``cargo clippy -- -D warnings`` in the directory of ``source_path``.

    Args:
        source_path: Path of the Rust source file; only its directory is used.
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns:
        ``(passed, stderr)`` where ``passed`` is True when the command exits
        with status 0 and ``stderr`` is its captured standard error.
    """
    # NOTE(review): cargo normally needs a Cargo.toml in the working directory
    # or one of its parents. The caller in this module only writes
    # solution.rs there - confirm this check can succeed as intended.
    workdir = os.path.dirname(source_path)
    clippy_cmd = ["cargo", "clippy", "--", "-D", "warnings"]
    completed = subprocess.run(
        clippy_cmd,
        cwd=workdir,
        timeout=timeout,
        text=True,
        capture_output=True,
    )
    passed = completed.returncode == 0
    return passed, completed.stderr
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
class ReliabilityContext:
|
|
499
|
+
"""Context manager that provides isolated reliability guards."""
|
|
500
|
+
|
|
501
|
+
def __init__(self, maximum_memory_bytes: int | None = None):
|
|
502
|
+
self.maximum_memory_bytes = maximum_memory_bytes
|
|
503
|
+
self._original_functions: dict[str, object] = {}
|
|
504
|
+
|
|
505
|
+
def __enter__(self):
|
|
506
|
+
# Store originals
|
|
507
|
+
self._original_functions = {
|
|
508
|
+
"rmtree": shutil.rmtree,
|
|
509
|
+
"rmdir": os.rmdir,
|
|
510
|
+
"chdir": os.chdir,
|
|
511
|
+
"Popen": subprocess.Popen,
|
|
512
|
+
}
|
|
513
|
+
reliability_guard(self.maximum_memory_bytes)
|
|
514
|
+
return self
|
|
515
|
+
|
|
516
|
+
def __exit__(self, *args):
|
|
517
|
+
shutil.rmtree = self._original_functions["rmtree"]
|
|
518
|
+
os.rmdir = self._original_functions["rmdir"]
|
|
519
|
+
os.chdir = self._original_functions["chdir"]
|
|
520
|
+
subprocess.Popen = self._original_functions["Popen"]
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
# Flags passed to every rustc invocation so that all completions are built
# the same way: 2021 edition, test-harness build, no optimization, no debug
# info.
#
# There is deliberately no `-C incremental=...` entry. That option takes a
# directory path and *enables* incremental compilation, so the former
# `-C incremental=false` stored a compilation cache in a directory named
# "false". A direct rustc invocation does not use incremental compilation
# unless the option is given.
DETERMINISTIC_RUSTC_FLAGS = [
    "--edition=2021",
    "--test",
    "-C",
    "opt-level=0",
    "-C",
    "debuginfo=0",
]
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def _rust_unsafe_execute(
    problem: dict,
    completion: str,
    timeout: float,
    result,
    sandbox_mode: str | None = None,
    enforce_policy: bool = True,
):
    """
    Compile and test one Rust completion, appending a result dict to ``result``.

    Nothing is returned; exactly one dict is appended to ``result`` on every
    path handled here.

    Args:
        problem: Problem record; the keys "prompt" and "test" are read, and
            "entry_point" is read with a default of "".
        completion: Raw completion text.
        timeout: Seconds passed to ``time_limit`` and to each subprocess call.
        result: List-like object that receives the result dict via ``append``.
        sandbox_mode: Forwarded to the sandbox helpers; the value "none"
            disables sandboxing.
        enforce_policy: When true, the cleaned completion is checked with
            ``_sanitize_rust_completion`` before compiling.

    Result dict structure:
    {
        "compile_ok": bool | None,
        "test_ok": bool | None,
        "clippy_ok": bool | None,
        "compile_time_ms": int | None,
        "binary_size_bytes": int | None,
        "error_type": str | None,  # "infra_missing_toolchain" | "compile_error" | "runtime_error" | "assertion_failure"
        "stderr": str,
        "passed": bool,
        "main_free": bool,
        "result": str,  # Legacy field for compatibility
    }
    """
    with create_tempdir() as temp_dir, ReliabilityContext():
        # Start from an "nothing happened yet" record; fields are filled in
        # as each stage runs.
        result_dict = {
            "compile_ok": None,
            "test_ok": None,
            "clippy_ok": None,
            "compile_time_ms": None,
            "binary_size_bytes": None,
            "error_type": None,
            "stderr": "",
            "passed": False,
            "main_free": check_main_free(completion),
            "result": "",
        }

        # Stage 0: toolchain preflight.
        rustc_available, rustc_error = _check_rustc_available(sandbox_mode)
        if not rustc_available:
            result_dict["error_type"] = "infra_missing_toolchain"
            result_dict["stderr"] = rustc_error or "rustc not available"
            result_dict["result"] = f"failed: {result_dict['stderr']}"
            result.append(result_dict)
            return

        # Stage 1: size/encoding checks on the raw completion. A failure is
        # reported under the "compile_error" type with a "filtered:" result.
        validation_error = _validate_completion(completion)
        if validation_error:
            result_dict["error_type"] = "compile_error"
            result_dict["stderr"] = validation_error
            result_dict["result"] = f"filtered: {validation_error}"
            result.append(result_dict)
            return

        # Stage 2: reduce the completion to the code that is written after
        # the prompt.
        entry_point = problem.get("entry_point", "")
        cleaned_completion = _extract_function_body(completion, entry_point)

        # Stage 3: optional pattern policy, applied to the cleaned text (not
        # to the raw completion).
        if enforce_policy:
            violation = _sanitize_rust_completion(cleaned_completion)
            if violation:
                result_dict["error_type"] = "compile_error"
                result_dict["stderr"] = violation
                result_dict["result"] = f"failed: {violation}"
                result.append(result_dict)
                return

        source_path = os.path.join(temp_dir, "solution.rs")
        test_binary = os.path.join(temp_dir, "solution_test")

        # The compiled file is: prompt + cleaned completion + blank line + tests.
        with open(source_path, "w", encoding="utf-8") as source_file:
            source_file.write(problem["prompt"])
            source_file.write(cleaned_completion)
            source_file.write("\n\n")
            source_file.write(problem["test"])
            source_file.write("\n")

        compile_args = DETERMINISTIC_RUSTC_FLAGS.copy()

        effective_mode = sandbox_mode
        # Sandboxing is used whenever the sandbox module imported and the
        # caller did not explicitly ask for "none" (so None also sandboxes).
        use_sandbox = SANDBOX_AVAILABLE and effective_mode != "none"

        timed_out = None
        try:
            # NOTE(review): time_limit is defined in execution.py; from its use
            # here it presumably yields an event-like object with is_set() -
            # confirm against that module.
            with time_limit(timeout) as timed_out_event:
                timed_out = timed_out_event
                start_time = time.perf_counter()
                # Stage 4: compile.
                if use_sandbox:
                    try:
                        compile_result = run_rustc_sandboxed(
                            source_path,
                            test_binary,
                            compile_args,
                            timeout=timeout,
                            capture_output=True,
                            sandbox_mode=effective_mode,
                        )
                    except SandboxError as e:
                        result_dict["error_type"] = "infra_missing_toolchain"
                        result_dict["stderr"] = str(e)
                        result_dict["result"] = f"failed: sandbox error: {e}"
                        result.append(result_dict)
                        return
                else:
                    compile_result = subprocess.run(
                        ["rustc"] + compile_args + [source_path, "-o", test_binary],
                        capture_output=True,
                        text=True,
                        timeout=timeout,
                    )

                result_dict["compile_time_ms"] = int(
                    (time.perf_counter() - start_time) * 1000
                )
                result_dict["compile_ok"] = compile_result.returncode == 0
                if compile_result.returncode != 0:
                    # Prefer stderr; fall back to stdout, then a fixed message.
                    failure = (
                        compile_result.stderr.strip() or compile_result.stdout.strip()
                    )
                    result_dict["error_type"] = "compile_error"
                    result_dict["stderr"] = failure or "compile error"
                    result_dict["result"] = f"failed: {result_dict['stderr']}"
                    result.append(result_dict)
                    return

                if os.path.exists(test_binary):
                    result_dict["binary_size_bytes"] = os.path.getsize(test_binary)

                # Stage 5: optional lint run, only when cargo is on PATH. A
                # lint failure or error is recorded but does not stop the run;
                # it overwrites "stderr", which a later test failure
                # overwrites again.
                if shutil.which("cargo"):
                    try:
                        clippy_ok, clippy_stderr = _run_clippy_check(
                            source_path, timeout
                        )
                        result_dict["clippy_ok"] = clippy_ok
                        if not clippy_ok:
                            result_dict["stderr"] = clippy_stderr
                    except Exception as exc:  # noqa: BLE001
                        result_dict["clippy_ok"] = False
                        result_dict["stderr"] = str(exc)

                # Stage 6: run the compiled test binary.
                if use_sandbox:
                    try:
                        test_result = run_binary_sandboxed(
                            test_binary,
                            timeout=timeout,
                            capture_output=True,
                            sandbox_mode=effective_mode,
                        )
                    except SandboxError as e:
                        result_dict["error_type"] = "runtime_error"
                        result_dict["stderr"] = str(e)
                        result_dict["result"] = f"failed: sandbox error: {e}"
                        result.append(result_dict)
                        return
                else:
                    test_result = subprocess.run(
                        [test_binary],
                        capture_output=True,
                        text=True,
                        timeout=timeout,
                    )

                # If the overall time limit was flagged while the stages above
                # ran, report a timeout instead of the test outcome.
                if timed_out and timed_out.is_set():
                    raise TimeoutException("Timed out!")

                result_dict["test_ok"] = test_result.returncode == 0
                if test_result.returncode == 0:
                    result_dict["passed"] = True
                    result_dict["result"] = "passed"
                else:
                    # Every non-zero exit of the test binary is reported as
                    # "assertion_failure".
                    failure = test_result.stderr.strip() or test_result.stdout.strip()
                    result_dict["error_type"] = "assertion_failure"
                    result_dict["stderr"] = failure or "tests failed"
                    result_dict["result"] = f"failed: {result_dict['stderr']}"

        except (TimeoutException, subprocess.TimeoutExpired):
            result_dict["error_type"] = "runtime_error"
            result_dict["stderr"] = "timeout"
            result_dict["result"] = "timed out"
        except BaseException as exc:  # noqa: BLE001
            # Deliberately broad: any other failure is turned into a
            # "runtime_error" result rather than propagating.
            result_dict["error_type"] = "runtime_error"
            result_dict["stderr"] = str(exc)
            result_dict["result"] = f"failed: {exc}"

        result.append(result_dict)
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
def rust_check_correctness(
    problem: dict,
    completion: str,
    timeout: float,
    completion_id: int | None = None,
    sandbox_mode: str | None = None,
    enforce_policy: bool = True,
) -> dict:
    """
    Evaluate a Rust completion by compiling and running its tests.

    The work is done by ``_rust_unsafe_execute`` in a separate process that
    reports back through a manager-backed list. The process is given
    ``timeout + 1`` seconds and is killed if it is still alive after that.
    If it reported nothing, a "timed out" result is substituted.

    Args:
        problem: Problem dictionary with prompt, test, etc.
        completion: Generated code completion
        timeout: Timeout in seconds
        completion_id: Optional completion ID for tracking
        sandbox_mode: Optional sandbox mode ("firejail", "none", or None for auto-detect)
        enforce_policy: Whether to enforce pattern-based policy filtering (default: True).
            Set to False for pure HumanEval compatibility without security filtering.

    Returns:
        Dictionary with enhanced schema:
        {
            "task_id": str,
            "completion": str,
            "completion_id": int | None,
            "compile_ok": bool | None,
            "test_ok": bool | None,
            "clippy_ok": bool | None,
            "compile_time_ms": int | None,
            "binary_size_bytes": int | None,
            "error_type": str | None,
            "stderr": str,
            "passed": bool,
            "main_free": bool,
            "result": str,  # Legacy field
        }
    """
    # Leaving the with-block shuts the manager down, also on error.
    with multiprocessing.Manager() as manager:
        outcomes = manager.list()

        worker = multiprocessing.Process(
            target=_rust_unsafe_execute,
            args=(problem, completion, timeout, outcomes, sandbox_mode, enforce_policy),
        )
        worker.start()
        worker.join(timeout=timeout + 1)
        if worker.is_alive():
            worker.kill()
            worker.join()

        # The worker reported nothing: substitute a timeout record.
        if not outcomes:
            outcomes.append(
                {
                    "compile_ok": None,
                    "test_ok": None,
                    "error_type": "runtime_error",
                    "stderr": "process timeout",
                    "passed": False,
                    "main_free": check_main_free(completion),
                    "result": "timed out",
                }
            )

        first = outcomes[0]
        if isinstance(first, dict):
            report = first
        else:
            # Legacy shape: a bare result string instead of a dict.
            report = {"result": first, "passed": first == "passed"}

        return {
            "task_id": problem["task_id"],
            "completion": completion,
            "completion_id": completion_id,
            "compile_ok": report.get("compile_ok"),
            "test_ok": report.get("test_ok"),
            "clippy_ok": report.get("clippy_ok"),
            "compile_time_ms": report.get("compile_time_ms"),
            "binary_size_bytes": report.get("binary_size_bytes"),
            "error_type": report.get("error_type"),
            "stderr": report.get("stderr", ""),
            "passed": report.get("passed", False),
            "main_free": report.get("main_free", check_main_free(completion)),
            "result": report.get("result", ""),
        }
|