human-eval-rust 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,802 @@
1
+ """
2
+ Rust-specific execution module for HumanEval evaluation.
3
+
4
+ Handles compilation and test execution of Rust code completions with sandboxing support.
5
+
6
+ Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
7
+ Version: 2.1.0
8
+ """
9
+
10
+ import multiprocessing
11
+ import os
12
+ import re
13
+ import shutil
14
+ import subprocess
15
+ import time
16
+ import unicodedata
17
+
18
+ # Use relative import to avoid circular dependency with execution.py
19
+ from .execution import TimeoutException, create_tempdir, reliability_guard, time_limit
20
+
21
+ # Try to import sandbox module (optional)
22
# The sandbox module is optional: when it cannot be imported, the names it
# would have provided are replaced by placeholders so the rest of this module
# can reference them unconditionally.
try:
    from .sandbox import SandboxError, run_binary_sandboxed, run_rustc_sandboxed

    SANDBOX_AVAILABLE = True
except ImportError:
    SANDBOX_AVAILABLE = False
    # Alias so that `except SandboxError` clauses remain valid syntax/names.
    # Note that with this alias such a clause would match any Exception.
    SandboxError = Exception

    # Define stub functions to satisfy type checker
    # These will never be called because SANDBOX_AVAILABLE is False
    def run_rustc_sandboxed(
        source_file: str,
        output_binary: str,
        command_args: list[str],
        timeout: float = 30.0,
        capture_output: bool = True,
        sandbox_mode: str | None = None,
    ) -> subprocess.CompletedProcess[str]:
        """Placeholder for the sandboxed rustc runner; always raises RuntimeError."""
        raise RuntimeError("Sandbox not available")

    def run_binary_sandboxed(
        binary_path: str,
        timeout: float = 30.0,
        capture_output: bool = True,
        sandbox_mode: str | None = None,
    ) -> subprocess.CompletedProcess[str]:
        """Placeholder for the sandboxed binary runner; always raises RuntimeError."""
        raise RuntimeError("Sandbox not available")
49
+
50
+
51
# Substrings that cause a completion to be rejected by the policy filter.
# Entries are compared case-insensitively as plain substrings, and the first
# entry (in list order) found in the completion is the one reported, so the
# order below is significant. Duplicate entries have been removed; each
# pattern appears exactly once, at the position of its first occurrence.
#
# NOTE(review): because matching is by substring, a broad entry such as
# "std::process" already covers every more specific "std::process::..." entry,
# and short entries like "extern", "libc" or "hyper" also match inside longer
# identifiers. Confirm whether that breadth is intended.
DISALLOWED_COMPLETION_PATTERNS = [
    # Filesystem operations
    "std::fs",
    "std::path",
    "std::io::write",
    "std::io::read",
    "std::io::copy",
    "std::io::create",
    "std::io::remove",
    "std::io::rename",
    "std::io::metadata",
    "std::io::symlink",
    "std::io::hard_link",
    "std::io::canonicalize",
    "std::io::read_dir",
    "std::io::read_to_string",
    "std::io::read_to_end",
    "std::io::read_exact",
    "std::io::write_all",
    "std::io::write_fmt",
    "std::io::flush",
    "std::io::seek",
    "std::io::set_permissions",
    "std::io::remove_file",
    "std::io::remove_dir",
    "std::io::remove_dir_all",
    "std::io::create_dir",
    "std::io::create_dir_all",
    "std::io::symlink_metadata",
    "std::io::read_link",
    "std::io::File::create",
    "std::io::File::open",
    "std::io::File::create_new",
    "std::io::File::read",
    "std::io::File::write",
    # Process and system operations
    "std::process",
    "std::process::Command",
    "std::process::Command::new",
    "std::process::Command::spawn",
    "std::process::Command::output",
    "std::process::Command::status",
    "std::process::exit",
    "std::process::abort",
    "std::process::id",
    "std::process::parent_id",
    "command::new",
    "command::spawn",
    "command::output",
    # Network operations
    "std::net",
    "std::net::TcpStream",
    "std::net::TcpListener",
    "std::net::UdpSocket",
    "std::net::UnixStream",
    "std::net::UnixListener",
    "std::net::SocketAddr",
    "std::net::IpAddr",
    "std::net::Ipv4Addr",
    "std::net::Ipv6Addr",
    "std::net::ToSocketAddrs",
    "std::net::lookup_host",
    "reqwest",
    "ureq",
    "hyper",
    "tokio::net",
    "tokio::net::TcpStream",
    "tokio::net::TcpListener",
    "tokio::net::UdpSocket",
    "tokio::net::UnixStream",
    "tokio::net::UnixListener",
    # Threading and concurrency
    "std::thread",
    "std::thread::spawn",
    "std::thread::Builder",
    "std::thread::Thread",
    "std::thread::park",
    "std::thread::yield_now",
    "std::thread::sleep",
    "std::thread::available_parallelism",
    "std::sync::mpsc",
    "std::sync::mpsc::channel",
    "std::sync::mpsc::sync_channel",
    "std::sync::Arc",
    "std::sync::Mutex",
    "std::sync::RwLock",
    "std::sync::Condvar",
    "std::sync::Barrier",
    "std::sync::Once",
    "std::sync::atomic",
    "tokio::spawn",
    "tokio::task",
    "tokio::runtime",
    # Unsafe code
    "unsafe",
    "unsafe fn",
    "unsafe trait",
    "unsafe impl",
    "unsafe block",
    "unsafe {}",
    # Memory operations
    "std::alloc",
    "std::alloc::alloc",
    "std::alloc::dealloc",
    "std::alloc::realloc",
    "std::alloc::Layout",
    "std::ptr",
    "std::ptr::null",
    "std::ptr::null_mut",
    "std::ptr::read",
    "std::ptr::write",
    "std::ptr::copy",
    "std::ptr::copy_nonoverlapping",
    "std::ptr::swap",
    "std::ptr::replace",
    "std::ptr::drop_in_place",
    "std::mem",
    "std::mem::forget",
    "std::mem::transmute",
    "std::mem::zeroed",
    "std::mem::uninitialized",
    "std::mem::replace",
    "std::mem::swap",
    "std::mem::take",
    "std::mem::size_of",
    "std::mem::align_of",
    "std::mem::size_of_val",
    "std::mem::align_of_val",
    "std::mem::needs_drop",
    "std::mem::drop",
    "std::mem::MaybeUninit",
    # Environment and system
    "std::env",
    "std::env::var",
    "std::env::vars",
    "std::env::set_var",
    "std::env::remove_var",
    "std::env::current_dir",
    "std::env::set_current_dir",
    "std::env::args",
    "std::env::args_os",
    "std::env::consts",
    "std::env::home_dir",
    "std::env::temp_dir",
    # Time and system calls
    "std::time::SystemTime",
    "std::time::UNIX_EPOCH",
    "std::time::Duration",
    "std::time::Instant",
    # External process execution
    "std::os",
    "std::os::unix",
    "std::os::windows",
    "std::os::linux",
    "std::os::macos",
    # FFI (Foreign Function Interface)
    "extern",
    'extern "C"',
    'extern "system"',
    "libc",
    "winapi",
    # Dynamic loading
    "std::ffi",
    "std::ffi::CString",
    "std::ffi::CStr",
    "std::ffi::OsString",
    "std::ffi::OsStr",
    "std::ffi::NulError",
    # Signal handling
    "std::signal",
    "libc::signal",
    # Other dangerous patterns
    "std::panic",
    "std::panic::panic",
    "std::panic::panic_any",
    "std::panic::set_hook",
    "std::panic::take_hook",
    "std::panic::catch_unwind",
    "std::panic::resume_unwind",
    "std::panic::AssertUnwindSafe",
    # Compile-time code execution
    "include!",
    "include_str!",
    "include_bytes!",
    "env!",
    "option_env!",
    "concat!",
    "file!",
    "line!",
    "column!",
    "module_path!",
    # Assembly
    "asm!",
    "global_asm!",
    # FFI/Linking
    "#[link",
    "#[no_mangle]",
    "#[export_name",
    "build.rs",
    # Proc macros
    "proc_macro",
    "#[derive(",
    # Additional dangerous patterns
    "std::intrinsics",
    "core::intrinsics",
]
265
+
266
+
267
+ def _normalize_unicode(text: str) -> str:
268
+ """Normalize Unicode to ASCII to prevent homoglyph attacks."""
269
+
270
+ return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
271
+
272
+
273
def _sanitize_rust_completion(completion: str) -> str | None:
    """Scan a completion for disallowed patterns.

    Args:
        completion: Rust source text to inspect.

    Returns:
        A short description of the first violation found, or ``None`` when
        the completion contains none of the disallowed patterns.
    """
    # Lowercase both before and after ASCII folding. Lowercasing first keeps
    # the original behaviour; lowercasing again afterwards also catches
    # characters whose compatibility decomposition is an *uppercase* ASCII
    # letter (e.g. mathematical bold capitals), which have no lowercase
    # mapping of their own and would otherwise escape the comparison.
    normalized = _normalize_unicode(completion.lower()).lower()

    for pattern in DISALLOWED_COMPLETION_PATTERNS:
        if pattern.lower() in normalized:
            return f"disallowed usage of {pattern}"

    # Additional check on the raw (un-normalised) text for selected patterns
    # appearing between a raw-string opener and closer.
    if re.search(
        r"r#*\".*?(unsafe|std::fs|std::process).*?\"#*",
        completion,
        re.IGNORECASE | re.DOTALL,
    ):
        return "disallowed pattern in raw string"

    return None
290
+
291
+
292
+ MAX_COMPLETION_LENGTH = 100_000
293
+ MAX_COMPLETION_LINES = 5_000
294
+
295
+
296
+ def _validate_completion(completion: str) -> str | None:
297
+ """Validate completion content. Returns error message or None."""
298
+
299
+ if not completion:
300
+ return "empty completion"
301
+
302
+ if len(completion) > MAX_COMPLETION_LENGTH:
303
+ return f"completion too long ({len(completion)} > {MAX_COMPLETION_LENGTH})"
304
+
305
+ if completion.count("\n") > MAX_COMPLETION_LINES:
306
+ return f"too many lines (> {MAX_COMPLETION_LINES})"
307
+
308
+ if "\x00" in completion:
309
+ return "null byte in completion"
310
+
311
+ try:
312
+ completion.encode("utf-8")
313
+ except UnicodeEncodeError:
314
+ return "invalid UTF-8 encoding"
315
+
316
+ return None
317
+
318
+
319
+ def _extract_function_body(completion: str, entry_point: str) -> str:
320
+ """
321
+ Extract the function body from a completion, removing extra code like main() functions.
322
+
323
+ Args:
324
+ completion: Raw completion text from the model
325
+ entry_point: Name of the function we're looking for (e.g., "has_close_elements")
326
+
327
+ Returns:
328
+ Cleaned completion with only the target function body
329
+ """
330
+ import re
331
+
332
+ # Step 1: Remove markdown code blocks
333
+ if "```rust" in completion:
334
+ # Extract content between ```rust and ```
335
+ rust_match = re.search(r"```rust\s*(.*?)\s*```", completion, re.DOTALL)
336
+ if rust_match:
337
+ completion = rust_match.group(1)
338
+ elif "```" in completion:
339
+ # Generic code block
340
+ code_match = re.search(r"```[^\n]*\s*(.*?)\s*```", completion, re.DOTALL)
341
+ if code_match:
342
+ completion = code_match.group(1)
343
+
344
+ completion = completion.strip()
345
+
346
+ # Step 2: Try to find the function that matches entry_point
347
+ # Pattern: fn entry_point(...) -> ... { ... }
348
+ # Note: Construct pattern carefully to avoid f-string bracket issues
349
+ # The pattern matches: fn entry_point(...) -> [anything except {] { ... }
350
+ not_brace_pattern = r"[^{]" # Match any character except opening brace
351
+ fn_pattern = rf"fn\s+{re.escape(entry_point)}\s*\([^)]*\)\s*(?:->{not_brace_pattern}*)?\s*\{{"
352
+ fn_match = re.search(fn_pattern, completion, re.MULTILINE | re.DOTALL)
353
+
354
+ if fn_match:
355
+ # Found the target function, extract from the opening brace
356
+ start_pos = fn_match.end() - 1 # Position of opening brace
357
+ brace_count = 0
358
+ end_pos = start_pos
359
+
360
+ # Find matching closing brace
361
+ for i in range(start_pos, len(completion)):
362
+ if completion[i] == "{":
363
+ brace_count += 1
364
+ elif completion[i] == "}":
365
+ brace_count -= 1
366
+ if brace_count == 0:
367
+ end_pos = i + 1
368
+ break
369
+
370
+ if brace_count == 0:
371
+ # Extract just the function body (without the function signature)
372
+ # The prompt already has the signature, we just need the body
373
+ function_body = completion[start_pos + 1 : end_pos - 1].strip()
374
+ return function_body
375
+
376
+ # Step 2b: If completion is just the function body (no signature), check if it starts with {
377
+ # This handles cases where the model generates just the body
378
+ if completion.strip().startswith("{"):
379
+ # Extract content between first { and matching }
380
+ brace_count = 0
381
+ start_pos = 0
382
+ end_pos = len(completion)
383
+
384
+ for i, char in enumerate(completion):
385
+ if char == "{":
386
+ if brace_count == 0:
387
+ start_pos = i + 1
388
+ brace_count += 1
389
+ elif char == "}":
390
+ brace_count -= 1
391
+ if brace_count == 0:
392
+ end_pos = i
393
+ return completion[start_pos:end_pos].strip()
394
+
395
+ # If we didn't find a matching brace, return everything after the first {
396
+ if brace_count > 0:
397
+ return completion[start_pos:].strip()
398
+
399
+ # Step 3: If we didn't find the target function, try to extract any function body
400
+ # and remove main() functions
401
+ lines = completion.split("\n")
402
+ cleaned_lines = []
403
+ in_main = False
404
+ brace_count = 0
405
+
406
+ i = 0
407
+ while i < len(lines):
408
+ line = lines[i]
409
+ stripped = line.strip()
410
+
411
+ # Skip standalone main() functions
412
+ if re.match(r"^fn\s+main\s*\([^)]*\)\s*(?:->[^{]*)?\s*\{", stripped):
413
+ in_main = True
414
+ brace_count = 1
415
+ # Skip until matching closing brace
416
+ i += 1
417
+ while i < len(lines) and brace_count > 0:
418
+ line = lines[i]
419
+ brace_count += line.count("{") - line.count("}")
420
+ i += 1
421
+ continue
422
+
423
+ # Skip lines that are part of a main() function we're skipping
424
+ if in_main:
425
+ brace_count += line.count("{") - line.count("}")
426
+ if brace_count <= 0:
427
+ in_main = False
428
+ i += 1
429
+ continue
430
+
431
+ # Keep other lines
432
+ cleaned_lines.append(line)
433
+ i += 1
434
+
435
+ result = "\n".join(cleaned_lines).strip()
436
+
437
+ # Step 4: Remove common extra patterns
438
+ # Remove "Example usage:" or "// Example usage:" blocks
439
+ result = re.sub(
440
+ r"(?i)(//\s*)?(example\s+usage|usage\s+example):.*", "", result, flags=re.DOTALL
441
+ )
442
+
443
+ # Remove standalone use statements that aren't needed (keep them if they're at the top)
444
+ # This is tricky, so we'll be conservative and only remove obviously wrong ones
445
+ result = re.sub(
446
+ r"^use\s+std::collections::Vec;?\s*$", "", result, flags=re.MULTILINE
447
+ ) # Vec is in std::vec, not collections
448
+
449
+ return result.strip()
450
+
451
+
452
+ def _check_rustc_available(sandbox_mode: str | None = None) -> tuple[bool, str | None]:
453
+ """
454
+ Preflight check for rustc availability.
455
+ Returns (available, error_message).
456
+ """
457
+ try:
458
+ # Check local rustc (for firejail, none, or any mode - firejail uses host rustc)
459
+ result = subprocess.run(
460
+ ["rustc", "--version"],
461
+ capture_output=True,
462
+ text=True,
463
+ timeout=5.0,
464
+ )
465
+ if result.returncode == 0:
466
+ return True, None
467
+ return False, "rustc --version failed"
468
+ except FileNotFoundError:
469
+ return False, "rustc not found in PATH"
470
+ except subprocess.TimeoutExpired:
471
+ return False, "rustc version check timed out"
472
+ except Exception as e:
473
+ return False, f"rustc check error: {e}"
474
+
475
+
476
def check_main_free(completion: str) -> bool:
    """Return True when the completion does NOT define ``fn main(``.

    The search is case-insensitive and tolerates arbitrary whitespace between
    ``fn``, ``main`` and the opening parenthesis.
    """
    found_main = re.search(r"fn\s+main\s*\(", completion, re.IGNORECASE)
    return found_main is None
483
+
484
+
485
def _run_clippy_check(source_path: str, timeout: float) -> tuple[bool, str]:
    """Run ``cargo clippy -- -D warnings`` next to ``source_path``.

    Args:
        source_path: Path of the Rust source file; only its directory is
            used, as the working directory for cargo.
        timeout: Maximum number of seconds to wait for cargo.

    Returns:
        ``(passed, stderr)`` where ``passed`` is True when cargo exits with
        status 0 and ``stderr`` is cargo's captured standard error.

    Raises:
        subprocess.TimeoutExpired: If cargo does not finish within ``timeout``.
    """
    # NOTE(review): cargo is run in the directory of the source file; in this
    # module that directory appears to hold only solution.rs and no Cargo
    # manifest. Confirm that cargo clippy can succeed in that layout.
    working_dir = os.path.dirname(source_path)
    clippy_command = ["cargo", "clippy", "--", "-D", "warnings"]
    completed = subprocess.run(
        clippy_command,
        capture_output=True,
        text=True,
        timeout=timeout,
        cwd=working_dir,
    )
    passed = completed.returncode == 0
    return passed, completed.stderr
496
+
497
+
498
+ class ReliabilityContext:
499
+ """Context manager that provides isolated reliability guards."""
500
+
501
+ def __init__(self, maximum_memory_bytes: int | None = None):
502
+ self.maximum_memory_bytes = maximum_memory_bytes
503
+ self._original_functions: dict[str, object] = {}
504
+
505
+ def __enter__(self):
506
+ # Store originals
507
+ self._original_functions = {
508
+ "rmtree": shutil.rmtree,
509
+ "rmdir": os.rmdir,
510
+ "chdir": os.chdir,
511
+ "Popen": subprocess.Popen,
512
+ }
513
+ reliability_guard(self.maximum_memory_bytes)
514
+ return self
515
+
516
+ def __exit__(self, *args):
517
+ shutil.rmtree = self._original_functions["rmtree"]
518
+ os.rmdir = self._original_functions["rmdir"]
519
+ os.chdir = self._original_functions["chdir"]
520
+ subprocess.Popen = self._original_functions["Popen"]
521
+
522
+
523
# Flags passed to every rustc invocation so builds are comparable across runs.
#
# `-C incremental` is deliberately absent: that option takes a directory path
# rather than a boolean, so the former `-C incremental=false` turned
# incremental compilation ON with a cache directory literally named "false".
# Leaving the option out keeps incremental compilation off, which is rustc's
# behaviour when invoked directly.
DETERMINISTIC_RUSTC_FLAGS = [
    "--edition=2021",
    "--test",  # build the test harness so the embedded tests run
    "-C",
    "opt-level=0",
    "-C",
    "debuginfo=0",
]
533
+
534
+
535
def _rust_unsafe_execute(
    problem: dict,
    completion: str,
    timeout: float,
    result,
    sandbox_mode: str | None = None,
    enforce_policy: bool = True,
):
    """
    Execute Rust code and return enhanced result schema.

    The outcome is not returned; exactly one result dict is appended to
    ``result`` on every path through the function.

    Args:
        problem: Problem record; ``"prompt"`` and ``"test"`` are written to
            the source file and ``"entry_point"`` (optional) names the
            function to extract from the completion.
        completion: Raw completion text.
        timeout: Seconds allowed; passed to ``time_limit`` and also used as
            the individual timeout of the compile, clippy and test steps.
        result: List-like object that receives the result dict via ``append``.
        sandbox_mode: Forwarded to the sandbox helpers; the value ``"none"``
            disables sandboxing.
        enforce_policy: When True, the extracted completion is checked
            against the disallowed-pattern policy before compiling.

    Result dict structure:
    {
    "compile_ok": bool | None,
    "test_ok": bool | None,
    "clippy_ok": bool | None,
    "compile_time_ms": int | None,
    "binary_size_bytes": int | None,
    "error_type": str | None, # "infra_missing_toolchain" | "compile_error" | "runtime_error" | "assertion_failure"
    "stderr": str,
    "passed": bool,
    "main_free": bool,
    "result": str, # Legacy field for compatibility
    }
    """
    with create_tempdir() as temp_dir, ReliabilityContext():
        result_dict = {
            "compile_ok": None,
            "test_ok": None,
            "clippy_ok": None,
            "compile_time_ms": None,
            "binary_size_bytes": None,
            "error_type": None,
            "stderr": "",
            "passed": False,
            "main_free": check_main_free(completion),
            "result": "",
        }

        # Preflight: without a working rustc nothing else can be evaluated.
        rustc_available, rustc_error = _check_rustc_available(sandbox_mode)
        if not rustc_available:
            result_dict["error_type"] = "infra_missing_toolchain"
            result_dict["stderr"] = rustc_error or "rustc not available"
            result_dict["result"] = f"failed: {result_dict['stderr']}"
            result.append(result_dict)
            return

        # Size/content validation runs on the raw completion; failures are
        # reported under the "compile_error" type with a "filtered:" result.
        validation_error = _validate_completion(completion)
        if validation_error:
            result_dict["error_type"] = "compile_error"
            result_dict["stderr"] = validation_error
            result_dict["result"] = f"filtered: {validation_error}"
            result.append(result_dict)
            return

        entry_point = problem.get("entry_point", "")
        cleaned_completion = _extract_function_body(completion, entry_point)

        # The policy check is applied to the extracted text, not the raw
        # completion.
        if enforce_policy:
            violation = _sanitize_rust_completion(cleaned_completion)
            if violation:
                result_dict["error_type"] = "compile_error"
                result_dict["stderr"] = violation
                result_dict["result"] = f"failed: {violation}"
                result.append(result_dict)
                return

        source_path = os.path.join(temp_dir, "solution.rs")
        test_binary = os.path.join(temp_dir, "solution_test")

        # Source layout: prompt, extracted completion, blank line, tests.
        with open(source_path, "w", encoding="utf-8") as source_file:
            source_file.write(problem["prompt"])
            source_file.write(cleaned_completion)
            source_file.write("\n\n")
            source_file.write(problem["test"])
            source_file.write("\n")

        # Copy so the module-level flag list is never mutated.
        compile_args = DETERMINISTIC_RUSTC_FLAGS.copy()

        effective_mode = sandbox_mode
        use_sandbox = SANDBOX_AVAILABLE and effective_mode != "none"

        timed_out = None
        try:
            # time_limit comes from .execution (not shown here); the object it
            # yields is only used below via is_set().
            with time_limit(timeout) as timed_out_event:
                timed_out = timed_out_event
                start_time = time.perf_counter()
                if use_sandbox:
                    try:
                        compile_result = run_rustc_sandboxed(
                            source_path,
                            test_binary,
                            compile_args,
                            timeout=timeout,
                            capture_output=True,
                            sandbox_mode=effective_mode,
                        )
                    except SandboxError as e:
                        result_dict["error_type"] = "infra_missing_toolchain"
                        result_dict["stderr"] = str(e)
                        result_dict["result"] = f"failed: sandbox error: {e}"
                        result.append(result_dict)
                        return
                else:
                    compile_result = subprocess.run(
                        ["rustc"] + compile_args + [source_path, "-o", test_binary],
                        capture_output=True,
                        text=True,
                        timeout=timeout,
                    )

                # Wall-clock time of the compile step, in whole milliseconds.
                result_dict["compile_time_ms"] = int(
                    (time.perf_counter() - start_time) * 1000
                )
                result_dict["compile_ok"] = compile_result.returncode == 0
                if compile_result.returncode != 0:
                    # Prefer stderr; fall back to stdout, then a fixed message.
                    failure = (
                        compile_result.stderr.strip() or compile_result.stdout.strip()
                    )
                    result_dict["error_type"] = "compile_error"
                    result_dict["stderr"] = failure or "compile error"
                    result_dict["result"] = f"failed: {result_dict['stderr']}"
                    result.append(result_dict)
                    return

                if os.path.exists(test_binary):
                    result_dict["binary_size_bytes"] = os.path.getsize(test_binary)

                # Clippy is best-effort and only attempted when cargo is on
                # PATH. Its outcome is recorded in clippy_ok/stderr but does
                # not stop the tests from running.
                # NOTE(review): on the passing path below, stderr is not
                # reset, so clippy output stays in "stderr" even when
                # "passed" is True.
                if shutil.which("cargo"):
                    try:
                        clippy_ok, clippy_stderr = _run_clippy_check(
                            source_path, timeout
                        )
                        result_dict["clippy_ok"] = clippy_ok
                        if not clippy_ok:
                            result_dict["stderr"] = clippy_stderr
                    except Exception as exc:  # noqa: BLE001
                        result_dict["clippy_ok"] = False
                        result_dict["stderr"] = str(exc)

                if use_sandbox:
                    try:
                        test_result = run_binary_sandboxed(
                            test_binary,
                            timeout=timeout,
                            capture_output=True,
                            sandbox_mode=effective_mode,
                        )
                    except SandboxError as e:
                        result_dict["error_type"] = "runtime_error"
                        result_dict["stderr"] = str(e)
                        result_dict["result"] = f"failed: sandbox error: {e}"
                        result.append(result_dict)
                        return
                else:
                    test_result = subprocess.run(
                        [test_binary],
                        capture_output=True,
                        text=True,
                        timeout=timeout,
                    )

                # Convert a set timeout flag into the exception handled below.
                if timed_out and timed_out.is_set():
                    raise TimeoutException("Timed out!")

                result_dict["test_ok"] = test_result.returncode == 0
                if test_result.returncode == 0:
                    result_dict["passed"] = True
                    result_dict["result"] = "passed"
                else:
                    failure = test_result.stderr.strip() or test_result.stdout.strip()
                    result_dict["error_type"] = "assertion_failure"
                    result_dict["stderr"] = failure or "tests failed"
                    result_dict["result"] = f"failed: {result_dict['stderr']}"

        except (TimeoutException, subprocess.TimeoutExpired):
            result_dict["error_type"] = "runtime_error"
            result_dict["stderr"] = "timeout"
            result_dict["result"] = "timed out"
        except BaseException as exc:  # noqa: BLE001
            # Deliberately broad: any other failure is reported as a
            # runtime_error result instead of propagating.
            result_dict["error_type"] = "runtime_error"
            result_dict["stderr"] = str(exc)
            result_dict["result"] = f"failed: {exc}"

        result.append(result_dict)
716
+
717
+
718
def rust_check_correctness(
    problem: dict,
    completion: str,
    timeout: float,
    completion_id: int | None = None,
    sandbox_mode: str | None = None,
    enforce_policy: bool = True,
) -> dict:
    """
    Compile a Rust completion, run its tests in a child process, and report the outcome.

    The work is delegated to ``_rust_unsafe_execute`` running in a separate
    ``multiprocessing.Process``. The child is given ``timeout + 1`` seconds
    and is killed if still alive afterwards; if it left no result behind, a
    "process timeout" result is synthesised.

    Args:
        problem: Problem dictionary; must contain ``"task_id"`` and is
            forwarded to the child unchanged.
        completion: Generated code completion.
        timeout: Timeout in seconds.
        completion_id: Optional completion ID, echoed back in the result.
        sandbox_mode: Optional sandbox mode, forwarded to the child.
        enforce_policy: Whether the child applies pattern-based policy
            filtering (default: True).

    Returns:
        Dictionary with keys ``task_id``, ``completion``, ``completion_id``,
        ``compile_ok``, ``test_ok``, ``clippy_ok``, ``compile_time_ms``,
        ``binary_size_bytes``, ``error_type``, ``stderr``, ``passed``,
        ``main_free`` and ``result`` (legacy field).
    """
    manager = multiprocessing.Manager()
    try:
        # Manager-backed list so the child process can hand its result back.
        result = manager.list()

        worker_args = (problem, completion, timeout, result, sandbox_mode, enforce_policy)
        process = multiprocessing.Process(target=_rust_unsafe_execute, args=worker_args)
        process.start()
        process.join(timeout=timeout + 1)
        if process.is_alive():
            process.kill()
            process.join()

        if not result:
            # The child produced nothing: report it as a timeout.
            result.append(
                {
                    "compile_ok": None,
                    "test_ok": None,
                    "error_type": "runtime_error",
                    "stderr": "process timeout",
                    "passed": False,
                    "main_free": check_main_free(completion),
                    "result": "timed out",
                }
            )

        first_entry = result[0]
        if isinstance(first_entry, dict):
            result_dict = first_entry
        else:
            # Legacy shape: a bare status string instead of a dict.
            result_dict = {"result": first_entry, "passed": first_entry == "passed"}

        return {
            "task_id": problem["task_id"],
            "completion": completion,
            "completion_id": completion_id,
            "compile_ok": result_dict.get("compile_ok"),
            "test_ok": result_dict.get("test_ok"),
            "clippy_ok": result_dict.get("clippy_ok"),
            "compile_time_ms": result_dict.get("compile_time_ms"),
            "binary_size_bytes": result_dict.get("binary_size_bytes"),
            "error_type": result_dict.get("error_type"),
            "stderr": result_dict.get("stderr", ""),
            "passed": result_dict.get("passed", False),
            "main_free": result_dict.get("main_free", check_main_free(completion)),
            "result": result_dict.get("result", ""),
        }
    finally:
        manager.shutdown()