human-eval-rust 2.1.0 (human_eval_rust-2.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HumanEval_rust.jsonl +164 -0
- data/HumanEval_rust_extended.jsonl +2 -0
- data/example_rust_problem.jsonl +1 -0
- data/example_rust_samples.jsonl +4 -0
- human_eval/__init__.py +23 -0
- human_eval/data.py +74 -0
- human_eval/evaluate_functional_correctness.py +112 -0
- human_eval/evaluation.py +281 -0
- human_eval/execution.py +186 -0
- human_eval/logging_config.py +43 -0
- human_eval/resource_monitor.py +58 -0
- human_eval/rust_execution.py +802 -0
- human_eval/sandbox.py +586 -0
- human_eval_rust-2.1.0.dist-info/METADATA +488 -0
- human_eval_rust-2.1.0.dist-info/RECORD +19 -0
- human_eval_rust-2.1.0.dist-info/WHEEL +5 -0
- human_eval_rust-2.1.0.dist-info/entry_points.txt +2 -0
- human_eval_rust-2.1.0.dist-info/licenses/LICENSE +21 -0
- human_eval_rust-2.1.0.dist-info/top_level.txt +1 -0
data/HumanEval_rust_extended.jsonl
ADDED
@@ -0,0 +1,2 @@
+{"task_id": "Extended/0", "prompt": "fn identity(x: i32) -> i32 {", "canonical_solution": " x\n}\n", "test": "#[test] fn test_identity() { assert_eq!(identity(5), 5); }", "entry_point": "identity"}
+{"task_id": "Extended/1", "prompt": "fn sum_slice(values: &[i32]) -> i32 {", "canonical_solution": " values.iter().sum()\n}\n", "test": "#[test] fn test_sum() { assert_eq!(sum_slice(&[1,2,3]), 6); }", "entry_point": "sum_slice"}
data/example_rust_problem.jsonl
ADDED
@@ -0,0 +1 @@
+{"task_id": "test_rust/0", "prompt": "fn return_one() -> i32 {\n", "canonical_solution": " 1\n}\n", "test": "\n#[cfg(test)]\nmod tests {\n use super::*;\n\n #[test]\n fn test_return_one() {\n assert_eq!(return_one(), 1);\n }\n}\n", "entry_point": "return_one"}
data/example_rust_samples.jsonl
ADDED
@@ -0,0 +1,4 @@
+{"task_id": "test_rust/0", "completion": " 1\n}\n"}
+{"task_id": "test_rust/0", "completion": " 0\n}\n"}
+{"task_id": "test_rust/0", "completion": " 2\n}\n"}
+{"task_id": "test_rust/0", "completion": " let mut value = 0;\n for _ in 0..1 {\n value += 1;\n }\n value\n}\n"}
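A sample is judged by concatenating the problem's `prompt`, the sample's `completion`, and the problem's `test` block into one Rust source file that is then compiled and run. Below is a minimal sketch of that assembly for the example problem above; the real logic (plus harness details not shown in this diff) lives in `human_eval/rust_execution.py`.

```python
# Illustrative sketch only -- how prompt + completion + test combine into a
# single compilable Rust file. The actual assembly is done by rust_execution.py.
problem = {
    "prompt": "fn return_one() -> i32 {\n",
    "test": "\n#[cfg(test)]\nmod tests {\n use super::*;\n\n #[test]\n fn test_return_one() {\n assert_eq!(return_one(), 1);\n }\n}\n",
}
sample = {"completion": " 1\n}\n"}  # first line of example_rust_samples.jsonl

rust_source = problem["prompt"] + sample["completion"] + problem["test"]
print(rust_source)  # a file whose #[test] passes only for a correct completion
```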
human_eval/__init__.py
ADDED
@@ -0,0 +1,23 @@
+"""
+HumanEval Rust evaluation package.
+
+Provides evaluation harness for the HumanEval Rust problem solving dataset.
+
+Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
+Version: 2.1.0
+"""
+
+__version__ = "2.1.0"
+
+# Export rust_execution module so it can be imported
+# Use relative import to avoid circular dependency issues
+from . import rust_execution
+
+
+class EvaluationError(Exception):
+    """Raised when evaluation cannot proceed due to data issues."""
+
+    pass
+
+
+__all__ = ["rust_execution", "__version__", "EvaluationError"]
human_eval/data.py
ADDED
@@ -0,0 +1,74 @@
+"""
+Data loading utilities for HumanEval Rust dataset.
+
+Provides functions to read and write JSONL files containing problems and completions.
+
+Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
+Version: 2.1.0
+"""
+
+import gzip
+import importlib.resources
+import json
+import os
+from collections.abc import Iterable
+
+ROOT = os.path.dirname(os.path.abspath(__file__))
+HUMAN_EVAL_RUST = os.path.join(ROOT, "..", "data", "HumanEval_rust.jsonl")
+
+
+def get_human_eval_dataset(language: str | None = None) -> str:
+    """Returns path to HumanEval dataset, using importlib.resources."""
+
+    if language and language.lower() != "rust":
+        raise ValueError(f"Only Rust is supported. Got language: {language}")
+
+    with importlib.resources.as_file(
+        importlib.resources.files("human_eval").joinpath("../data/HumanEval_rust.jsonl")
+    ) as path:
+        return str(path)
+
+
+def read_problems(evalset_file: str | None = None) -> dict[str, dict]:
+    """Reads problems from the specified file, or defaults to the Rust dataset."""
+
+    if evalset_file is None:
+        evalset_file = get_human_eval_dataset()
+    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
+
+
+def stream_jsonl(filename: str) -> Iterable[dict]:
+    """
+    Parses each jsonl line and yields it as a dictionary
+    """
+    if filename.endswith(".gz"):
+        with open(filename, "rb") as gzfp:
+            with gzip.open(gzfp, "rt", encoding="utf-8") as fp:
+                for line in fp:
+                    if any(not x.isspace() for x in line):
+                        yield json.loads(line)
+    else:
+        with open(filename, "r", encoding="utf-8") as fp:
+            for line in fp:
+                if any(not x.isspace() for x in line):
+                    yield json.loads(line)
+
+
+def write_jsonl(filename: str, data: Iterable[dict], append: bool = False):
+    """
+    Writes an iterable of dictionaries to jsonl
+    """
+    if append:
+        mode = "ab"
+    else:
+        mode = "wb"
+    filename = os.path.expanduser(filename)
+    if filename.endswith(".gz"):
+        with open(filename, mode) as fp:
+            with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
+                for x in data:
+                    gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
+    else:
+        with open(filename, mode) as fp:
+            for x in data:
+                fp.write((json.dumps(x) + "\n").encode("utf-8"))
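Example usage (not part of the package): produce a samples file in the shape the evaluator expects, using the helpers above. The placeholder completion is an assumption for illustration; a real run would substitute model output.

```python
# Usage sketch: read the bundled problems and emit one placeholder completion
# per task in the JSONL format shown in data/example_rust_samples.jsonl.
from human_eval.data import read_problems, write_jsonl

problems = read_problems()  # defaults to data/HumanEval_rust.jsonl

samples = [
    # A completion is the function body plus the closing brace.
    {"task_id": task_id, "completion": " unimplemented!()\n}\n"}
    for task_id in problems
]
write_jsonl("samples.jsonl", samples)
```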
human_eval/evaluate_functional_correctness.py
ADDED
@@ -0,0 +1,112 @@
+"""
+Command-line entry point for HumanEval Rust functional correctness evaluation.
+
+Provides CLI interface using Fire for evaluating Rust code completions.
+
+Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
+Version: 2.1.0
+"""
+
+import sys
+
+import fire
+
+from human_eval.data import get_human_eval_dataset
+from human_eval.evaluation import evaluate_functional_correctness
+
+
+def entry_point(
+    sample_file: str,
+    k: str = "1,10,100",
+    n_workers: int = 24,  # Optimized for H100: 24 workers (26 vCPUs - 2 reserved)
+    timeout: float = 10.0,  # Optimized for H100: 10s timeout (was 3.0s) for compilation latency
+    problem_file: str | None = None,
+    language: str | None = None,
+    sandbox_mode: str | None = None,
+    allow_no_sandbox: bool = False,
+    enforce_policy: bool = True,
+):
+    """
+    Evaluate HumanEval Rust completions and write a "<input>_results.jsonl"
+    file containing pass/fail metadata.
+
+    Arguments:
+        sample_file: Path to a JSONL file containing Rust completions with
+            `task_id` and `completion` fields.
+        k: Comma-separated list of pass@k values to compute (e.g. "1,10,100").
+        n_workers: Number of parallel workers to use when running tests.
+        timeout: Per-sample timeout in seconds for compilation/execution.
+        problem_file: Optional dataset override. If omitted, defaults to the
+            Rust HumanEval dataset.
+        language: Kept for API compatibility but only "rust" is supported.
+            If not provided, defaults to "rust".
+        sandbox_mode: Sandbox mode ("firejail" or "none").
+            - firejail (recommended): Uses Firejail for Linux process isolation
+            - none: No sandboxing (UNSAFE - only for local dev with trusted code)
+            If not specified, auto-detects Firejail availability.
+        allow_no_sandbox: Allow proceeding without sandbox in non-interactive mode.
+            Use with --sandbox-mode=none or when Firejail is unavailable.
+            Required for automated pipelines that accept unsandboxed execution.
+        enforce_policy: Whether to enforce pattern-based policy filtering (default: True).
+            Set to False for pure HumanEval compatibility without security filtering.
+            Use --no-enforce-policy to disable policy enforcement.
+    """
+    k_list: list[int] = list(map(int, k.split(",")))
+    if problem_file is None:
+        problem_file = get_human_eval_dataset(language)
+
+    # Resolve sandbox mode with user interaction if needed
+    try:
+        from human_eval.sandbox import check_firejail_available, resolve_sandbox_mode
+
+        # Determine if we're in interactive mode (stdin is a TTY)
+        non_interactive = not sys.stdin.isatty()
+
+        resolved_mode = resolve_sandbox_mode(
+            sandbox_mode=sandbox_mode,
+            allow_no_sandbox=allow_no_sandbox,
+            non_interactive=non_interactive,
+        )
+
+        if resolved_mode == "firejail":
+            status = check_firejail_available()
+            print(f"Using Firejail sandboxing ({status.version})", file=sys.stderr)
+        elif resolved_mode == "none":
+            if not allow_no_sandbox:
+                print(
+                    "⚠ WARNING: Running without sandbox. This is UNSAFE for untrusted code!",
+                    file=sys.stderr,
+                )
+
+        sandbox_mode = resolved_mode
+
+    except ImportError:
+        # Sandbox module not available
+        sandbox_mode = "none"
+        print(
+            "WARNING: Sandbox module not available, running without sandboxing",
+            file=sys.stderr,
+        )
+    except SystemExit:
+        # User cancelled the prompt
+        raise
+
+    results = evaluate_functional_correctness(
+        sample_file,
+        k_list,
+        n_workers,
+        timeout,
+        problem_file,
+        language,
+        sandbox_mode,
+        enforce_policy,
+    )
+    print(results)
+
+
+def main():
+    fire.Fire(entry_point)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
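The console-script name registered in entry_points.txt is not shown in this diff, so the sketch below drives the entry point directly from Python instead of guessing it. It assumes `samples.jsonl` contains at least one completion for every task in the dataset and that Firejail is installed.

```python
# Sketch only: programmatic equivalent of the Fire-based CLI above.
from human_eval.evaluate_functional_correctness import entry_point

entry_point(
    sample_file="samples.jsonl",  # JSONL with task_id/completion fields
    k="1,10",                     # compute pass@1 and pass@10
    n_workers=4,
    timeout=10.0,
    sandbox_mode="firejail",      # or "none" plus allow_no_sandbox=True
)
```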
human_eval/evaluation.py
ADDED
@@ -0,0 +1,281 @@
+"""
+Functional correctness evaluation for HumanEval Rust completions.
+
+Implements pass@k estimation and parallel test execution.
+
+Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
+Version: 2.1.0
+"""
+
+import itertools
+import subprocess
+from collections import Counter, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import numpy as np
+import tqdm
+
+from human_eval import EvaluationError
+from human_eval.data import (
+    get_human_eval_dataset,
+    read_problems,
+    stream_jsonl,
+    write_jsonl,
+)
+from human_eval.execution import check_correctness
+from human_eval.rust_execution import check_main_free
+
+
+def estimate_pass_at_k(
+    num_samples: int | list[int] | np.ndarray,
+    num_correct: list[int] | np.ndarray,
+    k: int,
+) -> np.ndarray:
+    """
+    Estimates pass@k of each problem and returns them in an array.
+    """
+
+    def estimator(n: int, c: int, k: int) -> float:
+        """
+        Calculates 1 - comb(n - c, k) / comb(n, k).
+        """
+        if n - c < k:
+            return 1.0
+        return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
+
+    if isinstance(num_samples, int):
+        num_samples_it = itertools.repeat(num_samples, len(num_correct))
+    else:
+        assert len(num_samples) == len(num_correct)
+        num_samples_it = iter(num_samples)
+
+    return np.array(
+        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
+    )
+
+
+def _resolve_language(language: str | None, problem_file: str) -> str:
+    """
+    Resolves the language for evaluation. Only Rust is supported.
+    """
+    if language and language.lower() != "rust":
+        raise ValueError(
+            f"Only Rust is supported. Got language: {language}. "
+            "This evaluator only supports Rust code evaluation."
+        )
+    return "rust"
+
+
+def _get_rustc_version() -> str:
+    result = subprocess.run(["rustc", "--version"], capture_output=True, text=True)
+    return result.stdout.strip() if result.returncode == 0 else "unknown"
+
+
+def evaluate_functional_correctness(
+    sample_file: str,
+    k: list[int] = [1, 10, 100],
+    n_workers: int = 4,
+    timeout: float = 3.0,
+    problem_file: str | None = None,
+    language: str | None = None,
+    sandbox_mode: str | None = None,
+    enforce_policy: bool = True,
+):
+    """
+    Evaluates the functional correctness of generated samples, and writes
+    results to f"{sample_file}_results.jsonl" (one JSON object per sample result).
+
+    Args:
+        sample_file: Path to JSONL file with completions
+        k: List of pass@k values to compute
+        n_workers: Number of parallel workers
+        timeout: Per-sample timeout in seconds
+        problem_file: Optional problem dataset file
+        language: Language (only "rust" supported)
+        sandbox_mode: Sandbox mode ("firejail", "none", or None for auto-detect)
+        enforce_policy: Whether to enforce pattern-based policy filtering (default: True).
+            Set to False for pure HumanEval compatibility without security filtering.
+    """
+
+    if problem_file is None:
+        problem_file = get_human_eval_dataset(language or "rust")
+
+    resolved_language = _resolve_language(language, problem_file)
+
+    problems = read_problems(problem_file)
+
+    # Check the generated samples against test suites.
+    with ThreadPoolExecutor(max_workers=n_workers) as executor:
+
+        futures = []
+        completion_id = Counter()
+        n_samples = 0
+        results = defaultdict(list)
+
+        print("Reading samples...")
+        for sample in tqdm.tqdm(stream_jsonl(sample_file)):
+            task_id = sample["task_id"]
+            problem = problems.get(task_id)
+
+            if problem is None:
+                raise KeyError(f"Unknown task_id '{task_id}' in {sample_file}.")
+
+            completion = sample["completion"]
+            args = (
+                problem,
+                completion,
+                timeout,
+                completion_id[task_id],
+                resolved_language,
+                sandbox_mode,
+                enforce_policy,
+            )
+            future = executor.submit(check_correctness, *args)
+            futures.append(future)
+            completion_id[task_id] += 1
+            n_samples += 1
+
+        if len(completion_id) != len(problems):
+            missing = set(problems.keys()) - set(completion_id.keys())
+            raise EvaluationError(
+                f"Missing completions for {len(missing)} problems: {list(missing)[:5]}..."
+            )
+
+        print("Running test suites...")
+        all_results_list = []
+        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
+            result = future.result()
+            results[result["task_id"]].append((result["completion_id"], result))
+            all_results_list.append(result)
+
+    # Track compile rate and main-free rate
+    compile_ok_count = sum(1 for r in all_results_list if r.get("compile_ok") is True)
+    compile_total = sum(1 for r in all_results_list if r.get("compile_ok") is not None)
+    compile_rate = compile_ok_count / compile_total if compile_total > 0 else 0.0
+
+    main_free_count = sum(1 for r in all_results_list if r.get("main_free") is True)
+    main_free_rate = (
+        main_free_count / len(all_results_list) if all_results_list else 0.0
+    )
+
+    clippy_total = sum(1 for r in all_results_list if r.get("clippy_ok") is not None)
+    clippy_pass = sum(1 for r in all_results_list if r.get("clippy_ok") is True)
+    clippy_rate = clippy_pass / clippy_total if clippy_total else 0.0
+
+    compile_times = [
+        r.get("compile_time_ms")
+        for r in all_results_list
+        if r.get("compile_time_ms") is not None
+    ]
+    binary_sizes = [
+        r.get("binary_size_bytes")
+        for r in all_results_list
+        if r.get("binary_size_bytes") is not None
+    ]
+
+    # Calculate pass@k.
+    total, correct = [], []
+    for result in results.values():
+        result.sort()
+        passed = [r[1]["passed"] for r in result]
+        total.append(len(passed))
+        correct.append(sum(passed))
+    total = np.array(total)
+    correct = np.array(correct)
+
+    ks = k
+    pass_at_k = {
+        f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
+        for k in ks
+        if (total >= k).all()
+    }
+
+    pass_at_k["compile_rate"] = compile_rate
+    pass_at_k["main_free_rate"] = main_free_rate
+    pass_at_k["clippy_pass_rate"] = clippy_rate
+    pass_at_k["avg_compile_time_ms"] = (
+        float(np.mean(compile_times)) if compile_times else 0.0
+    )
+    pass_at_k["avg_binary_size_bytes"] = (
+        float(np.mean(binary_sizes)) if binary_sizes else 0.0
+    )
+    pass_at_k["rustc_version"] = _get_rustc_version()
+
+    # Print metrics
+    print("\nMetrics:")
+    print(f" Compile rate: {compile_rate:.4f} ({compile_rate * 100:.2f}%)")
+    print(f" Main-free rate: {main_free_rate:.4f} ({main_free_rate * 100:.2f}%)")
+    print(f" Clippy pass rate: {clippy_rate:.4f} ({clippy_rate * 100:.2f}%)")
+    if compile_times:
+        print(f" Avg compile time (ms): {np.mean(compile_times):.2f}")
+    if binary_sizes:
+        print(f" Avg binary size (bytes): {np.mean(binary_sizes):.2f}")
+    for metric, value in sorted(pass_at_k.items()):
+        if metric not in (
+            "compile_rate",
+            "main_free_rate",
+            "clippy_pass_rate",
+            "avg_compile_time_ms",
+            "avg_binary_size_bytes",
+            "rustc_version",
+        ):
+            print(f" {metric}: {value:.4f} ({value * 100:.2f}%)")
+    print(f" rustc: {pass_at_k['rustc_version']}")
+
+    # Finally, save the results in one file:
+    # Writes to "<sample_file>_results.jsonl" (one JSON object per sample result)
+    # Ensure all completions are included (never drop silently)
+    def combine_results():
+        # Read all samples to ensure we don't miss any
+        samples_by_task = defaultdict(list)
+        for sample in stream_jsonl(sample_file):
+            samples_by_task[sample["task_id"]].append(sample)
+
+        # Match results with samples
+        for task_id in sorted(samples_by_task.keys()):
+            task_samples = samples_by_task[task_id]
+            task_results = results.get(task_id, [])
+            task_results.sort()
+
+            # Ensure we have a result for every sample
+            for i, sample in enumerate(task_samples):
+                if i < len(task_results):
+                    result = task_results[i][1]
+                    sample.update(
+                        {
+                            "compile_ok": result.get("compile_ok"),
+                            "test_ok": result.get("test_ok"),
+                            "clippy_ok": result.get("clippy_ok"),
+                            "compile_time_ms": result.get("compile_time_ms"),
+                            "binary_size_bytes": result.get("binary_size_bytes"),
+                            "error_type": result.get("error_type"),
+                            "stderr": result.get("stderr", ""),
+                            "main_free": result.get("main_free"),
+                            "result": result.get("result", ""),
+                            "passed": result.get("passed", False),
+                        }
+                    )
+                else:
+                    # Missing result - create placeholder (never drop silently)
+                    sample.update(
+                        {
+                            "compile_ok": None,
+                            "test_ok": None,
+                            "clippy_ok": None,
+                            "compile_time_ms": None,
+                            "binary_size_bytes": None,
+                            "error_type": "runtime_error",
+                            "stderr": "missing result",
+                            "main_free": check_main_free(sample.get("completion", "")),
+                            "result": "filtered: missing result",
+                            "passed": False,
+                        }
+                    )
+                assert sample is not None
+                yield sample
+
+    out_file = sample_file + "_results.jsonl"
+    print(f"Writing results to {out_file}...")
+    write_jsonl(out_file, tqdm.tqdm(combine_results(), total=n_samples))
+
+    return pass_at_k
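A worked example of the unbiased estimator behind `estimate_pass_at_k`, which computes pass@k = 1 - C(n - c, k) / C(n, k) for n samples per task of which c passed (illustration only, not part of the package):

```python
# Per-task pass@2 for two tasks with 5 samples each.
from human_eval.evaluation import estimate_pass_at_k

num_samples = [5, 5]  # completions generated per task
num_correct = [2, 0]  # completions that passed per task

per_task = estimate_pass_at_k(num_samples, num_correct, k=2)
print(per_task)         # [0.7 0. ]  since 1 - C(3,2)/C(5,2) = 1 - 3/10 = 0.7
print(per_task.mean())  # 0.35, the value reported as "pass@2"
```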
human_eval/execution.py
ADDED
@@ -0,0 +1,186 @@
+"""
+Execution utilities for HumanEval Rust evaluation.
+
+Provides timeout handling, reliability guards, and correctness checking.
+
+Copyright (c) 2025 Dave Tofflemire, SigilDERG Project
+Version: 2.1.0
+"""
+
+# pyright: reportAttributeAccessIssue=false, reportArgumentType=false
+
+import contextlib
+import faulthandler
+import os
+import platform
+import tempfile
+import threading
+
+
+def check_correctness(
+    problem: dict,
+    completion: str,
+    timeout: float,
+    completion_id: int | None = None,
+    language: str | None = None,
+    sandbox_mode: str | None = None,
+    enforce_policy: bool = True,
+) -> dict:
+    """
+    Evaluates the functional correctness of a Rust completion by compiling
+    and running the test suite provided in the problem.
+
+    :param completion_id: an optional completion ID so we can match
+        the results later even if execution finishes asynchronously.
+    :param sandbox_mode: Optional sandbox mode ("docker", "firejail", "none", or None for auto-detect)
+    :param enforce_policy: Whether to enforce pattern-based policy filtering (default: True).
+        Set to False for pure HumanEval compatibility without security filtering.
+    """
+    # Import here to avoid circular import (rust_execution imports from execution)
+    from human_eval import rust_execution
+
+    # Language parameter is kept for API compatibility but only Rust is supported
+    if language and language.lower() != "rust":
+        raise ValueError(f"Only Rust is supported. Got language: {language}")
+
+    return rust_execution.rust_check_correctness(
+        problem, completion, timeout, completion_id, sandbox_mode, enforce_policy
+    )
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float):
+    """Thread-safe timeout context manager using Timer."""
+
+    timed_out = threading.Event()
+
+    def timeout_handler():
+        timed_out.set()
+
+    timer = threading.Timer(seconds, timeout_handler)
+    timer.start()
+    try:
+        yield timed_out
+    finally:
+        if timer:
+            timer.cancel()
+
+    if timed_out.is_set():
+        raise TimeoutException("Timed out!")
+
+
+@contextlib.contextmanager
+def create_tempdir():
+    original_unlink = os.unlink
+    original_rmdir = os.rmdir
+    with tempfile.TemporaryDirectory() as dirname:
+        with chdir(dirname):
+            yield dirname
+        os.unlink = original_unlink
+        os.rmdir = original_rmdir
+
+
+class TimeoutException(Exception):
+    pass
+
+
+@contextlib.contextmanager
+def chdir(root):
+    if root == ".":
+        yield
+        return
+    cwd = os.getcwd()
+    os.chdir(root)
+    try:
+        yield
+    except BaseException as exc:
+        raise exc
+    finally:
+        os.chdir(cwd)
+
+
+def reliability_guard(maximum_memory_bytes: int | None = None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(
+            resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
+        )
+        resource.setrlimit(
+            resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
+        )
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(
+                resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
+            )
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
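A small usage sketch for `time_limit` (not from the package): it yields a `threading.Event` that cooperative work should poll, and raises `TimeoutException` when the block exits after the timer has fired.

```python
# Sketch only: the sleep stands in for one unit of real work.
import time

from human_eval.execution import TimeoutException, time_limit

try:
    with time_limit(2.0) as timed_out:
        # Cooperative work loop: check the event between units of work.
        while not timed_out.is_set():
            time.sleep(0.1)  # stand-in for real work
except TimeoutException:
    print("gave up after 2 seconds")
```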