human-eval-rust 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HumanEval_rust.jsonl +164 -0
- data/HumanEval_rust_extended.jsonl +2 -0
- data/example_rust_problem.jsonl +1 -0
- data/example_rust_samples.jsonl +4 -0
- human_eval/__init__.py +23 -0
- human_eval/data.py +74 -0
- human_eval/evaluate_functional_correctness.py +112 -0
- human_eval/evaluation.py +281 -0
- human_eval/execution.py +186 -0
- human_eval/logging_config.py +43 -0
- human_eval/resource_monitor.py +58 -0
- human_eval/rust_execution.py +802 -0
- human_eval/sandbox.py +586 -0
- human_eval_rust-2.1.0.dist-info/METADATA +488 -0
- human_eval_rust-2.1.0.dist-info/RECORD +19 -0
- human_eval_rust-2.1.0.dist-info/WHEEL +5 -0
- human_eval_rust-2.1.0.dist-info/entry_points.txt +2 -0
- human_eval_rust-2.1.0.dist-info/licenses/LICENSE +21 -0
- human_eval_rust-2.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,488 @@
Metadata-Version: 2.4
Name: human-eval-rust
Version: 2.1.0
Summary: Evaluation harness for the HumanEval Rust problem solving dataset
Author-email: Dave Tofflemire <davetmire85@gmail.com>
Maintainer-email: Dave Tofflemire <davetmire85@gmail.com>
License-Expression: MIT
Project-URL: Related Projects, https://github.com/Superuser666-Sigil
Project-URL: Pipeline, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
Project-URL: Finetuner, https://github.com/Superuser666-Sigil/SigilDERG-Finetuner
Project-URL: Evaluation, https://github.com/Superuser666-Sigil/human-eval-Rust
Keywords: evaluation,rust,code,llm,benchmark
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Rust
Classifier: Topic :: Software Development :: Testing
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.12
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: tqdm>=4.65.0
Requires-Dist: fire>=0.5.0
Requires-Dist: numpy>=1.24.0
Requires-Dist: termcolor>=3.2.0
Requires-Dist: psutil>=5.9.0
Provides-Extra: flash-attn
Requires-Dist: flash-attn>=2.5.0; extra == "flash-attn"
Provides-Extra: ecosystem
Requires-Dist: sigil-pipeline>=1.2.0; extra == "ecosystem"
Requires-Dist: sigilderg-finetuner>=2.8.0; extra == "ecosystem"
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
Requires-Dist: pytest-mock>=3.11.0; extra == "dev"
Requires-Dist: pytest-benchmark>=4.0.0; extra == "dev"
Requires-Dist: black>=23.0.0; extra == "dev"
Requires-Dist: ruff>=0.1.0; extra == "dev"
Requires-Dist: isort>=5.12.0; extra == "dev"
Requires-Dist: flake8>=7.0.0; extra == "dev"
Requires-Dist: mypy>=1.5.0; extra == "dev"
Requires-Dist: hypothesis>=6.92.0; extra == "dev"
Requires-Dist: mutmut>=3.0.0; extra == "dev"
Provides-Extra: test
Requires-Dist: pytest>=7.4.0; extra == "test"
Requires-Dist: pytest-cov>=4.1.0; extra == "test"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "test"
Requires-Dist: pytest-mock>=3.11.0; extra == "test"
Requires-Dist: hypothesis>=6.92.0; extra == "test"
Dynamic: license-file

# HumanEval Rust: Evaluation Harness for SigilDERG Ecosystem

A specialized evaluation harness for assessing the Rust code generation capabilities of language models, designed as a core component of the [SigilDERG ecosystem](https://github.com/Superuser666-Sigil) for Rust-focused AI development.

> 📖 **Ecosystem Architecture**: For a comprehensive overview of how this project integrates with [SigilDERG-Data_Production](https://github.com/Superuser666-Sigil/SigilDERG-Data_Production) and [SigilDERG-Finetuner](https://github.com/Superuser666-Sigil/SigilDERG-Finetuner), see [ARCHITECTURE.md](https://github.com/Superuser666-Sigil/SigilDERG-Data_Production/blob/main/ARCHITECTURE.md) in the Data Production repository.

## About the SigilDERG Ecosystem

This evaluation harness is part of an integrated pipeline for training and evaluating Rust code generation models:

1. **[SigilDERG-Data_Production](https://github.com/Superuser666-Sigil/SigilDERG-Data_Production)**: Generates high-quality, instruction-style Rust code datasets from real-world crates using static analysis and quality filters
2. **[SigilDERG-Finetuner](https://github.com/Superuser666-Sigil/SigilDERG-Finetuner)**: Fine-tunes language models (such as Llama-3.1-8B-Instruct) on Rust code using QLoRA and multi-phase training strategies
3. **HumanEval Rust** (this project): Evaluates model performance on standardized Rust programming problems using the HumanEval benchmark format
4. **[sigil-mmf-codex-priv](https://github.com/Superuser666-Sigil/sigil-mmf-codex-priv)**: Additional components for the ecosystem

### Target Model

This evaluator is designed to work with fine-tuned Rust code generation models, particularly:
- **[Llama-3.1-8B-Instruct-Rust-QLora](https://huggingface.co/Superuser666-Sigil/Llama-3.1-8B-Instruct-Rust-QLora)**: A Phase 1 fine-tuned model produced with the SigilDERG Finetuner

## Installation

### Prerequisites

This package requires **Python 3.12.10 or later**. We recommend using a virtual environment:

```bash
# Using venv (recommended)
python3.12 -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Or using uv (fast alternative)
uv venv
source .venv/bin/activate  # On Windows: .venv\Scripts\activate
```

Install a Rust toolchain via [`rustup`](https://www.rust-lang.org/tools/install) and ensure a modern compiler with Edition 2021 support (Rust 1.56+; we recommend the latest stable toolchain):

```bash
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
rustup default stable
rustc --version
```

### Install from PyPI

```bash
pip install human-eval-rust
```

📦 **Package available on PyPI**: [https://pypi.org/project/human-eval-rust/](https://pypi.org/project/human-eval-rust/)

### Install Full Ecosystem

Install all three SigilDERG packages together:

```bash
pip install human-eval-rust[ecosystem]
```

Or install via the pipeline package:

```bash
pip install sigil-pipeline[ecosystem]
```

This installs:
- `human-eval-rust>=2.0.0`
- `sigil-pipeline>=1.2.1`
- `sigilderg-finetuner>=2.8.0`

### Install from source

```bash
git clone https://github.com/Superuser666-Sigil/human-eval-Rust.git
cd human-eval-Rust
pip install -e .
```

## Usage

**⚠️ Security Warning**: This program exists to run untrusted, model-generated Rust code; you are strongly encouraged not to do so outside of a robust security sandbox. Completions are compiled and executed via [`rust_execution.py`](human_eval/rust_execution.py), which builds binaries from untrusted code and runs their tests locally, so always sandbox the evaluator.

### Basic Evaluation Workflow

1. **Generate completions** from your model using the HumanEval Rust prompts
2. **Save samples** in JSONL format with `task_id` and `completion` fields (see the sketch after this list)
3. **Run evaluation** to get pass@k metrics and detailed results
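
A minimal sketch of step 2, using the package's own `write_jsonl` helper; the completion strings below are hypothetical placeholders:

```python
from human_eval.data import write_jsonl

# One JSON object per line; repeat a task_id to store several samples for
# the same problem (needed to estimate pass@k for k > 1).
samples = [
    {"task_id": "HumanEval/0", "completion": "    a < b  // hypothetical body\n}"},
    {"task_id": "HumanEval/0", "completion": "    b > a  // hypothetical body\n}"},
]
write_jsonl("rust_samples.jsonl", samples)
```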

### Example: Evaluating a Fine-Tuned Model

```python
from human_eval.data import read_problems, write_jsonl, get_human_eval_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import AutoPeftModelForCausalLM
import torch

# Load your fine-tuned PEFT model (e.g., from HuggingFace)
# For checkpoint subdirectories, use: "repo-name/checkpoint-9000"
model_name = "Superuser666-Sigil/Llama-3.1-8B-Instruct-Rust-QLora/checkpoint-9000"

# Load tokenizer (try checkpoint subfolder first, fall back to repo root)
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception:
    # Fallback: load from repo root or base model
    repo_name = model_name.split("/checkpoint-")[0] if "/checkpoint-" in model_name else model_name
    tokenizer = AutoTokenizer.from_pretrained(repo_name)

# Load PEFT model with explicit parameters to avoid TensorFlow loading issues
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    from_tf=False,  # Explicitly prevent TensorFlow loading
    use_safetensors=True,  # Prefer SafeTensors format
)

# For base models (not PEFT), use:
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     dtype=torch.bfloat16,
#     device_map="auto",
#     trust_remote_code=True,
#     from_tf=False,
#     use_safetensors=True,
# )

# Load HumanEval Rust problems
rust_problems = read_problems(get_human_eval_dataset())

# Generate completions
samples = []
for task_id, problem in rust_problems.items():
    prompt = problem["prompt"]

    # Generate completion (adjust parameters as needed); move inputs to the
    # model's device so generation works with device_map="auto"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.2,
            do_sample=True,
        )
    completion = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

    samples.append(dict(task_id=task_id, completion=completion))

# Save samples
write_jsonl("rust_samples.jsonl", samples)

# Evaluate
# Run: evaluate_functional_correctness rust_samples.jsonl
```

### Command-Line Evaluation

```bash
$ evaluate_functional_correctness rust_samples.jsonl
Reading samples...
164it [00:01, 1959.50it/s]
Running test suites...
100%|...| 164/164 [00:45<00:00, 3.62it/s]
Writing results to rust_samples.jsonl_results.jsonl...
100%|...| 164/164 [00:00<00:00, 42876.84it/s]
{'pass@1': 0.42, 'pass@10': 0.68, 'pass@100': 0.85}
```

The evaluator writes detailed results to `<input>_results.jsonl`, with per-sample pass/fail status and an execution result of "passed", "timed out", or "failed".
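
A minimal sketch for triaging that file, assuming one JSON object per line with the `task_id`, `passed`, and `result` fields:

```python
import json

# Group failing tasks by their execution result
# ("passed", "timed out", or "failed").
failures = {}
with open("rust_samples.jsonl_results.jsonl") as f:
    for line in f:
        record = json.loads(line)
        if not record["passed"]:
            failures.setdefault(record["result"], []).append(record["task_id"])

for result, task_ids in failures.items():
    print(f"{result}: {len(task_ids)} tasks, e.g. {task_ids[:3]}")
```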

### Integration with SigilDERG Finetuner

The evaluation workflow integrates seamlessly with the [SigilDERG Finetuner](https://github.com/Superuser666-Sigil/SigilDERG-Finetuner) evaluation system:

1. **After training**: Use the finetuner's evaluation scripts to generate samples
2. **Run this evaluator**: Process the generated samples to get HumanEval metrics
3. **Compare metrics**: Track improvements across training phases

Example integration:
```bash
# After Phase 1 training, evaluate checkpoint
python scripts/generate_samples.py \
    --checkpoint out/llama8b-rust-qlora-phase1/checkpoint-1000 \
    --output eval_samples.jsonl

# Evaluate with HumanEval Rust
evaluate_functional_correctness eval_samples.jsonl \
    --problem_file=data/HumanEval_rust.jsonl
```

### Quick Sanity Check

The example samples should yield 0.5 pass@1:
```bash
$ evaluate_functional_correctness data/example_rust_samples.jsonl --problem_file=data/example_rust_problem.jsonl
Reading samples...
4it [00:00, 1959.50it/s]
Running test suites...
100%|...| 4/4 [00:03<00:00, 1.13it/s]
Writing results to data/example_rust_samples.jsonl_results.jsonl...
100%|...| 4/4 [00:00<00:00, 1536.38it/s]
{'pass@1': 0.5}
```

### Advanced Options

```bash
# Custom pass@k values
evaluate_functional_correctness samples.jsonl --k=1,5,10,20

# Adjust parallelism (default: 24 workers, tuned for H100-class hosts)
evaluate_functional_correctness samples.jsonl --n_workers=8

# Custom timeout (default: 10.0s, tuned for H100-class hosts)
evaluate_functional_correctness samples.jsonl --timeout=5.0

# Sandboxing options
evaluate_functional_correctness samples.jsonl --sandbox-mode=firejail  # Recommended
evaluate_functional_correctness samples.jsonl --sandbox-mode=none      # UNSAFE: local dev only

# Non-interactive mode (for CI/automated pipelines)
evaluate_functional_correctness samples.jsonl --allow-no-sandbox  # Required when Firejail is unavailable

# Policy enforcement (pattern filtering)
evaluate_functional_correctness samples.jsonl --enforce-policy     # Default: enabled
evaluate_functional_correctness samples.jsonl --no-enforce-policy  # Disable for pure HumanEval compatibility

# See all options
evaluate_functional_correctness --help
```

### Security and Sandboxing

**⚠️ Important**: This evaluator runs untrusted LLM-generated Rust code. For production use, **always use Firejail sandboxing**.

The evaluator includes multiple layers of security:

1. **Pattern-based filtering** (optional, enabled by default): Blocks dangerous code patterns before execution (filesystem, network, and process operations, unsafe code, etc.). Can be disabled with `--no-enforce-policy` for pure HumanEval compatibility; an illustrative sketch follows this list.
2. **Process isolation**: Each evaluation runs in a separate process
3. **Firejail sandboxing** (recommended): Full process-jail isolation with resource limits
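
A minimal illustrative sketch of layer 1. The patterns below are assumptions for demonstration only; the package's real policy list is more comprehensive and may differ in detail:

```python
import re

# Assumed patterns for demonstration; not the package's actual policy list.
BLOCKED_PATTERNS = [
    r"\bstd::fs\b",       # filesystem operations
    r"\bstd::net\b",      # network access
    r"\bstd::process\b",  # spawning processes
    r"\bunsafe\b",        # unsafe blocks
]

def violates_policy(rust_code: str) -> bool:
    """Return True if a completion matches any blocked pattern."""
    return any(re.search(pattern, rust_code) for pattern in BLOCKED_PATTERNS)
```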

**Policy Enforcement Modes**:
- `--enforce-policy` (default): Enables pattern-based filtering for security. Use this for production evaluation of untrusted LLM-generated code.
- `--no-enforce-policy`: Disables pattern filtering for pure HumanEval compatibility. Use this when you need exact 1:1 comparability with the original HumanEval benchmark format (research/publication mode).

**Sandbox Modes**:
- `firejail` (recommended): Uses Firejail for Linux process isolation with `--net=none`, a private filesystem, and memory/CPU limits (see the sketch after this list)
- `none`: No sandboxing (UNSAFE; only for local development with trusted code)
- Auto-detect (default): Automatically detects Firejail availability; prompts for installation or unsafe mode if unavailable
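
A minimal sketch of how a compiled test binary might be jailed, assuming the flag set implied by the mode description above; the real invocation (plausibly in `human_eval/sandbox.py`) is more elaborate:

```python
import subprocess

def run_in_firejail(binary_path: str, timeout: float = 10.0) -> subprocess.CompletedProcess:
    """Run a compiled test binary inside a Firejail jail (illustrative only)."""
    cmd = [
        "firejail", "--quiet",
        "--net=none",              # no network access
        "--private",               # private view of the filesystem
        "--rlimit-as=4294967296",  # ~4GB address-space cap, matching the documented limit
        binary_path,
    ]
    return subprocess.run(cmd, capture_output=True, timeout=timeout)
```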

**Firejail Setup** (Linux only):
```bash
# Install Firejail
sudo apt-get install firejail  # Debian/Ubuntu
# or
sudo dnf install firejail      # Fedora/RHEL
# or
sudo yum install firejail      # CentOS
# or
sudo pacman -S firejail        # Arch Linux
```

**Interactive Installation Flow**:

When Firejail is not available, the evaluator presents an interactive prompt:
1. **Install Firejail**: Attempts automatic installation via the system package manager
2. **Cancel**: Exit without running evaluation
3. **Proceed without sandbox**: Only after explicit confirmation (UNSAFE)

**Non-Interactive Mode**:

For CI/CD pipelines or automated scripts, use the `--allow-no-sandbox` flag to bypass interactive prompts:
```bash
# In CI, when Firejail is available
evaluate_functional_correctness samples.jsonl --sandbox-mode=firejail

# In CI, when you've verified the environment is secure
evaluate_functional_correctness samples.jsonl --sandbox-mode=none --allow-no-sandbox
```

## Dataset Format

The HumanEval Rust dataset (`data/HumanEval_rust.jsonl`) contains 164 Rust programming problems. Each problem includes:
- `task_id`: Unique identifier (e.g., "HumanEval/0")
- `prompt`: Function signature and docstring
- `canonical_solution`: Reference implementation
- `test`: Rust test cases using `#[cfg(test)]`
- `entry_point`: Function name

Sample format:
```json
{"task_id": "HumanEval/0", "prompt": "fn has_close_elements(...) -> bool{", "canonical_solution": "...", "test": "#[cfg(test)]\nmod tests {...}", "entry_point": "has_close_elements"}
```
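
A sketch of how these fields plausibly combine into a single compilable source file, following the standard HumanEval convention of concatenating prompt, completion, and test; the exact assembly in `rust_execution.py` may differ:

```python
def assemble_program(problem: dict, completion: str) -> str:
    """Concatenate prompt, completion, and tests into one Rust source file."""
    # The prompt ends just after the opening brace of the target function;
    # the completion supplies the body (and closing brace), and the
    # #[cfg(test)] module is appended afterwards.
    return problem["prompt"] + completion + "\n\n" + problem["test"]
```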

### Enhanced Prompt Format (v1.4.4+)

When using the SigilDERG evaluation pipeline (lambda-package), prompts are automatically enhanced with Rust-specific instructions:

- Includes the Rust function signature and doc comment from the problem
- Adds explicit instructions: "Implement only the requested function in Rust"
- Prohibits `fn main`, tests, example code, and unnecessary comments
- Prohibits `...`, `todo!()`, and `unimplemented!()`
- Includes Rust-specific reminders about imports and mutability

This ensures models generate focused, correct Rust code without extra scaffolding.
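
A hedged sketch of what such an enhancement might look like; only the quoted instruction above is confirmed, and the wording of the remaining rules is an assumption:

```python
def enhance_prompt(problem: dict) -> str:
    """Wrap a HumanEval Rust prompt with instructions like those listed above.

    Illustrative only: the pipeline's actual template may phrase these
    rules differently.
    """
    rules = (
        "Implement only the requested function in Rust.\n"
        "Do not write `fn main`, tests, or example code.\n"
        "Do not use `...`, `todo!()`, or `unimplemented!()`.\n"
        "Remember required imports and mutability annotations.\n"
    )
    return rules + "\n" + problem["prompt"]
```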

## Integration with SigilDERG Pipeline

### Complete Workflow

1. **Data Production** → Generate training data with [SigilDERG-Data_Production](https://github.com/Superuser666-Sigil/SigilDERG-Data_Production)
2. **Model Fine-Tuning** → Train on Rust code with [SigilDERG-Finetuner](https://github.com/Superuser666-Sigil/SigilDERG-Finetuner)
3. **Evaluation** → Assess performance with this HumanEval Rust harness
4. **Iteration** → Use results to guide further training and data collection

### Metrics and Benchmarking

This evaluator provides comprehensive metrics for Rust code generation:

**Standard HumanEval Metrics:**
- **pass@k**: Functional correctness at k samples (pass@1, pass@2, pass@10, pass@100); see the estimator sketch below
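
For reference, pass@k is conventionally computed with the unbiased estimator from the Codex paper cited below, where `n` is the number of samples per task and `c` the number that pass; a minimal sketch:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        return 1.0  # every size-k subset contains at least one passing sample
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
```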

**Enhanced Metrics (v1.4.4+):**
- **compile_rate**: Fraction of samples that compile successfully
- **main_free_rate**: Fraction of completions without `fn main()` functions

**Result Schema (v1.4.4+):**
Each evaluation result includes enhanced fields for trust and auditability:
```json
{
  "task_id": "HumanEval/0",
  "completion": "...",
  "compile_ok": true,
  "test_ok": true,
  "error_type": null,
  "stderr": "",
  "main_free": true,
  "passed": true,
  "result": "passed"
}
```
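
A minimal sketch of aggregating these fields into compile_rate and main_free_rate, assuming one JSON object per line in the schema above:

```python
import json

def aggregate_rates(results_path: str) -> dict:
    """Compute compile_rate and main_free_rate over a *_results.jsonl file."""
    with open(results_path) as f:
        records = [json.loads(line) for line in f]
    n = len(records)
    return {
        "compile_rate": sum(r["compile_ok"] for r in records) / n,
        "main_free_rate": sum(r["main_free"] for r in records) / n,
    }

print(aggregate_rates("rust_samples.jsonl_results.jsonl"))
```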

**Error Types:**
- `infra_missing_toolchain`: Infrastructure failure (rustc not available)
- `compile_error`: Code failed to compile
- `runtime_error`: Code compiled but crashed during execution
- `assertion_failure`: Tests failed (code ran but assertions failed)

**Preflight Checks:**
- Validates `rustc` availability before evaluation (fails fast on infrastructure issues)
- Never drops completions silently: all samples are included in the results with an appropriate status

Together, these metrics provide a complete picture of model performance for Rust code generation, with full auditability for Rule Zero compliance.

## Hardware Optimizations (H100 Configuration)

Version 2.0.0+ includes optimizations specifically tuned for high-performance GPU evaluation environments (e.g., one H100 with 26 vCPUs and 225GB RAM):

### Default Configuration
- **Parallel Workers**: 24 (default `--n_workers=24`), sized to saturate 26 vCPUs while reserving 2 for the OS and orchestration
- **Timeout**: 10.0 seconds (default `--timeout=10.0`), increased from 3.0s to absorb compilation latency on loaded systems
- **Firejail Memory Limit**: 4GB per process, enough to compile complex, macro-heavy Rust code

### Resource Usage
With 24 workers and 4GB of memory per process:
- **Maximum Memory Usage**: ~96GB (24 workers × 4GB), well within the 225GB safety margin
- **CPU Utilization**: ~92% (24/26 vCPUs), near saturation for maximum throughput

These defaults are optimized for production evaluation on high-end hardware. For smaller systems, override them with the `--n_workers` and `--timeout` flags.

## Version 2.0.0 Breaking Changes

**Docker Support Removed**: Version 2.0.0 removes Docker-based sandboxing in favor of a Firejail-first architecture:
- Simpler deployment: no Docker daemon required
- Faster startup: no container overhead
- Interactive installation: prompts to install Firejail if missing
- Non-interactive mode: `--allow-no-sandbox` for CI/CD pipelines

**Migration from v1.x**:
- If you were using `--sandbox-mode=docker`, switch to `--sandbox-mode=firejail`
- Install Firejail on your system (see Firejail Setup above)
- For CI/CD, use `--allow-no-sandbox` if running in a secure environment without Firejail

## Version 1.3.2+ Features

**Completion Extraction & Cleaning:**
- Automatically extracts function bodies from model completions
- Removes extra `main()` functions and standalone code
- Strips markdown code blocks (```rust, ```)
- Handles completions with or without function signatures
- Improves evaluation accuracy by ensuring only the target function is tested (see the illustrative sketch below)
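
A minimal illustrative sketch of the fence-stripping and `main()`-removal steps; the package's actual extractor handles more cases (such as recovering function bodies) than this heuristic:

```python
import re

def clean_completion(completion: str) -> str:
    """Strip markdown code fences and drop a trailing fn main() block.

    Illustrative only: the real cleaning logic is more thorough.
    """
    # Remove ```rust ... ``` fence markers, keeping their contents.
    text = re.sub(r"```(?:rust)?\n?", "", completion)
    # Drop everything from a top-level `fn main` onwards (crude heuristic).
    match = re.search(r"^fn main\s*\(", text, flags=re.MULTILINE)
    if match:
        text = text[:match.start()]
    return text.rstrip()
```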

**Robust Validation:**
- Validates `rustc` availability before evaluation (fails fast if unavailable)
- Prevents silent failures across thousands of samples

## Known Issues

While evaluation itself uses very little memory, you might see the following error message when the system is running out of RAM. Since this may cause some correct programs to fail, we recommend freeing some memory and trying again.
```
malloc: can't allocate region
```

## Citation

This evaluation harness is based on the HumanEval benchmark format described in the original Codex paper. Please cite:

```
@article{chen2021codex,
  title={Evaluating Large Language Models Trained on Code},
  author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
  year={2021},
  eprint={2107.03374},
  archivePrefix={arXiv},
  primaryClass={cs.LG}
}
```

## License

MIT License

## Security Model

This release hardens Firejail usage with seccomp, capability dropping, CPU/file/process limits, and read-only mounts to reduce risk when running untrusted Rust code.

## Metrics

The evaluator now reports compile rate, main-free rate, clippy pass rate, average compile time, and binary sizes alongside pass@k.

## Extended Dataset

An extended Rust dataset stub is available at `data/HumanEval_rust_extended.jsonl` and can be regenerated with `scripts/generate_extended_dataset.py`.

## Logging

Use `human_eval.logging_config.setup_logging` to configure structured logging for CLI invocations.
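
A hedged usage sketch: `setup_logging` is the module's confirmed entry point, but its exact signature is not documented here, so the no-argument call below is an assumption; check the module before relying on it.

```python
import logging
from human_eval.logging_config import setup_logging

setup_logging()  # assumed no-arg call; consult the module for the real signature
logging.getLogger("human_eval").info("starting evaluation run")
```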

@@ -0,0 +1,19 @@
data/HumanEval_rust.jsonl,sha256=W5rLnRrlxAnV73UDmxLsw6F_yApl8oCLk75gZhL9f84,230937
data/HumanEval_rust_extended.jsonl,sha256=-Jl9wooTyGd9L-_5wdy3n24rGEPVyulVuO3vFHxGv3M,435
data/example_rust_problem.jsonl,sha256=vEWiLY-6yf9GeB538nARf6cfP5bllVPz0SzCqJbrs2Y,283
data/example_rust_samples.jsonl,sha256=ktoInRr-AWmDfJL3-hLgqfzMXsxI-aAnbjUDHGwiK8I,297
human_eval/__init__.py,sha256=DBcEi5tTN6YMpn-zX6FjwFkVOXubHKmiXEc_vG9bafQ,556
human_eval/data.py,sha256=8fgKfNPcQRSHAly302Fr8OXLkgydQjw_8KYQ2MfxQHI,2487
human_eval/evaluate_functional_correctness.py,sha256=qBkygyCeYir3m_sQBzH0vmHV1H5cxwWIPtb7kfncz9w,3924
human_eval/evaluation.py,sha256=icHwySal4J2_gKIwdS1m_qSspai287iM_8PThGSK1w0,10538
human_eval/execution.py,sha256=WA4lSHOQlM-AKZtFQMp1uQwPIlYZTrRi1G45fdFQg-w,4979
human_eval/logging_config.py,sha256=uLzw-EZW-acU047y79XBQhogtznyVomKQlb9D6PgHmA,1103
human_eval/resource_monitor.py,sha256=TrZ8pguMsoQCB8WTXd7FkuJrGkVQW7KXEqt69HUBNLE,1700
human_eval/rust_execution.py,sha256=ZguC4uSD1a9GKwrtJSiX7Eo8zbUZfJBxhOsRKA1uT9g,26661
human_eval/sandbox.py,sha256=DWgnbiavjK2A3kcYCU7o34gFxiDNIXdgvK8FElUmc7o,19883
human_eval_rust-2.1.0.dist-info/licenses/LICENSE,sha256=vLo94hSFHM5G7Vr0LWaYBEYW7qzoh8MjG8eiBHSrY54,1083
human_eval_rust-2.1.0.dist-info/METADATA,sha256=luD1JTFX_yPonAKktsjZVikdcczMhVAm2HiCM7Fl_ik,21394
human_eval_rust-2.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
human_eval_rust-2.1.0.dist-info/entry_points.txt,sha256=FQxT_vQ9vAXvzf_vvn3k-f4gR2nJQetStJHUDpNt4lA,100
human_eval_rust-2.1.0.dist-info/top_level.txt,sha256=bC3-WWQQ9sm_sM38_Nuww07ZJDiutclr4yCrn8Tlx0k,11
human_eval_rust-2.1.0.dist-info/RECORD,,

@@ -0,0 +1,21 @@
The MIT License

Copyright (c) OpenAI (https://openai.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

@@ -0,0 +1 @@
human_eval