rlenv-audit 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rlenv_audit-0.3.0/.claude-plugin/marketplace.json +11 -0
- rlenv_audit-0.3.0/.claude-plugin/plugin.json +10 -0
- rlenv_audit-0.3.0/.gitignore +32 -0
- rlenv_audit-0.3.0/COMMITS.md +56 -0
- rlenv_audit-0.3.0/DESIGN.md +140 -0
- rlenv_audit-0.3.0/PKG-INFO +150 -0
- rlenv_audit-0.3.0/README.md +130 -0
- rlenv_audit-0.3.0/REWARD_DESIGN.md +115 -0
- rlenv_audit-0.3.0/pyproject.toml +43 -0
- rlenv_audit-0.3.0/rlenv_audit/__init__.py +16 -0
- rlenv_audit-0.3.0/rlenv_audit/_sandbox_runner.py +48 -0
- rlenv_audit-0.3.0/rlenv_audit/adapters/__init__.py +6 -0
- rlenv_audit-0.3.0/rlenv_audit/adapters/verifiers.py +280 -0
- rlenv_audit-0.3.0/rlenv_audit/cli.py +146 -0
- rlenv_audit-0.3.0/rlenv_audit/sandbox.py +208 -0
- rlenv_audit-0.3.0/rlenv_audit/tools.py +304 -0
- rlenv_audit-0.3.0/scripts/survey.py +125 -0
- rlenv_audit-0.3.0/skills/contamination/SKILL.md +47 -0
- rlenv_audit-0.3.0/skills/env-audit/SKILL.md +99 -0
- rlenv_audit-0.3.0/skills/integrity/SKILL.md +48 -0
- rlenv_audit-0.3.0/skills/latency/SKILL.md +42 -0
- rlenv_audit-0.3.0/skills/problem-alignment/SKILL.md +44 -0
- rlenv_audit-0.3.0/skills/reward-design/SKILL.md +58 -0
- rlenv_audit-0.3.0/skills/rollout-quality/SKILL.md +49 -0
- rlenv_audit-0.3.0/tests/conftest.py +17 -0
- rlenv_audit-0.3.0/tests/test_adapter.py +33 -0
- rlenv_audit-0.3.0/tests/test_tools.py +80 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "rlenv-audit",
|
|
3
|
+
"owner": { "name": "Vivek", "email": "vivekvkashyap10@gmail.com" },
|
|
4
|
+
"plugins": [
|
|
5
|
+
{
|
|
6
|
+
"name": "env-audit",
|
|
7
|
+
"source": "./",
|
|
8
|
+
"description": "Audit a verifiers RL environment before training on it: six agent-run checks, one scorecard. Ask: \"Audit primeintellect/gsm8k\"."
|
|
9
|
+
}
|
|
10
|
+
]
|
|
11
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "env-audit",
|
|
3
|
+
"description": "Audit a Prime Intellect / verifiers RL environment before training on it. Six judgment-based checks (integrity, problem-statement alignment, reward design, latency, rollout quality, contamination) run by the agent, producing a scorecard with scores and written justifications.",
|
|
4
|
+
"version": "0.3.0",
|
|
5
|
+
"author": { "name": "Vivek", "email": "vivekvkashyap10@gmail.com" },
|
|
6
|
+
"homepage": "https://github.com/vivekvkashyap/RLEnv_audit",
|
|
7
|
+
"repository": "https://github.com/vivekvkashyap/RLEnv_audit",
|
|
8
|
+
"license": "MIT",
|
|
9
|
+
"keywords": ["rl", "reinforcement-learning", "verifiers", "prime-intellect", "audit"]
|
|
10
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
|
|
9
|
+
# Virtual envs
|
|
10
|
+
.venv/
|
|
11
|
+
venv/
|
|
12
|
+
env/
|
|
13
|
+
|
|
14
|
+
# Tooling / caches
|
|
15
|
+
.pytest_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
.mypy_cache/
|
|
18
|
+
|
|
19
|
+
# RLEnv_audit local cache (downloaded eval sets for the contamination check)
|
|
20
|
+
.rlenv_audit_cache/
|
|
21
|
+
|
|
22
|
+
# Reports written during runs
|
|
23
|
+
report.json
|
|
24
|
+
*.report.json
|
|
25
|
+
|
|
26
|
+
# OS
|
|
27
|
+
.DS_Store
|
|
28
|
+
|
|
29
|
+
# Survey run artifacts
|
|
30
|
+
survey_reports/
|
|
31
|
+
survey.json
|
|
32
|
+
.venv311/
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# COMMITS — build plan for RLEnv_audit
|
|
2
|
+
|
|
3
|
+
Built commit-by-commit; each commit builds and is self-contained. Tick a box when
|
|
4
|
+
the commit lands. Milestone is commit 4: the moment `rlenv-audit run gsm8k --only
|
|
5
|
+
determinism` prints `determinism PASS` on a real Hub env, the architecture is
|
|
6
|
+
proven.
|
|
7
|
+
|
|
8
|
+
- [x] **1. chore: scaffold** — `pyproject.toml` (verifiers==0.1.14, click, rich,
|
|
9
|
+
docker), `.gitignore`, README stub, empty package dirs, `DESIGN.md`,
|
|
10
|
+
`COMMITS.md`. Dedicated venv (`uv`, py3.10); `verifiers==0.1.14` + editable
|
|
11
|
+
install; `vf-install gsm8k -r` to get the reference env.
|
|
12
|
+
- [x] **2. feat(base+report): data model** — `checks/base.py` (`CheckStatus`,
|
|
13
|
+
`CheckResult`), `report.py` (`Scorecard.to_terminal` / `to_json`).
|
|
14
|
+
- [x] **3. feat(adapter): verifiers EnvHandle** — load env; normalize
|
|
15
|
+
RubricGroup-aware rubric / parser / dataset; synchronous `score()` over the
|
|
16
|
+
async path; `teardown()`. Graceful load failures.
|
|
17
|
+
- [x] **4. feat(determinism) + CLI** — determinism check + `audit()` orchestrator
|
|
18
|
+
+ `cli.py` (`run`, `list-checks`). **Milestone proven on gsm8k.**
|
|
19
|
+
- [x] **5. feat(sandbox+exploits)** — `sandbox.py` Docker isolation + exploits
|
|
20
|
+
check running cheat patterns inside it; SKIP cleanly if Docker is down.
|
|
21
|
+
- [x] **6. feat(parser)** — parser-robustness check.
|
|
22
|
+
- [x] **7. feat(contamination)** — n-gram overlap vs cached eval sets.
|
|
23
|
+
- [x] **8. feat(latency)** — timing check.
|
|
24
|
+
- [x] **9. feat(distribution)** — GPU/vLLM check; SKIP-degrading when absent.
|
|
25
|
+
- [x] **10. docs+tests** — README, sample scorecard, what each check means; real
|
|
26
|
+
tests in `tests/`.
|
|
27
|
+
- [x] **11. push** — `git push -u origin main` after confirming the remote +
|
|
28
|
+
stored credentials work.
|
|
29
|
+
|
|
30
|
+
## v0.2 — the skill-based rewrite
|
|
31
|
+
|
|
32
|
+
Everything above built the v0.1 script battery. v0.2 scrapped the deterministic
|
|
33
|
+
checks: the audits are judgment-heavy, so each check became a **skill file**
|
|
34
|
+
(`skills/<check>/SKILL.md`) executed by an agent, leaning on a thin deterministic
|
|
35
|
+
tool layer (`rlenv-audit inspect / score / rollouts / scorecard`). See
|
|
36
|
+
`DESIGN.md` for the current architecture.
|
|
37
|
+
|
|
38
|
+
- [x] **12. feat!: skill-based audit** — delete `checks/`, `core.py`,
|
|
39
|
+
`report.py`, `skills.py`; add `tools.py` (inspect / score / rollouts /
|
|
40
|
+
scorecard), rewrite `cli.py` around the four tools, add the six check skills +
|
|
41
|
+
the `env-audit` orchestrator, rewrite README/DESIGN, port `scripts/survey.py`
|
|
42
|
+
to the inspect tool, new `tests/test_tools.py`.
|
|
43
|
+
- [x] **13. feat(distribution): one-command install** — `.claude-plugin/`
|
|
44
|
+
manifests (repo doubles as a Claude Code plugin marketplace), skills bundled
|
|
45
|
+
into the wheel, `rlenv-audit install-skills`, self-bootstrapping setup step in
|
|
46
|
+
the orchestrator skill (pip-installs the tools + `vf-install`s the env).
|
|
47
|
+
|
|
48
|
+
## Guardrails (apply throughout)
|
|
49
|
+
|
|
50
|
+
- Library-first: every check callable programmatically; CLI is a thin shell.
|
|
51
|
+
- Keep `verifiers` pinned to `==0.1.14`; never pull `verifiers-rl`/torch/vLLM as
|
|
52
|
+
hard deps.
|
|
53
|
+
- Fail gracefully: env won't load / no vLLM / Docker down → clear SKIP or error,
|
|
54
|
+
never a traceback dump.
|
|
55
|
+
- No plugin system, no config framework, no multi-format support. Six checks,
|
|
56
|
+
one format, that's v0.
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# DESIGN — env_audit
|
|
2
|
+
|
|
3
|
+
> A skill-based auditing system for RL environments. An agent (Claude Code /
|
|
4
|
+
> Codex) runs six judgment-based checks over a `verifiers` environment and emits
|
|
5
|
+
> a scorecard with scores + written justifications.
|
|
6
|
+
|
|
7
|
+
## 1. Why this exists
|
|
8
|
+
|
|
9
|
+
RL post-training environments are treated like training data, but nobody tests
|
|
10
|
+
them before burning GPU hours. A broken reward doesn't crash — it silently
|
|
11
|
+
teaches the policy garbage: non-deterministic rewards (noisy gradient),
|
|
12
|
+
exploitable rewards (the policy cheats), rewards that don't discriminate (no
|
|
13
|
+
signal), brittle parsers (correct answers scored wrong), contaminated datasets
|
|
14
|
+
(memorized eval). env_audit catches these first.
|
|
15
|
+
|
|
16
|
+
## 2. The core decision: skills, not scripts
|
|
17
|
+
|
|
18
|
+
The six checks are **judgment-heavy and non-deterministic** — "does this reward
|
|
19
|
+
agree with a competent grader?", "is the system prompt missing something?", "does
|
|
20
|
+
this dataset overlap a benchmark?". A deterministic script can only approximate
|
|
21
|
+
these with brittle heuristics. An agent does them well.
|
|
22
|
+
|
|
23
|
+
So each check is a **skill file** (`skills/<check>/SKILL.md`, SKILL.md style with
|
|
24
|
+
`name` + `description` frontmatter) that an agent reads and executes with its own
|
|
25
|
+
reasoning. The agent leans on a thin **tool layer** (`rlenv-audit ...`) only for
|
|
26
|
+
the parts that must be exact and reproducible:
|
|
27
|
+
|
|
28
|
+
- **load + introspect** the environment,
|
|
29
|
+
- **score** agent-written completions through the real reward function,
|
|
30
|
+
- run + cache a **shared set of rollouts**,
|
|
31
|
+
- **render** the scorecard.
|
|
32
|
+
|
|
33
|
+
Tools are pure JSON-in / JSON-out so a skill can shell out and read the result.
|
|
34
|
+
This split keeps judgment in the agent and determinism in the code.
|
|
35
|
+
|
|
36
|
+
## 3. Target format: `verifiers==0.1.14`
|
|
37
|
+
|
|
38
|
+
We build against the `verifiers` library (the Hub standard), **pinned to 0.1.14**
|
|
39
|
+
(newer versions drag in torch/vLLM; the target box has old CUDA). The adapter was
|
|
40
|
+
written against the actually-installed source. Facts that shaped it:
|
|
41
|
+
|
|
42
|
+
| Concern | Reality in 0.1.14 |
|
|
43
|
+
| --- | --- |
|
|
44
|
+
| Loading | `verifiers.load_environment(env_id)` is **synchronous**; it imports a module named `env_id.replace("-","_").split("/")[-1]` and calls its `load_environment()`. The env must be pip-installed (`vf-install`). |
|
|
45
|
+
| Rubric | Often a **`RubricGroup`** whose own `.funcs` is empty — real reward funcs surface via `rubric._get_reward_func_names()`. |
|
|
46
|
+
| Scoring | **Async**, mutates a `state` dict in place. Branch on `rubric.has_group_rewards` (`score_group` vs `score_rollout`). |
|
|
47
|
+
| Reward funcs | May own a `ProcessPoolExecutor` (e.g. `MathRubric`) → must `teardown()`. |
|
|
48
|
+
| Parser | `parser.parse_answer(messages) -> str \| None`. |
|
|
49
|
+
| Dataset | HF `Dataset`; rows carry `prompt` (chat messages), `answer`, plus env columns. Many Hub envs are eval-only. |
|
|
50
|
+
|
|
51
|
+
## 4. The adapter — `EnvHandle` (`adapters/verifiers.py`)
|
|
52
|
+
|
|
53
|
+
The only code that touches `verifiers`. It normalizes a loaded env into a stable,
|
|
54
|
+
synchronous handle the tools use:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
EnvHandle:
|
|
58
|
+
load_handle(env_id) -> EnvHandle # clean EnvLoadError on failure
|
|
59
|
+
reward_func_names() / reward_sources() # names + getsource of reward fns (RubricGroup-aware)
|
|
60
|
+
system_prompt() / module_file() # env framing + source file
|
|
61
|
+
dataset(n) / dataset_size() # normalized rows, train↔eval fallback
|
|
62
|
+
score(text, prompt, answer, columns) -> (reward, metrics) # sync over async scoring
|
|
63
|
+
canonical_answer(answer) / teardown()
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
`score()` builds a `state` dict, runs the rubric (group-aware), and returns the
|
|
67
|
+
reward and metrics — uniform across `Rubric`, `MathRubric`, and `RubricGroup`. All dataset
|
|
68
|
+
columns are threaded through so reward funcs that read custom fields work.
|
|
69
|
+
|
|
70
|
+
## 5. The tool layer (`tools.py`, `cli.py`)
|
|
71
|
+
|
|
72
|
+
Four commands, each JSON-in/JSON-out:
|
|
73
|
+
|
|
74
|
+
- `rlenv-audit inspect <env> -n 20` → `{loaded, env_type, parser_type, module_file,
|
|
75
|
+
dataset_size, system_prompt, reward_funcs:[{name,weight,source}], sample:[...]}`.
|
|
76
|
+
Load failures are captured as `{loaded: false, error}` so the integrity check
|
|
77
|
+
sees them as data. Used by checks 1, 2, 3, 6.
|
|
78
|
+
- `rlenv-audit score <env> completions.json` → scores agent-written
|
|
79
|
+
`[{prompt_index, label, text}]` through the reward function. Used by check 3.
|
|
80
|
+
- `rlenv-audit rollouts <env> --endpoint <url> --model <m> -n 20 -k 8` (or `--dummy`) →
|
|
81
|
+
generates 8 rollouts over ~20 tasks **once**, scores + times them, caches to
|
|
82
|
+
JSON. Checks 4 and 5 share this single cache.
|
|
83
|
+
- `rlenv-audit scorecard results.json` → computes the overall grade + rating
|
|
84
|
+
(average of the checks that ran; N/A excluded) and renders the table.
|
|
85
|
+
|
|
86
|
+
## 6. The six checks (`skills/`)
|
|
87
|
+
|
|
88
|
+
1. **integrity** — does it run and is it shaped right (dataset, reward, conventions,
|
|
89
|
+
imports). No endpoint.
|
|
90
|
+
2. **problem-alignment** (conditional) — given the user's problem statement, does
|
|
91
|
+
the env actually test it. **N/A** without a problem statement. No endpoint.
|
|
92
|
+
3. **reward-design** — agent writes ~20 synthetic completions (correct / wrong /
|
|
93
|
+
edge / format perturbations), scores them, and checks (a) variance &
|
|
94
|
+
discrimination and (b) agreement between the reward and the agent's own quality
|
|
95
|
+
judgment. No endpoint.
|
|
96
|
+
4. **latency** — end-to-end rollout timing from the shared cache. Needs an endpoint.
|
|
97
|
+
5. **rollout-quality** — reads actual rollouts and judges the env setup (system
|
|
98
|
+
prompt, output sensibility, env-caused failure modes). Needs an endpoint.
|
|
99
|
+
6. **contamination** — infer domain → pick benchmarks → check dataset overlap. No
|
|
100
|
+
endpoint.
|
|
101
|
+
|
|
102
|
+
The **env-audit** orchestrator skill gathers inputs (env id, optional problem
|
|
103
|
+
statement, optional endpoint), runs the no-endpoint checks, generates the shared
|
|
104
|
+
rollouts once if an endpoint is given, runs the endpoint checks from that cache,
|
|
105
|
+
and assembles the scorecard.
|
|
106
|
+
|
|
107
|
+
## 7. Scoring model
|
|
108
|
+
|
|
109
|
+
Each check returns `{name, status, score (0–100|null), justification}`.
|
|
110
|
+
|
|
111
|
+
- **status**: PASS (~75–100) / WARN (~40–74) / FAIL (~0–39) / **N/A** (documented
|
|
112
|
+
skip: no problem statement, no endpoint).
|
|
113
|
+
- **rating**: the mean of the numeric scores over checks that actually ran (N/A
|
|
114
|
+
excluded), mapped to an A–F letter.
|
|
115
|
+
- **grade**: the worst status among the checks that ran, with N/A ignored (any FAIL → FAIL; otherwise any WARN → WARN).
|
|
116
|
+
|
|
117
|
+
Every score must be grounded in observed evidence — tool output, completions the
|
|
118
|
+
agent wrote, rollouts it read — never a vibe. `REWARD_DESIGN.md` is the rubric the
|
|
119
|
+
reward-design and rollout-quality checks judge against.
|
|
120
|
+
|
|
121
|
+
## 8. Distribution
|
|
122
|
+
|
|
123
|
+
One repo, published two ways, so a user never clones it:
|
|
124
|
+
|
|
125
|
+
- **Claude Code plugin** — `.claude-plugin/marketplace.json` makes the repo a
|
|
126
|
+
one-plugin marketplace; `/plugin install env-audit@rlenv-audit` ships the
|
|
127
|
+
skill files straight from GitHub.
|
|
128
|
+
- **PyPI wheel** — the wheel force-includes `skills/` as package data;
|
|
129
|
+
`rlenv-audit install-skills` (or `uvx rlenv-audit install-skills`) copies them
|
|
130
|
+
into `~/.claude/skills/`.
|
|
131
|
+
|
|
132
|
+
The skills carry no other setup burden: the orchestrator's bootstrap step has
|
|
133
|
+
the agent pip-install the tools and `vf-install` the target environment itself
|
|
134
|
+
on first run. The user types one install command once, then just "audit <env>".
|
|
135
|
+
|
|
136
|
+
## 9. Honest scope
|
|
137
|
+
|
|
138
|
+
Six checks, one format (`verifiers`), agent-driven. Determinism lives in the
|
|
139
|
+
tools; judgment lives in the skills. No plugin system, no config framework — the
|
|
140
|
+
`adapters/` + `skills/` seams make extension possible without building it now.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rlenv-audit
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: env_audit — a skill-based auditing system for Prime Intellect `verifiers` RL environments
|
|
5
|
+
Project-URL: Repository, https://github.com/vivekvkashyap/RLEnv_audit
|
|
6
|
+
Author-email: Vivek <vivekvkashyap10@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: audit,prime-intellect,reinforcement-learning,reward-hacking,rl,skills,verifiers
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: click>=8.1
|
|
11
|
+
Requires-Dist: docker>=7.0
|
|
12
|
+
Requires-Dist: openai>=1.0
|
|
13
|
+
Requires-Dist: rich>=13.0
|
|
14
|
+
Requires-Dist: verifiers==0.1.14
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
17
|
+
Provides-Extra: gpu
|
|
18
|
+
Requires-Dist: vllm; extra == 'gpu'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# env_audit
|
|
22
|
+
|
|
23
|
+
**A skill-based auditing system for RL environments.** Point an agent (Claude
|
|
24
|
+
Code / Codex) at a [`verifiers`](https://github.com/PrimeIntellect-ai/verifiers)
|
|
25
|
+
environment from the Prime Intellect Hub and it runs **six checks** and produces
|
|
26
|
+
a scorecard — *before* you spend GPU hours training on a broken reward.
|
|
27
|
+
|
|
28
|
+
RL environments are treated like training data, but nobody tests them first. A
|
|
29
|
+
broken reward function doesn't crash — it silently teaches the policy garbage.
|
|
30
|
+
env_audit catches that.
|
|
31
|
+
|
|
32
|
+
## Why skills, not scripts
|
|
33
|
+
|
|
34
|
+
The six checks are **judgment-heavy, non-deterministic evaluations** — "does this
|
|
35
|
+
reward agree with a competent grader?", "is the system prompt missing something?",
|
|
36
|
+
"does this dataset overlap a benchmark?". Those are done well by an *agent*, not a
|
|
37
|
+
hard-coded script. So each check is a **skill file** (`skills/<check>/SKILL.md`)
|
|
38
|
+
that the agent reads and executes with its own reasoning, leaning on a small layer
|
|
39
|
+
of deterministic **tools** (`rlenv-audit ...`) for the exact parts: loading the
|
|
40
|
+
env, calling the reward function, running rollouts, rendering the scorecard.
|
|
41
|
+
|
|
42
|
+
Each check returns a **score (0–100), a status, and a written justification**.
|
|
43
|
+
|
|
44
|
+
## The six checks
|
|
45
|
+
|
|
46
|
+
| # | Check | Needs | What it does |
|
|
47
|
+
|---|-------|-------|--------------|
|
|
48
|
+
| 1 | **integrity** | — | Does it even run and is it shaped right: dataset loads & is well-formed, reward present & callable, follows verifiers conventions, no missing fields / broken imports. |
|
|
49
|
+
| 2 | **problem-statement alignment** | *(a problem statement)* | Given what the user says the env is *for*, judge whether the dataset + reward + prompt actually test that. **N/A** if no problem statement is provided. |
|
|
50
|
+
| 3 | **reward design** | — | Stress-tests the reward without the policy: the agent writes ~20 synthetic completions (correct / wrong / edge / format perturbations), scores them through the real reward, and checks (a) the reward varies & discriminates sensibly and (b) each reward matches the agent's own judgment of quality. |
|
|
51
|
+
| 4 | **latency** | model endpoint | How long rollouts take end to end. Reads the shared cached rollouts. |
|
|
52
|
+
| 5 | **rollout quality** | model endpoint | Reads actual rollouts and judges whether the env is set up well in practice — system prompt right, outputs sensible, obvious env-caused failure modes. |
|
|
53
|
+
| 6 | **contamination** | — | Infers the domain, picks the public benchmarks for it, and checks whether dataset instances match/near-match benchmark instances. |
|
|
54
|
+
|
|
55
|
+
**Shared rollouts (checks 4 & 5).** Both need a model, so env_audit asks once
|
|
56
|
+
which endpoint/model to use (or "dummy"), runs rollouts **once** (8 rollouts over
|
|
57
|
+
~20 samples, scored + timed, cached), and both checks read that single cache.
|
|
58
|
+
Checks 1, 2, 3, 6 need no endpoint. No endpoint → 4 & 5 are **N/A**.
|
|
59
|
+
|
|
60
|
+
## Quickstart
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Install the skills (pick one)
|
|
64
|
+
uvx --from git+https://github.com/vivekvkashyap/RLEnv_audit.git rlenv-audit install-skills
|
|
65
|
+
pip install git+https://github.com/vivekvkashyap/RLEnv_audit.git && rlenv-audit install-skills
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Or as a Claude Code plugin, no terminal needed:
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
/plugin marketplace add vivekvkashyap/RLEnv_audit
|
|
72
|
+
/plugin install env-audit@rlenv-audit
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Then point your agent (Claude Code / Codex) at an environment:
|
|
76
|
+
|
|
77
|
+
> "Audit the `gsm8k` environment." / "Audit `primeintellect/aime2024`
|
|
78
|
+
> — I'm trying to train a competition-math solver — using my vLLM at
|
|
79
|
+
> `http://localhost:8000/v1`."
|
|
80
|
+
|
|
81
|
+
That's it — everything else is self-bootstrapping: on the first audit the skill
|
|
82
|
+
installs the `rlenv-audit` tools (if missing) and `vf-install`s the environment
|
|
83
|
+
itself. The agent runs the six checks and prints the scorecard:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
env_audit · gsm8k
|
|
87
|
+
┃ check ┃ status ┃ score ┃ justification ┃
|
|
88
|
+
│ integrity │ PASS │ 95 │ loads, reward callable, well-formed │
|
|
89
|
+
│ problem_alignment │ N/A │ — │ no problem statement provided │
|
|
90
|
+
│ reward_design │ PASS │ 88 │ discriminates; matches judgment 18/20 │
|
|
91
|
+
│ latency │ N/A │ — │ no endpoint │
|
|
92
|
+
│ rollout_quality │ N/A │ — │ no endpoint │
|
|
93
|
+
│ contamination │ WARN │ 60 │ 3 near-matches with GSM8K test │
|
|
94
|
+
overall: WARN rating: B (81/100)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### From a checkout (development)
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install -e . # the rlenv-audit / env-audit tools
|
|
101
|
+
rlenv-audit install-skills # copy skills/ into ~/.claude/skills
|
|
102
|
+
vf-install primeintellect/gsm8k # install an environment to audit by hand
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
> Most Hub envs require **Python 3.11+**; `verifiers==0.1.14` (pinned) also runs
|
|
106
|
+
> on 3.10 for old-CUDA boxes, where you can install the older example envs. The
|
|
107
|
+
> env must be installed into the **same Python environment** as `rlenv-audit` —
|
|
108
|
+
> verifiers loads environments by importing them.
|
|
109
|
+
|
|
110
|
+
### The tools (what the skills call)
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
rlenv-audit inspect <env> -n 20 # load + introspect -> JSON (reward source, samples, prompt)
|
|
114
|
+
rlenv-audit score <env> completions.json # score agent-written completions through the reward fn
|
|
115
|
+
rlenv-audit rollouts <env> --endpoint <url> --model <m> -n 20 -k 8 # run+cache shared rollouts
|
|
116
|
+
rlenv-audit rollouts <env> --dummy # fake rollouts, no endpoint (dry run)
|
|
117
|
+
rlenv-audit scorecard results.json # render the final scorecard
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
These are deterministic and JSON-in/JSON-out — usable directly, but normally
|
|
121
|
+
driven by the skills.
|
|
122
|
+
|
|
123
|
+
## What good looks like
|
|
124
|
+
|
|
125
|
+
[`REWARD_DESIGN.md`](REWARD_DESIGN.md) is the reference the reward-design and
|
|
126
|
+
rollout-quality checks judge against — determinism, discrimination, baseline
|
|
127
|
+
floor, partial credit, bounds, anti-hacking, parser contract, contamination.
|
|
128
|
+
|
|
129
|
+
## Layout
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
skills/ the six checks + the env-audit orchestrator (SKILL.md each)
|
|
133
|
+
.claude-plugin/ plugin + marketplace manifests (repo doubles as a Claude Code plugin)
|
|
134
|
+
rlenv_audit/
|
|
135
|
+
adapters/verifiers.py EnvHandle — the only code that touches verifiers
|
|
136
|
+
tools.py inspect / score / rollouts / scorecard
|
|
137
|
+
sandbox.py Docker isolation (for executing risky completions)
|
|
138
|
+
cli.py the rlenv-audit / env-audit CLI (+ install-skills)
|
|
139
|
+
REWARD_DESIGN.md the design guide the judgment checks cite
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Development
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install -e ".[dev]" && pytest tests/
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
MIT
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# env_audit
|
|
2
|
+
|
|
3
|
+
**A skill-based auditing system for RL environments.** Point an agent (Claude
|
|
4
|
+
Code / Codex) at a [`verifiers`](https://github.com/PrimeIntellect-ai/verifiers)
|
|
5
|
+
environment from the Prime Intellect Hub and it runs **six checks** and produces
|
|
6
|
+
a scorecard — *before* you spend GPU hours training on a broken reward.
|
|
7
|
+
|
|
8
|
+
RL environments are treated like training data, but nobody tests them first. A
|
|
9
|
+
broken reward function doesn't crash — it silently teaches the policy garbage.
|
|
10
|
+
env_audit catches that.
|
|
11
|
+
|
|
12
|
+
## Why skills, not scripts
|
|
13
|
+
|
|
14
|
+
The six checks are **judgment-heavy, non-deterministic evaluations** — "does this
|
|
15
|
+
reward agree with a competent grader?", "is the system prompt missing something?",
|
|
16
|
+
"does this dataset overlap a benchmark?". Those are done well by an *agent*, not a
|
|
17
|
+
hard-coded script. So each check is a **skill file** (`skills/<check>/SKILL.md`)
|
|
18
|
+
that the agent reads and executes with its own reasoning, leaning on a small layer
|
|
19
|
+
of deterministic **tools** (`rlenv-audit ...`) for the exact parts: loading the
|
|
20
|
+
env, calling the reward function, running rollouts, rendering the scorecard.
|
|
21
|
+
|
|
22
|
+
Each check returns a **score (0–100), a status, and a written justification**.
|
|
23
|
+
|
|
24
|
+
## The six checks
|
|
25
|
+
|
|
26
|
+
| # | Check | Needs | What it does |
|
|
27
|
+
|---|-------|-------|--------------|
|
|
28
|
+
| 1 | **integrity** | — | Does it even run and is it shaped right: dataset loads & is well-formed, reward present & callable, follows verifiers conventions, no missing fields / broken imports. |
|
|
29
|
+
| 2 | **problem-statement alignment** | *(a problem statement)* | Given what the user says the env is *for*, judge whether the dataset + reward + prompt actually test that. **N/A** if no problem statement is provided. |
|
|
30
|
+
| 3 | **reward design** | — | Stress-tests the reward without the policy: the agent writes ~20 synthetic completions (correct / wrong / edge / format perturbations), scores them through the real reward, and checks (a) the reward varies & discriminates sensibly and (b) each reward matches the agent's own judgment of quality. |
|
|
31
|
+
| 4 | **latency** | model endpoint | How long rollouts take end to end. Reads the shared cached rollouts. |
|
|
32
|
+
| 5 | **rollout quality** | model endpoint | Reads actual rollouts and judges whether the env is set up well in practice — system prompt right, outputs sensible, obvious env-caused failure modes. |
|
|
33
|
+
| 6 | **contamination** | — | Infers the domain, picks the public benchmarks for it, and checks whether dataset instances match/near-match benchmark instances. |
|
|
34
|
+
|
|
35
|
+
**Shared rollouts (checks 4 & 5).** Both need a model, so env_audit asks once
|
|
36
|
+
which endpoint/model to use (or "dummy"), runs rollouts **once** (8 rollouts over
|
|
37
|
+
~20 samples, scored + timed, cached), and both checks read that single cache.
|
|
38
|
+
Checks 1, 2, 3, 6 need no endpoint. No endpoint → 4 & 5 are **N/A**.
|
|
39
|
+
|
|
40
|
+
## Quickstart
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Install the skills (pick one)
|
|
44
|
+
uvx --from git+https://github.com/vivekvkashyap/RLEnv_audit.git rlenv-audit install-skills
|
|
45
|
+
pip install git+https://github.com/vivekvkashyap/RLEnv_audit.git && rlenv-audit install-skills
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or as a Claude Code plugin, no terminal needed:
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
/plugin marketplace add vivekvkashyap/RLEnv_audit
|
|
52
|
+
/plugin install env-audit@rlenv-audit
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Then point your agent (Claude Code / Codex) at an environment:
|
|
56
|
+
|
|
57
|
+
> "Audit the `gsm8k` environment." / "Audit `primeintellect/aime2024`
|
|
58
|
+
> — I'm trying to train a competition-math solver — using my vLLM at
|
|
59
|
+
> `http://localhost:8000/v1`."
|
|
60
|
+
|
|
61
|
+
That's it — everything else is self-bootstrapping: on the first audit the skill
|
|
62
|
+
installs the `rlenv-audit` tools (if missing) and `vf-install`s the environment
|
|
63
|
+
itself. The agent runs the six checks and prints the scorecard:
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
env_audit · gsm8k
|
|
67
|
+
┃ check ┃ status ┃ score ┃ justification ┃
|
|
68
|
+
│ integrity │ PASS │ 95 │ loads, reward callable, well-formed │
|
|
69
|
+
│ problem_alignment │ N/A │ — │ no problem statement provided │
|
|
70
|
+
│ reward_design │ PASS │ 88 │ discriminates; matches judgment 18/20 │
|
|
71
|
+
│ latency │ N/A │ — │ no endpoint │
|
|
72
|
+
│ rollout_quality │ N/A │ — │ no endpoint │
|
|
73
|
+
│ contamination │ WARN │ 60 │ 3 near-matches with GSM8K test │
|
|
74
|
+
overall: WARN rating: B (81/100)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### From a checkout (development)
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install -e . # the rlenv-audit / env-audit tools
|
|
81
|
+
rlenv-audit install-skills # copy skills/ into ~/.claude/skills
|
|
82
|
+
vf-install primeintellect/gsm8k # install an environment to audit by hand
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
> Most Hub envs require **Python 3.11+**; `verifiers==0.1.14` (pinned) also runs
|
|
86
|
+
> on 3.10 for old-CUDA boxes, where you can install the older example envs. The
|
|
87
|
+
> env must be installed into the **same Python environment** as `rlenv-audit` —
|
|
88
|
+
> verifiers loads environments by importing them.
|
|
89
|
+
|
|
90
|
+
### The tools (what the skills call)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
rlenv-audit inspect <env> -n 20 # load + introspect -> JSON (reward source, samples, prompt)
|
|
94
|
+
rlenv-audit score <env> completions.json # score agent-written completions through the reward fn
|
|
95
|
+
rlenv-audit rollouts <env> --endpoint <url> --model <m> -n 20 -k 8 # run+cache shared rollouts
|
|
96
|
+
rlenv-audit rollouts <env> --dummy # fake rollouts, no endpoint (dry run)
|
|
97
|
+
rlenv-audit scorecard results.json # render the final scorecard
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
These are deterministic and JSON-in/JSON-out — usable directly, but normally
|
|
101
|
+
driven by the skills.
|
|
102
|
+
|
|
103
|
+
## What good looks like
|
|
104
|
+
|
|
105
|
+
[`REWARD_DESIGN.md`](REWARD_DESIGN.md) is the reference the reward-design and
|
|
106
|
+
rollout-quality checks judge against — determinism, discrimination, baseline
|
|
107
|
+
floor, partial credit, bounds, anti-hacking, parser contract, contamination.
|
|
108
|
+
|
|
109
|
+
## Layout
|
|
110
|
+
|
|
111
|
+
```
|
|
112
|
+
skills/ the six checks + the env-audit orchestrator (SKILL.md each)
|
|
113
|
+
.claude-plugin/ plugin + marketplace manifests (repo doubles as a Claude Code plugin)
|
|
114
|
+
rlenv_audit/
|
|
115
|
+
adapters/verifiers.py EnvHandle — the only code that touches verifiers
|
|
116
|
+
tools.py inspect / score / rollouts / scorecard
|
|
117
|
+
sandbox.py Docker isolation (for executing risky completions)
|
|
118
|
+
cli.py the rlenv-audit / env-audit CLI (+ install-skills)
|
|
119
|
+
REWARD_DESIGN.md the design guide the judgment checks cite
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Development
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
pip install -e ".[dev]" && pytest tests/
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## License
|
|
129
|
+
|
|
130
|
+
MIT
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# REWARD_DESIGN.md — how to design a good `verifiers` RL environment
|
|
2
|
+
|
|
3
|
+
This is the reference the `rlenv-audit` checks point at. Every recommendation in
|
|
4
|
+
a scorecard cites a section here. It's a checklist for building an environment
|
|
5
|
+
whose reward is actually a good training signal — written so you can fix a flagged
|
|
6
|
+
issue without guessing what "good" means.
|
|
7
|
+
|
|
8
|
+
The one-line principle: **the reward must go up if and only if the policy gets
|
|
9
|
+
better at the task.** Everything below is a way that principle breaks.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## §determinism — the same completion must always score the same
|
|
14
|
+
|
|
15
|
+
Score a fixed completion repeatedly; the reward must not move. Sources of
|
|
16
|
+
non-determinism to remove:
|
|
17
|
+
|
|
18
|
+
- unseeded RNG anywhere in scoring (`random`, `numpy`, sampling a judge);
|
|
19
|
+
- wall-clock or timeout-dependent scoring (a slow machine scores differently);
|
|
20
|
+
- network/API calls whose result varies (rate limits, model drift);
|
|
21
|
+
- LLM judges with `temperature > 0` — pin to `0` and ideally cache.
|
|
22
|
+
|
|
23
|
+
A reward that varies across identical re-scores injects pure noise into the
|
|
24
|
+
gradient: the policy chases randomness, not skill.
|
|
25
|
+
|
|
26
|
+
## §discrimination — correct must out-score garbage
|
|
27
|
+
|
|
28
|
+
A correct answer must score strictly higher than an empty or nonsense
|
|
29
|
+
completion. If it doesn't, the gradient points nowhere. The usual culprit is the
|
|
30
|
+
**parser/matcher rejecting the dataset's own gold answers** — if the env can't
|
|
31
|
+
reward its own answer key, no policy can learn from it. Test: run each row's gold
|
|
32
|
+
`answer` through the verifier and confirm it earns the max reward.
|
|
33
|
+
|
|
34
|
+
## §baseline-floor — don't pay everyone
|
|
35
|
+
|
|
36
|
+
If every response (even garbage) earns a constant positive reward — a
|
|
37
|
+
participation, length, or format bonus applied unconditionally — the *relative*
|
|
38
|
+
advantage of solving the task shrinks. Prefer a zero floor. If you keep a format
|
|
39
|
+
reward, weight it near zero and gate it on the answer being present, not on mere
|
|
40
|
+
formatting.
|
|
41
|
+
|
|
42
|
+
## §partial-credit — graded beats binary on hard tasks
|
|
43
|
+
|
|
44
|
+
Strict 0-or-1 reward is correct but sparse: on tasks the policy almost never
|
|
45
|
+
solves, the gradient is almost always zero. Where the task allows, give graded
|
|
46
|
+
partial credit (fraction of unit tests passing, edit-distance/LCS similarity,
|
|
47
|
+
sub-goal completion) so there's signal at the frontier.
|
|
48
|
+
|
|
49
|
+
## §bounds — keep the aggregate in [0, 1]
|
|
50
|
+
|
|
51
|
+
Keep the aggregate (weighted-sum) reward in a known, bounded range (normally `[0, 1]`). Unbounded or
|
|
52
|
+
wildly-scaled rewards complicate advantage normalization and make environments
|
|
53
|
+
incomparable. If you must use another range, document it.
|
|
54
|
+
|
|
55
|
+
## §weights — sanity-check the weighted sum
|
|
56
|
+
|
|
57
|
+
- At least one reward function must have a non-zero weight (all-zero weights →
|
|
58
|
+
reward is 0 by construction → no learning).
|
|
59
|
+
- Negative-weight penalties are easy to get backwards; make sure a penalty can't
|
|
60
|
+
dominate the positive signal and reward the policy for doing nothing.
|
|
61
|
+
|
|
62
|
+
## §anti-hacking — assume the policy will cheat
|
|
63
|
+
|
|
64
|
+
RL policies find every shortcut. A robust verifier:
|
|
65
|
+
|
|
66
|
+
- never trusts an exit code alone (a `sys.exit(0)` before tests must not pass);
|
|
67
|
+
- keeps expected outputs / test files **out of** the execution working directory
|
|
68
|
+
(so the solution can't read the answer off disk);
|
|
69
|
+
- re-asserts results *after* the submission runs (so monkeypatching assertion helpers,
|
|
70
|
+
overriding builtins, or printing the answer without computing it fails);
|
|
71
|
+
- rejects empty / no-op submissions.
|
|
72
|
+
|
|
73
|
+
If a no-solution completion earns reward, the policy will learn that shortcut
|
|
74
|
+
instead of the task.
|
|
75
|
+
|
|
76
|
+
## §parser-contract — parse what models actually emit
|
|
77
|
+
|
|
78
|
+
The parser must extract the answer from *real* model output, not just the
|
|
79
|
+
canonical format. Be liberal in what you accept: strip surrounding whitespace and
|
|
80
|
+
trailing punctuation, match case-insensitively, take the last occurrence when the
|
|
81
|
+
answer is restated, tolerate reasoning before the answer. And state the required
|
|
82
|
+
format explicitly in the system prompt — if the parser extracts nothing from real
|
|
83
|
+
rollouts, the format contract isn't being communicated.
|
|
84
|
+
|
|
85
|
+
## §difficulty-curriculum — avoid all-pass / all-fail batches
|
|
86
|
+
|
|
87
|
+
Generate rollouts with a reference policy and histogram the rewards. An all-zero
|
|
88
|
+
batch (too hard / broken) or all-one batch (trivial) gives zero gradient. Aim for
|
|
89
|
+
a spread of outcomes at the model's current ability — mix difficulties, or filter
|
|
90
|
+
the dataset to the band where the policy sometimes-but-not-always succeeds.
|
|
91
|
+
|
|
92
|
+
## §contamination — don't train on the eval set
|
|
93
|
+
|
|
94
|
+
N-gram/embedding-check your training tasks against the benchmarks you'll report
|
|
95
|
+
on (AIME, MATH, GSM8K, HumanEval, LiveCodeBench, …). Overlap means measured
|
|
96
|
+
"improvement" is partly memorization. For an explicitly *eval-only* environment,
|
|
97
|
+
overlap with its own benchmark is expected — just never use it for training.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
### How the audit checks map to these sections
|
|
102
|
+
|
|
103
|
+
| Check | Sections it judges against |
|
|
104
|
+
| --- | --- |
|
|
105
|
+
| integrity | §parser-contract, §weights |
|
|
106
|
+
| reward_design | §determinism, §discrimination, §baseline-floor, §partial-credit, §bounds, §weights, §anti-hacking, §parser-contract |
|
|
107
|
+
| rollout_quality | §parser-contract, §difficulty-curriculum |
|
|
108
|
+
| contamination | §contamination |
|
|
109
|
+
|
|
110
|
+
(problem_alignment and latency judge things outside this guide: the user's stated
|
|
111
|
+
goal and throughput.)
|
|
112
|
+
|
|
113
|
+
A clean scorecard means none of these failure modes were detected on the slice we
|
|
114
|
+
could measure — not a proof of correctness, but the cheap faults are ruled out
|
|
115
|
+
before you spend GPU hours.
|