rewardprobe-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rewardprobe-0.1.0/.gitignore +22 -0
- rewardprobe-0.1.0/CLAUDE.md +121 -0
- rewardprobe-0.1.0/PKG-INFO +249 -0
- rewardprobe-0.1.0/README.md +213 -0
- rewardprobe-0.1.0/action.yml +46 -0
- rewardprobe-0.1.0/examples/sample_tasks.jsonl +10 -0
- rewardprobe-0.1.0/examples/test_simple_reward.py +37 -0
- rewardprobe-0.1.0/examples/test_trl_reward.py +51 -0
- rewardprobe-0.1.0/pyproject.toml +64 -0
- rewardprobe-0.1.0/spec.md +585 -0
- rewardprobe-0.1.0/src/rewardprobe/__init__.py +14 -0
- rewardprobe-0.1.0/src/rewardprobe/adapters/__init__.py +77 -0
- rewardprobe-0.1.0/src/rewardprobe/adapters/auto.py +497 -0
- rewardprobe-0.1.0/src/rewardprobe/adapters/base.py +20 -0
- rewardprobe-0.1.0/src/rewardprobe/adapters/raw.py +189 -0
- rewardprobe-0.1.0/src/rewardprobe/adapters/trl.py +134 -0
- rewardprobe-0.1.0/src/rewardprobe/adapters/verifiers.py +266 -0
- rewardprobe-0.1.0/src/rewardprobe/cli.py +304 -0
- rewardprobe-0.1.0/src/rewardprobe/families/__init__.py +44 -0
- rewardprobe-0.1.0/src/rewardprobe/families/base.py +213 -0
- rewardprobe-0.1.0/src/rewardprobe/families/basic_sanity.py +372 -0
- rewardprobe-0.1.0/src/rewardprobe/families/composition.py +339 -0
- rewardprobe-0.1.0/src/rewardprobe/families/parser_bugs.py +497 -0
- rewardprobe-0.1.0/src/rewardprobe/families/schema.py +223 -0
- rewardprobe-0.1.0/src/rewardprobe/families/stateful.py +259 -0
- rewardprobe-0.1.0/src/rewardprobe/families/trivial_bypass.py +253 -0
- rewardprobe-0.1.0/src/rewardprobe/probe.py +365 -0
- rewardprobe-0.1.0/src/rewardprobe/report.py +178 -0
- rewardprobe-0.1.0/src/rewardprobe/simulator.py +466 -0
- rewardprobe-0.1.0/src/rewardprobe/tier2/__init__.py +21 -0
- rewardprobe-0.1.0/src/rewardprobe/tier2/adversarial.py +127 -0
- rewardprobe-0.1.0/src/rewardprobe/tier2/client.py +117 -0
- rewardprobe-0.1.0/src/rewardprobe/tier2/code_analyzer.py +142 -0
- rewardprobe-0.1.0/src/rewardprobe/tier2/prompts.py +174 -0
- rewardprobe-0.1.0/src/rewardprobe/tier2/validator.py +82 -0
- rewardprobe-0.1.0/src/rewardprobe/types.py +128 -0
- rewardprobe-0.1.0/tests/__init__.py +0 -0
- rewardprobe-0.1.0/tests/conftest.py +160 -0
- rewardprobe-0.1.0/tests/test_adapters/__init__.py +0 -0
- rewardprobe-0.1.0/tests/test_adapters/test_raw.py +77 -0
- rewardprobe-0.1.0/tests/test_families/__init__.py +0 -0
- rewardprobe-0.1.0/tests/test_families/test_basic_sanity.py +47 -0
- rewardprobe-0.1.0/tests/test_families/test_trivial_bypass.py +29 -0
- rewardprobe-0.1.0/tests/test_probe.py +85 -0

@@ -0,0 +1,22 @@ rewardprobe-0.1.0/.gitignore
__pycache__/
*.py[cod]
*$py.class
*.egg-info/
dist/
build/
.eggs/
*.egg
.venv/
venv/
.env
.pytest_cache/
.ruff_cache/
htmlcov/
.coverage
*.log

# Vitals provenance data
.vitals/
hunt/
.claude/
excalidraw.log

@@ -0,0 +1,121 @@ rewardprobe-0.1.0/CLAUDE.md
# rewardprobe — Architecture Guide

## What This Is

rewardprobe answers: **"What will my model learn to do against this reward function?"**

Three modes:
1. **Quick Check** — 30 deterministic probes, instant, free
2. **Deep Analysis** — Claude reads your code + generates adversarial inputs
3. **Simulate** — generates diverse completions, maps the reward landscape

## How It Works (Plain English)

### Quick Check
We generate adversarial inputs — empty strings, format tricks, parser exploits, wrong-but-formatted answers — and feed them to your reward function. If it scores them high, that's a bug.

### Deep Analysis
We send your reward function's Python source code to Claude. Claude reads it and says: "This function uses rfind() to find the last \boxed{} tag — a model can output multiple tags and only the last one is checked." Then Claude generates realistic wrong completions and we actually run them against your function.

### Simulate
We use Claude to generate 10 completions per task, each representing a different strategy a model might adopt during training:

- **perfect** — correct reasoning + correct answer
- **correct_lazy** — just the answer, no work shown
- **shortcut** — skips computation, guesses based on patterns
- **format_only** — perfect format, wrong content
- **near_miss** — plausible but slightly wrong
- **hedge** — multiple answers hoping one matches
- etc.

Then we score every completion against your reward function and build the **strategy scoreboard** — a bar chart showing which behaviors get the most reward. If "shortcut" scores as high as "perfect," the model will learn to take shortcuts.
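
The aggregation behind that scoreboard is just per-strategy averaging. A minimal sketch, assuming scored completions arrive as `(strategy, reward)` pairs (the real logic lives in `simulator.py` and may differ in detail):

```python
from collections import defaultdict
from statistics import mean

def render_scoreboard(scored: list[tuple[str, float]]) -> None:
    """Average reward per strategy, sorted best-first, drawn as a 20-char bar."""
    by_strategy: dict[str, list[float]] = defaultdict(list)
    for strategy, reward in scored:
        by_strategy[strategy].append(reward)
    for strategy, rewards in sorted(by_strategy.items(), key=lambda kv: -mean(kv[1])):
        avg = mean(rewards)
        filled = round(avg * 20)
        print(f"{strategy:<16} {'█' * filled}{'░' * (20 - filled)} {avg:.2f}")
```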

## Architecture

```
src/rewardprobe/
├── probe.py              # Probe class — main entry point
├── types.py              # Core data models (Pydantic)
├── report.py             # Terminal output (Rich)
├── simulator.py          # Reward landscape simulation
├── adapters/
│   ├── auto.py           # Universal signature detection
│   ├── verifiers.py      # verifiers framework adapter
│   ├── trl.py            # TRL GRPOTrainer adapter
│   └── raw.py            # Raw Python functions
├── families/
│   ├── base.py           # Attack base classes + format detection
│   ├── basic_sanity.py   # Empty, whitespace, range, determinism (7)
│   ├── trivial_bypass.py # Echo, constant, format-only (4)
│   ├── parser_bugs.py    # Reasoning leaks, regex, malformed (6)
│   ├── composition.py    # Weight dominance, zero-weight leaks (3)
│   ├── schema.py         # Missing columns, type mismatches (5)
│   └── stateful.py       # State leak, stale state, tool skip (5)
└── tier2/
    ├── client.py         # LLM client (Anthropic, OpenAI, Ollama)
    ├── code_analyzer.py  # Claude reads source code
    ├── adversarial.py    # Claude generates adversarial completions
    ├── validator.py      # Filters false positives from Tier 1
    └── prompts.py        # All LLM prompt templates
```

## Key Concepts

### NormalizedEnvironment
Every adapter converts framework-specific reward functions into one universal interface:
```python
NormalizedEnvironment(
    reward_fns=[NormalizedRewardFn(name="...", fn=wrapped_fn, weight=1.0)],
    tasks=[{"prompt": "...", "answer": "..."}, ...],
)
```
The wrapped `fn` always has signature `(completion_text: str, task: dict) -> float`.
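
For instance, a TRL-style batch reward can be folded into that per-completion interface roughly like this (a simplified sketch, not the actual adapter code in `adapters/trl.py`):

```python
def wrap_batch_reward(batch_fn):
    """Adapt fn(completions, **kwargs) -> list[float] to (completion_text, task) -> float."""
    def wrapped_fn(completion_text: str, task: dict) -> float:
        # Pass dataset columns (e.g. "answer" / "solution") through as
        # single-element keyword lists, mirroring a batch of size one.
        kwargs = {k: [v] for k, v in task.items() if k != "prompt"}
        scores = batch_fn(completions=[completion_text], **kwargs)
        return float(scores[0])
    return wrapped_fn
```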

### Auto-Detection
`auto.py` classifies any callable into one of 5 patterns (a simplified sketch of the classification follows this list):
- `SINGLE_PAIR` — `fn(completion, answer) -> float`
- `SINGLE_FLEXIBLE` — `fn(completion) -> float`
- `BATCH_SIMPLE` — `fn(completions, **kwargs) -> list[float]`
- `BATCH_WITH_PROMPTS` — `fn(prompts, completions, **kwargs) -> list[float]`
- `VERIFIERS_ASYNC` — `async fn(completion, answer, ...) -> float`
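
A rough illustration of that classification (a hypothetical helper, not the actual `auto.py` implementation):

```python
import inspect

def classify_reward_fn(fn) -> str:
    """Guess the calling pattern from the function's signature."""
    params = list(inspect.signature(fn).parameters)
    if inspect.iscoroutinefunction(fn):
        return "VERIFIERS_ASYNC"
    if "prompts" in params and "completions" in params:
        return "BATCH_WITH_PROMPTS"
    if "completions" in params:   # plural name implies batch-style (TRL)
        return "BATCH_SIMPLE"
    if len(params) >= 2:          # e.g. (completion, answer) or (solution_str, ground_truth)
        return "SINGLE_PAIR"
    return "SINGLE_FLEXIBLE"      # single positional argument
```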

### Format Detection
Reward functions expect answers in different formats. Before running attacks, we probe the function with common formats (`\boxed{}`, `####`, `<answer>`, raw) to discover what it accepts. This prevents false positives on format-enforcing functions.
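
Conceptually, format detection wraps a known-correct answer in each candidate format and keeps whichever ones the function actually rewards. A minimal sketch under that assumption (illustrative names only):

```python
FORMAT_RENDERERS = {
    "boxed": lambda ans: f"\\boxed{{{ans}}}",
    "hash_marker": lambda ans: f"#### {ans}",
    "answer_tag": lambda ans: f"<answer>{ans}</answer>",
    "raw": lambda ans: ans,
}

def detect_accepted_formats(wrapped_fn, task: dict) -> list[str]:
    """Return the formats in which the ground-truth answer scores highly."""
    answer = str(task["answer"])
    return [
        name for name, render in FORMAT_RENDERERS.items()
        if wrapped_fn(render(answer), task) > 0.5
    ]
```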

### Answer Key Normalization
Datasets use different column names for ground truth (`answer`, `solution`, `label`, `target`, `expected_output`). We detect and normalize to `answer` before running attacks.
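
A minimal sketch of that normalization step (illustrative, not the packaged implementation):

```python
ANSWER_KEY_ALIASES = ("answer", "solution", "label", "target", "expected_output")

def normalize_task(task: dict) -> dict:
    """Copy the first recognized ground-truth column into the 'answer' key."""
    task = dict(task)  # don't mutate the caller's dict
    if "answer" not in task:
        for key in ANSWER_KEY_ALIASES:
            if key in task:
                task["answer"] = task[key]
                break
    return task
```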

## Adding a New Attack

```python
class MyAttack(BaseAttack):
    name = "my_attack"
    family = "basic_sanity"
    severity = Severity.WARNING

    def run(self, env: NormalizedEnvironment) -> list[Finding]:
        findings = []
        for fn in env.reward_fns:
            for idx, task in enumerate(env.tasks):
                score, err = _safe_call(fn, "adversarial input", task)
                if err is None and score is not None and score > 0.5:
                    findings.append(Finding(
                        attack=self.name, family=self.family,
                        severity=self.severity,
                        input="adversarial input",
                        actual_reward=score,
                        affected_tasks=[idx],
                        recommendation="Plain English explanation of the problem",
                    ))
        return findings
```

Add to the family's `__init__` attack list. Add a test in `tests/test_families/`.
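
A test for the attack above might look something like this (a sketch; the fixtures in `tests/conftest.py` may already provide a suitable environment):

```python
from rewardprobe.adapters.auto import auto_adapt
# MyAttack defined as in the snippet above

def always_generous_reward(completion, answer):
    # Deliberately buggy: any non-empty completion earns full reward.
    return 1.0 if completion.strip() else 0.0

def test_my_attack_flags_generous_reward():
    env = auto_adapt(always_generous_reward, [{"prompt": "What is 2+2?", "answer": "4"}])
    findings = MyAttack().run(env)
    assert findings, "expected at least one finding against the buggy reward"
```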

## Design Principles

1. **No false positives.** A finding the developer ignores erodes all trust. Every finding must be real and actionable.
2. **Plain English.** No jargon in output. "Your function scores wrong answers at 0.8" not "parser_bugs/regex_greediness FAILED."
3. **Domain agnostic.** Works for math, code, text, classification — any RL reward function. No hardcoded formats.
4. **Auto-detect everything.** The developer should never configure their framework, format, or column names.
5. **Seconds, not minutes.** Tier 1 is instant. Tier 2 is under a minute. Simulate is 2-3 minutes.

@@ -0,0 +1,249 @@ rewardprobe-0.1.0/PKG-INFO
Metadata-Version: 2.4
Name: rewardprobe
Version: 0.1.0
Summary: Pre-training stress-testing for reward functions. Find bugs in minutes on CPU instead of days into a $10K training run.
Project-URL: Homepage, https://github.com/rewardprobe/rewardprobe
Project-URL: Documentation, https://github.com/rewardprobe/rewardprobe
Project-URL: Repository, https://github.com/rewardprobe/rewardprobe
Author: rewardprobe contributors
License-Expression: Apache-2.0
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Requires-Dist: click>=8.0
Requires-Dist: pydantic>=2.0
Requires-Dist: rich>=13.0
Provides-Extra: all
Requires-Dist: trl>=0.15.0; extra == 'all'
Requires-Dist: verifiers>=0.1.0; extra == 'all'
Provides-Extra: dev
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
Requires-Dist: pytest>=8.0; extra == 'dev'
Requires-Dist: ruff>=0.8.0; extra == 'dev'
Provides-Extra: trl
Requires-Dist: trl>=0.15.0; extra == 'trl'
Provides-Extra: verifiers
Requires-Dist: verifiers>=0.1.0; extra == 'verifiers'
Description-Content-Type: text/markdown

# rewardprobe

**Know what your model will learn — before you train.**

[](https://pypi.org/project/rewardprobe/)
[](LICENSE)
[](https://python.org)

---

You write a reward function. You're about to spend $10K on a GRPO training run. rewardprobe tells you what the model will actually learn to do:

```
rewardprobe simulate — production_math_rlvr
50 completions across 5 tasks

2 critical found

1. critical
   'Shortcut' strategy scores 0.71
   A model using the shortcut strategy earns 103% of what a correct
   answer earns. It will learn to skip computation and take shortcuts
   because that's easier AND scores higher.

2. critical
   'Lazy correct' strategy scores only 0.07
   A correct answer without formatting scores near zero. Your reward
   function punishes correct-but-unformatted answers more than it
   punishes wrong-but-formatted ones.

Strategy scoreboard:
  perfect          ████████████████████ 1.00
  correct_verbose  ████████████████████ 1.00
  shortcut         ██████████████░░░░░░ 0.71  ← problem
  near_miss        █████░░░░░░░░░░░░░░░ 0.29
  format_only      █████░░░░░░░░░░░░░░░ 0.29
  garbage          ███░░░░░░░░░░░░░░░░░ 0.18
  correct_lazy     █░░░░░░░░░░░░░░░░░░░ 0.07  ← problem
```

The **strategy scoreboard** shows exactly how your reward function scores different model behaviors. If a lazy or wrong strategy scores close to a correct one, the model will learn the lazy path. You see this in 30 seconds instead of discovering it 3 days into training.

---

## The Problem

You write a reward function for RL training. It looks correct. You start training. Days later, the model is gaming the reward — outputting shortcuts, copying format without thinking, or guessing. OpenAI [documented](https://openai.com/index/chain-of-thought-monitoring/) this happening with `exit(0)` and `raise SkipTest`. METR [found](https://metr.org/blog/2025-06-05-recent-reward-hacking/) frontier models monkey-patching their own graders.

The fix is to test reward functions **before** training, the same way you test code before deploying.

---

## Install

```bash
pip install rewardprobe
```

---

## Three Modes

### 1. Quick Check (free, instant, no API key)

30 deterministic probes. Catches parser bugs, edge cases, format tricks. Runs in under a second on CPU.

```bash
rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl
```

```
rewardprobe — my_reward

1 critical, 2 warning found

1. critical
   Correct answer in reasoning section scores 1.0 even when the
   answer field contains a wrong answer.

2. warning
   Different scores depending on answer tag order.

28/30 checks passed.
```

### 2. Deep Analysis (needs API key)

Claude reads your source code, understands what each function does, and generates realistic adversarial completions. Finds bugs that static probes can't.

```bash
export ANTHROPIC_API_KEY=sk-...
rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --deep
```

This adds:
- **Code analysis** — Claude identifies logic bugs by reading your Python code
- **Adversarial completions** — wrong-but-plausible model outputs tested against your function
- **False positive filtering** — classifies each function (correctness/format/auxiliary) so findings are precise

### 3. Simulate (needs API key)

The flagship feature. Generates diverse completions spanning the full range of what a model might produce during training — from perfect solutions to garbage — and maps the reward landscape.

```bash
rewardprobe simulate my_reward.py::my_fn --dataset tasks.jsonl
```

The strategy scoreboard shows you at a glance:
- **Green strategies** (perfect, correct_lazy, correct_verbose) — what you WANT the model to learn
- **Red strategies** (shortcut, format_only, hedge, garbage) — what you DON'T want

If a red strategy scores close to or higher than a green one, your reward function has a problem.

---

## What We Found

We ran rewardprobe against reward functions from 4 major RL codebases plus 3 non-math domains. Results:

| Codebase | Domain | Key Finding |
|----------|--------|-------------|
| **verifiers/gsm8k** (Prime Intellect) | Math | Model can skip reasoning — `correct_lazy` scores 1.0 |
| **Open-R1** (HuggingFace) | Math | `first_match` mode lets models hedge with multiple answers |
| **verl** (ByteDance) | Math | `format_score` parameter can reward wrong answers |
| **willccbb GRPO gist** | Math | Returns 2.0 (outside [0,1]); rejects "42.0" for "42" |
| Custom code reward | Code | Off-by-one bugs score 0.83 — substring matching misses logic errors |
| Sentiment classifier | Text | Reasoned answers score 0.0, bare labels score 1.0 |

---

## Works With Any Framework

Auto-detects your reward function's signature. No configuration.

```python
# Any of these just work:
def my_reward(completion, answer): ...                        # Raw Python
def accuracy_reward(completions, solution, **kwargs): ...     # TRL / GRPO
def correctness(prompts, completions, answer, **kwargs): ...  # TRL with prompts
async def correct_answer(completion, answer): ...             # verifiers
def compute_score(solution_str, ground_truth): ...            # ByteDance verl
```

```bash
rewardprobe test file.py::fn --dataset tasks.jsonl   # Just works
rewardprobe test environments/gsm8k.py               # verifiers environments too
```

---

## GitHub Action

```yaml
- run: pip install rewardprobe
- run: rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --ci
```

Exit code 1 on critical findings. Add `--deep` with `ANTHROPIC_API_KEY` secret for AI analysis in CI.

---

## Python API

```python
from rewardprobe import Probe

# Quick check
report = Probe().test_fn(my_reward, tasks)
print(report.passed)  # True / False

# Deep analysis
report = Probe(deep=True).test_fn(my_reward, tasks)

# Simulate
from rewardprobe.simulator import simulate, print_simulation
from rewardprobe.tier2.client import get_client
from rewardprobe.adapters.auto import auto_adapt

env = auto_adapt(my_reward, tasks)
result = simulate(env, get_client("sonnet"), n_tasks=5)
print_simulation(result)
```

---

## How It Works

**Quick Check** generates adversarial inputs (empty strings, format tricks, parser exploits, wrong-but-formatted answers) and tests your reward function against them. 30 probes across 6 families, all deterministic, all on CPU.

**Deep Analysis** uses Claude to read your reward function's Python source code. It understands what the function checks, identifies logic bugs, and generates realistic wrong completions that a model might produce during training. Each completion is actually run against your function — only real exploits are reported.

**Simulate** uses Claude to generate 10 diverse completions per task, each representing a different strategy a model might learn (perfect, lazy, shortcut, hedging, garbage, etc). Scores them all against your reward function. The strategy scoreboard shows which behaviors your reward function actually incentivizes.

---

## What rewardprobe Is NOT

- **Not a training monitor.** We run *before* training starts.
- **Not a formal prover.** We find bugs empirically with concrete inputs.
- **Not a guarantee.** A clean report means "we tested these patterns and found nothing." The nastiest reward hacks are novel and environment-specific.

---

## Contributing

See [CLAUDE.md](CLAUDE.md) for architecture, how to add attacks, and how the simulator works.

```bash
git clone https://github.com/rewardprobe/rewardprobe && cd rewardprobe
uv sync --extra dev && pytest tests/
```

Apache 2.0

@@ -0,0 +1,213 @@ rewardprobe-0.1.0/README.md
# rewardprobe

**Know what your model will learn — before you train.**

[](https://pypi.org/project/rewardprobe/)
[](LICENSE)
[](https://python.org)

---

You write a reward function. You're about to spend $10K on a GRPO training run. rewardprobe tells you what the model will actually learn to do:

```
rewardprobe simulate — production_math_rlvr
50 completions across 5 tasks

2 critical found

1. critical
   'Shortcut' strategy scores 0.71
   A model using the shortcut strategy earns 103% of what a correct
   answer earns. It will learn to skip computation and take shortcuts
   because that's easier AND scores higher.

2. critical
   'Lazy correct' strategy scores only 0.07
   A correct answer without formatting scores near zero. Your reward
   function punishes correct-but-unformatted answers more than it
   punishes wrong-but-formatted ones.

Strategy scoreboard:
  perfect          ████████████████████ 1.00
  correct_verbose  ████████████████████ 1.00
  shortcut         ██████████████░░░░░░ 0.71  ← problem
  near_miss        █████░░░░░░░░░░░░░░░ 0.29
  format_only      █████░░░░░░░░░░░░░░░ 0.29
  garbage          ███░░░░░░░░░░░░░░░░░ 0.18
  correct_lazy     █░░░░░░░░░░░░░░░░░░░ 0.07  ← problem
```

The **strategy scoreboard** shows exactly how your reward function scores different model behaviors. If a lazy or wrong strategy scores close to a correct one, the model will learn the lazy path. You see this in 30 seconds instead of discovering it 3 days into training.

---

## The Problem

You write a reward function for RL training. It looks correct. You start training. Days later, the model is gaming the reward — outputting shortcuts, copying format without thinking, or guessing. OpenAI [documented](https://openai.com/index/chain-of-thought-monitoring/) this happening with `exit(0)` and `raise SkipTest`. METR [found](https://metr.org/blog/2025-06-05-recent-reward-hacking/) frontier models monkey-patching their own graders.

The fix is to test reward functions **before** training, the same way you test code before deploying.

---

## Install

```bash
pip install rewardprobe
```

---

## Three Modes

### 1. Quick Check (free, instant, no API key)

30 deterministic probes. Catches parser bugs, edge cases, format tricks. Runs in under a second on CPU.

```bash
rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl
```

```
rewardprobe — my_reward

1 critical, 2 warning found

1. critical
   Correct answer in reasoning section scores 1.0 even when the
   answer field contains a wrong answer.

2. warning
   Different scores depending on answer tag order.

28/30 checks passed.
```

### 2. Deep Analysis (needs API key)

Claude reads your source code, understands what each function does, and generates realistic adversarial completions. Finds bugs that static probes can't.

```bash
export ANTHROPIC_API_KEY=sk-...
rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --deep
```

This adds:
- **Code analysis** — Claude identifies logic bugs by reading your Python code
- **Adversarial completions** — wrong-but-plausible model outputs tested against your function
- **False positive filtering** — classifies each function (correctness/format/auxiliary) so findings are precise

### 3. Simulate (needs API key)

The flagship feature. Generates diverse completions spanning the full range of what a model might produce during training — from perfect solutions to garbage — and maps the reward landscape.

```bash
rewardprobe simulate my_reward.py::my_fn --dataset tasks.jsonl
```

The strategy scoreboard shows you at a glance:
- **Green strategies** (perfect, correct_lazy, correct_verbose) — what you WANT the model to learn
- **Red strategies** (shortcut, format_only, hedge, garbage) — what you DON'T want

If a red strategy scores close to or higher than a green one, your reward function has a problem.

---

## What We Found

We ran rewardprobe against reward functions from 4 major RL codebases plus 3 non-math domains. Results:

| Codebase | Domain | Key Finding |
|----------|--------|-------------|
| **verifiers/gsm8k** (Prime Intellect) | Math | Model can skip reasoning — `correct_lazy` scores 1.0 |
| **Open-R1** (HuggingFace) | Math | `first_match` mode lets models hedge with multiple answers |
| **verl** (ByteDance) | Math | `format_score` parameter can reward wrong answers |
| **willccbb GRPO gist** | Math | Returns 2.0 (outside [0,1]); rejects "42.0" for "42" |
| Custom code reward | Code | Off-by-one bugs score 0.83 — substring matching misses logic errors |
| Sentiment classifier | Text | Reasoned answers score 0.0, bare labels score 1.0 |

---

## Works With Any Framework

Auto-detects your reward function's signature. No configuration.

```python
# Any of these just work:
def my_reward(completion, answer): ...                        # Raw Python
def accuracy_reward(completions, solution, **kwargs): ...     # TRL / GRPO
def correctness(prompts, completions, answer, **kwargs): ...  # TRL with prompts
async def correct_answer(completion, answer): ...             # verifiers
def compute_score(solution_str, ground_truth): ...            # ByteDance verl
```

```bash
rewardprobe test file.py::fn --dataset tasks.jsonl   # Just works
rewardprobe test environments/gsm8k.py               # verifiers environments too
```

---

## GitHub Action

```yaml
- run: pip install rewardprobe
- run: rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --ci
```

Exit code 1 on critical findings. Add `--deep` with `ANTHROPIC_API_KEY` secret for AI analysis in CI.

---

## Python API

```python
from rewardprobe import Probe

# Quick check
report = Probe().test_fn(my_reward, tasks)
print(report.passed)  # True / False

# Deep analysis
report = Probe(deep=True).test_fn(my_reward, tasks)

# Simulate
from rewardprobe.simulator import simulate, print_simulation
from rewardprobe.tier2.client import get_client
from rewardprobe.adapters.auto import auto_adapt

env = auto_adapt(my_reward, tasks)
result = simulate(env, get_client("sonnet"), n_tasks=5)
print_simulation(result)
```

---

## How It Works

**Quick Check** generates adversarial inputs (empty strings, format tricks, parser exploits, wrong-but-formatted answers) and tests your reward function against them. 30 probes across 6 families, all deterministic, all on CPU.

**Deep Analysis** uses Claude to read your reward function's Python source code. It understands what the function checks, identifies logic bugs, and generates realistic wrong completions that a model might produce during training. Each completion is actually run against your function — only real exploits are reported.

**Simulate** uses Claude to generate 10 diverse completions per task, each representing a different strategy a model might learn (perfect, lazy, shortcut, hedging, garbage, etc). Scores them all against your reward function. The strategy scoreboard shows which behaviors your reward function actually incentivizes.

---

## What rewardprobe Is NOT

- **Not a training monitor.** We run *before* training starts.
- **Not a formal prover.** We find bugs empirically with concrete inputs.
- **Not a guarantee.** A clean report means "we tested these patterns and found nothing." The nastiest reward hacks are novel and environment-specific.

---

## Contributing

See [CLAUDE.md](CLAUDE.md) for architecture, how to add attacks, and how the simulator works.

```bash
git clone https://github.com/rewardprobe/rewardprobe && cd rewardprobe
uv sync --extra dev && pytest tests/
```

Apache 2.0

@@ -0,0 +1,46 @@ rewardprobe-0.1.0/action.yml
name: "rewardprobe"
description: "Stress-test RL reward functions before training. Catch reward hacking in CI."
branding:
  icon: "shield"
  color: "red"

inputs:
  target:
    description: "Path to reward function (file.py::fn) or verifiers environment (env.py)"
    required: true
  dataset:
    description: "Path to JSONL tasks file (required for non-verifiers targets)"
    required: false
  python-version:
    description: "Python version to use"
    required: false
    default: "3.12"
  strict:
    description: "Fail on warnings too (not just critical findings)"
    required: false
    default: "false"

runs:
  using: "composite"
  steps:
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ inputs.python-version }}

    - name: Install rewardprobe
      shell: bash
      run: pip install rewardprobe

    - name: Run rewardprobe
      shell: bash
      run: |
        ARGS="rewardprobe test ${{ inputs.target }}"
        if [ -n "${{ inputs.dataset }}" ]; then
          ARGS="$ARGS --dataset ${{ inputs.dataset }}"
        fi
        ARGS="$ARGS --ci"
        if [ "${{ inputs.strict }}" = "true" ]; then
          ARGS="$ARGS --strict"
        fi
        $ARGS

@@ -0,0 +1,10 @@ rewardprobe-0.1.0/examples/sample_tasks.jsonl
{"prompt": "What is 2+2?", "answer": "4"}
{"prompt": "What is 3+3?", "answer": "6"}
{"prompt": "What is 5*5?", "answer": "25"}
{"prompt": "What is 7+3?", "answer": "10"}
{"prompt": "What is 6*7?", "answer": "42"}
{"prompt": "What is 9-4?", "answer": "5"}
{"prompt": "What is 100/4?", "answer": "25"}
{"prompt": "What is 8*2?", "answer": "16"}
{"prompt": "What is 15-7?", "answer": "8"}
{"prompt": "What is 12/3?", "answer": "4"}