rewardprobe-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. rewardprobe-0.1.0/.gitignore +22 -0
  2. rewardprobe-0.1.0/CLAUDE.md +121 -0
  3. rewardprobe-0.1.0/PKG-INFO +249 -0
  4. rewardprobe-0.1.0/README.md +213 -0
  5. rewardprobe-0.1.0/action.yml +46 -0
  6. rewardprobe-0.1.0/examples/sample_tasks.jsonl +10 -0
  7. rewardprobe-0.1.0/examples/test_simple_reward.py +37 -0
  8. rewardprobe-0.1.0/examples/test_trl_reward.py +51 -0
  9. rewardprobe-0.1.0/pyproject.toml +64 -0
  10. rewardprobe-0.1.0/spec.md +585 -0
  11. rewardprobe-0.1.0/src/rewardprobe/__init__.py +14 -0
  12. rewardprobe-0.1.0/src/rewardprobe/adapters/__init__.py +77 -0
  13. rewardprobe-0.1.0/src/rewardprobe/adapters/auto.py +497 -0
  14. rewardprobe-0.1.0/src/rewardprobe/adapters/base.py +20 -0
  15. rewardprobe-0.1.0/src/rewardprobe/adapters/raw.py +189 -0
  16. rewardprobe-0.1.0/src/rewardprobe/adapters/trl.py +134 -0
  17. rewardprobe-0.1.0/src/rewardprobe/adapters/verifiers.py +266 -0
  18. rewardprobe-0.1.0/src/rewardprobe/cli.py +304 -0
  19. rewardprobe-0.1.0/src/rewardprobe/families/__init__.py +44 -0
  20. rewardprobe-0.1.0/src/rewardprobe/families/base.py +213 -0
  21. rewardprobe-0.1.0/src/rewardprobe/families/basic_sanity.py +372 -0
  22. rewardprobe-0.1.0/src/rewardprobe/families/composition.py +339 -0
  23. rewardprobe-0.1.0/src/rewardprobe/families/parser_bugs.py +497 -0
  24. rewardprobe-0.1.0/src/rewardprobe/families/schema.py +223 -0
  25. rewardprobe-0.1.0/src/rewardprobe/families/stateful.py +259 -0
  26. rewardprobe-0.1.0/src/rewardprobe/families/trivial_bypass.py +253 -0
  27. rewardprobe-0.1.0/src/rewardprobe/probe.py +365 -0
  28. rewardprobe-0.1.0/src/rewardprobe/report.py +178 -0
  29. rewardprobe-0.1.0/src/rewardprobe/simulator.py +466 -0
  30. rewardprobe-0.1.0/src/rewardprobe/tier2/__init__.py +21 -0
  31. rewardprobe-0.1.0/src/rewardprobe/tier2/adversarial.py +127 -0
  32. rewardprobe-0.1.0/src/rewardprobe/tier2/client.py +117 -0
  33. rewardprobe-0.1.0/src/rewardprobe/tier2/code_analyzer.py +142 -0
  34. rewardprobe-0.1.0/src/rewardprobe/tier2/prompts.py +174 -0
  35. rewardprobe-0.1.0/src/rewardprobe/tier2/validator.py +82 -0
  36. rewardprobe-0.1.0/src/rewardprobe/types.py +128 -0
  37. rewardprobe-0.1.0/tests/__init__.py +0 -0
  38. rewardprobe-0.1.0/tests/conftest.py +160 -0
  39. rewardprobe-0.1.0/tests/test_adapters/__init__.py +0 -0
  40. rewardprobe-0.1.0/tests/test_adapters/test_raw.py +77 -0
  41. rewardprobe-0.1.0/tests/test_families/__init__.py +0 -0
  42. rewardprobe-0.1.0/tests/test_families/test_basic_sanity.py +47 -0
  43. rewardprobe-0.1.0/tests/test_families/test_trivial_bypass.py +29 -0
  44. rewardprobe-0.1.0/tests/test_probe.py +85 -0
@@ -0,0 +1,22 @@
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.egg-info/
+ dist/
+ build/
+ .eggs/
+ *.egg
+ .venv/
+ venv/
+ .env
+ .pytest_cache/
+ .ruff_cache/
+ htmlcov/
+ .coverage
+ *.log
+
+ # Vitals provenance data
+ .vitals/
+ hunt/
+ .claude/
+ excalidraw.log
@@ -0,0 +1,121 @@
+ # rewardprobe — Architecture Guide
+
+ ## What This Is
+
+ rewardprobe answers: **"What will my model learn to do against this reward function?"**
+
+ Three modes:
+ 1. **Quick Check** — 30 deterministic probes, instant, free
+ 2. **Deep Analysis** — Claude reads your code + generates adversarial inputs
+ 3. **Simulate** — generates diverse completions, maps the reward landscape
+
+ ## How It Works (Plain English)
+
+ ### Quick Check
+ We generate adversarial inputs — empty strings, format tricks, parser exploits, wrong-but-formatted answers — and feed them to your reward function. If it scores them high, that's a bug.
+
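For intuition, the Tier 1 loop boils down to the sketch below. This is an illustrative reconstruction, not the package's code: the input list, the `quick_check` name, and the 0.5 threshold are all assumptions.

```python
# Illustrative Tier-1 probe loop (not the actual rewardprobe internals).
ADVERSARIAL_INPUTS = [
    "",                      # empty completion
    "   \n\t ",              # whitespace only
    "<answer></answer>",     # format markers with no content
    "\\boxed{0}",            # well-formatted but wrong answer
]

def quick_check(reward_fn, tasks, threshold=0.5):
    """Flag adversarial inputs that a reward function scores suspiciously high."""
    findings = []
    for task in tasks:
        for text in ADVERSARIAL_INPUTS:
            score = reward_fn(text, task)
            if score > threshold:  # junk should not earn meaningful reward
                findings.append({"input": text, "task": task, "score": score})
    return findings
```
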
+ ### Deep Analysis
+ We send your reward function's Python source code to Claude. Claude reads it and says: "This function uses rfind() to find the last \boxed{} tag — a model can output multiple tags and only the last one is checked." Then Claude generates realistic wrong completions and we actually run them against your function.
+
+ ### Simulate
+ We use Claude to generate 10 completions per task, each representing a different strategy a model might adopt during training:
+
+ - **perfect** — correct reasoning + correct answer
+ - **correct_lazy** — just the answer, no work shown
+ - **shortcut** — skips computation, guesses based on patterns
+ - **format_only** — perfect format, wrong content
+ - **near_miss** — plausible but slightly wrong
+ - **hedge** — multiple answers hoping one matches
+ - etc.
+
+ Then we score every completion against your reward function and build the **strategy scoreboard** — a bar chart showing which behaviors get the most reward. If "shortcut" scores as high as "perfect," the model will learn to take shortcuts.
+
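Conceptually, the scoreboard is just mean reward grouped by strategy. A minimal sketch, assuming `(strategy, reward)` pairs as the data shape; the real `simulator.py` may aggregate differently:

```python
from collections import defaultdict
from statistics import mean

def strategy_scoreboard(scored):
    """scored: iterable of (strategy_name, reward) pairs from the simulation."""
    by_strategy = defaultdict(list)
    for strategy, reward in scored:
        by_strategy[strategy].append(reward)
    # Highest-earning strategies first; whatever tops this list is what training reinforces.
    return sorted(((s, mean(r)) for s, r in by_strategy.items()),
                  key=lambda item: item[1], reverse=True)
```
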
+ ## Architecture
+
+ ```
+ src/rewardprobe/
+ ├── probe.py              # Probe class — main entry point
+ ├── types.py              # Core data models (Pydantic)
+ ├── report.py             # Terminal output (Rich)
+ ├── simulator.py          # Reward landscape simulation
+ ├── adapters/
+ │   ├── auto.py           # Universal signature detection
+ │   ├── verifiers.py      # verifiers framework adapter
+ │   ├── trl.py            # TRL GRPOTrainer adapter
+ │   └── raw.py            # Raw Python functions
+ ├── families/
+ │   ├── base.py           # Attack base classes + format detection
+ │   ├── basic_sanity.py   # Empty, whitespace, range, determinism (7)
+ │   ├── trivial_bypass.py # Echo, constant, format-only (4)
+ │   ├── parser_bugs.py    # Reasoning leaks, regex, malformed (6)
+ │   ├── composition.py    # Weight dominance, zero-weight leaks (3)
+ │   ├── schema.py         # Missing columns, type mismatches (5)
+ │   └── stateful.py       # State leak, stale state, tool skip (5)
+ └── tier2/
+     ├── client.py         # LLM client (Anthropic, OpenAI, Ollama)
+     ├── code_analyzer.py  # Claude reads source code
+     ├── adversarial.py    # Claude generates adversarial completions
+     ├── validator.py      # Filters false positives from Tier 1
+     └── prompts.py        # All LLM prompt templates
+ ```
+
+ ## Key Concepts
+
+ ### NormalizedEnvironment
+ Every adapter converts framework-specific reward functions into one universal interface:
+ ```python
+ NormalizedEnvironment(
+     reward_fns=[NormalizedRewardFn(name="...", fn=wrapped_fn, weight=1.0)],
+     tasks=[{"prompt": "...", "answer": "..."}, ...],
+ )
+ ```
+ The wrapped `fn` always has signature `(completion_text: str, task: dict) -> float`.
+
+ ### Auto-Detection
+ `auto.py` classifies any callable into one of 5 patterns:
+ - `SINGLE_PAIR` — `fn(completion, answer) -> float`
+ - `SINGLE_FLEXIBLE` — `fn(completion) -> float`
+ - `BATCH_SIMPLE` — `fn(completions, **kwargs) -> list[float]`
+ - `BATCH_WITH_PROMPTS` — `fn(prompts, completions, **kwargs) -> list[float]`
+ - `VERIFIERS_ASYNC` — `async fn(completion, answer, ...) -> float`
+
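A rough sketch of what that classification could look like, based only on the pattern names above; the `classify_reward_fn` helper and its dispatch logic are assumptions, and the real `auto.py` inspects more than parameter names:

```python
import asyncio
import inspect

def classify_reward_fn(fn) -> str:
    """Guess the signature pattern from parameter names (simplified vs. the real auto.py)."""
    if asyncio.iscoroutinefunction(fn):
        return "VERIFIERS_ASYNC"
    params = [p.name for p in inspect.signature(fn).parameters.values()
              if p.kind is not inspect.Parameter.VAR_KEYWORD]
    if "prompts" in params and "completions" in params:
        return "BATCH_WITH_PROMPTS"
    if "completions" in params:
        return "BATCH_SIMPLE"
    if len(params) >= 2:
        return "SINGLE_PAIR"
    return "SINGLE_FLEXIBLE"
```
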
+ ### Format Detection
+ Reward functions expect answers in different formats. Before running attacks, we probe the function with common formats (`\boxed{}`, `####`, `<answer>`, raw) to discover what it accepts. This prevents false positives on format-enforcing functions.
+
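In essence, format detection wraps a known-correct answer in each candidate format and keeps whichever ones score. A sketch, where the wrapper table, helper name, and 0.5 threshold are assumptions rather than the shipped implementation:

```python
CANDIDATE_FORMATS = {
    "boxed":  lambda ans: f"\\boxed{{{ans}}}",
    "hash":   lambda ans: f"#### {ans}",
    "tagged": lambda ans: f"<answer>{ans}</answer>",
    "raw":    lambda ans: ans,
}

def detect_accepted_formats(reward_fn, task, threshold=0.5):
    """Return the format names under which a known-correct answer actually earns reward."""
    return [name for name, wrap in CANDIDATE_FORMATS.items()
            if reward_fn(wrap(task["answer"]), task) > threshold]
```
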
+ ### Answer Key Normalization
+ Datasets use different column names for ground truth (`answer`, `solution`, `label`, `target`, `expected_output`). We detect and normalize to `answer` before running attacks.
+
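A minimal sketch of that normalization step; the alias tuple comes from the sentence above, the function itself is assumed:

```python
ANSWER_KEY_ALIASES = ("answer", "solution", "label", "target", "expected_output")

def normalize_answer_key(task: dict) -> dict:
    """Copy the first recognized ground-truth column into 'answer'."""
    if "answer" in task:
        return task
    for alias in ANSWER_KEY_ALIASES:
        if alias in task:
            return {**task, "answer": task[alias]}
    raise KeyError(f"no ground-truth column found; expected one of {ANSWER_KEY_ALIASES}")
```
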
+ ## Adding a New Attack
+
+ ```python
+ class MyAttack(BaseAttack):
+     name = "my_attack"
+     family = "basic_sanity"
+     severity = Severity.WARNING
+
+     def run(self, env: NormalizedEnvironment) -> list[Finding]:
+         findings = []
+         for fn in env.reward_fns:
+             for idx, task in enumerate(env.tasks):
+                 score, err = _safe_call(fn, "adversarial input", task)
+                 if err is None and score is not None and score > 0.5:
+                     findings.append(Finding(
+                         attack=self.name, family=self.family,
+                         severity=self.severity,
+                         input="adversarial input",
+                         actual_reward=score,
+                         affected_tasks=[idx],
+                         recommendation="Plain English explanation of the problem",
+                     ))
+         return findings
+ ```
+
+ Add to the family's `__init__` attack list. Add a test in `tests/test_families/`.
+
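A matching test could look roughly like this. It is a hypothetical layout: the import path for `MyAttack` and the fixture-free setup are invented, and only `auto_adapt` is taken from the documented adapter API.

```python
# tests/test_families/test_my_attack.py (hypothetical)
from rewardprobe.adapters.auto import auto_adapt
from rewardprobe.families.basic_sanity import MyAttack  # wherever the attack is registered

def gullible_reward(completion, answer):
    return 1.0  # rewards everything; MyAttack should flag it

def test_my_attack_flags_gullible_reward():
    env = auto_adapt(gullible_reward, [{"prompt": "What is 2+2?", "answer": "4"}])
    findings = MyAttack().run(env)
    assert findings, "an always-1.0 reward function should produce at least one finding"
```
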
+ ## Design Principles
+
+ 1. **No false positives.** A finding the developer ignores erodes all trust. Every finding must be real and actionable.
+ 2. **Plain English.** No jargon in output. "Your function scores wrong answers at 0.8" not "parser_bugs/regex_greediness FAILED."
+ 3. **Domain agnostic.** Works for math, code, text, classification — any RL reward function. No hardcoded formats.
+ 4. **Auto-detect everything.** The developer should never configure their framework, format, or column names.
+ 5. **Seconds, not minutes.** Tier 1 is instant. Tier 2 is under a minute. Simulate is 2-3 minutes.
@@ -0,0 +1,249 @@
+ Metadata-Version: 2.4
+ Name: rewardprobe
+ Version: 0.1.0
+ Summary: Pre-training stress-testing for reward functions. Find bugs in minutes on CPU instead of days into a $10K training run.
+ Project-URL: Homepage, https://github.com/rewardprobe/rewardprobe
+ Project-URL: Documentation, https://github.com/rewardprobe/rewardprobe
+ Project-URL: Repository, https://github.com/rewardprobe/rewardprobe
+ Author: rewardprobe contributors
+ License-Expression: Apache-2.0
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.10
+ Requires-Dist: click>=8.0
+ Requires-Dist: pydantic>=2.0
+ Requires-Dist: rich>=13.0
+ Provides-Extra: all
+ Requires-Dist: trl>=0.15.0; extra == 'all'
+ Requires-Dist: verifiers>=0.1.0; extra == 'all'
+ Provides-Extra: dev
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
+ Requires-Dist: pytest>=8.0; extra == 'dev'
+ Requires-Dist: ruff>=0.8.0; extra == 'dev'
+ Provides-Extra: trl
+ Requires-Dist: trl>=0.15.0; extra == 'trl'
+ Provides-Extra: verifiers
+ Requires-Dist: verifiers>=0.1.0; extra == 'verifiers'
+ Description-Content-Type: text/markdown
+
+ # rewardprobe
+
+ **Know what your model will learn — before you train.**
+
+ [![PyPI](https://img.shields.io/pypi/v/rewardprobe)](https://pypi.org/project/rewardprobe/)
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://python.org)
+
+ ---
+
+ You write a reward function. You're about to spend $10K on a GRPO training run. rewardprobe tells you what the model will actually learn to do:
+
+ ```
+ rewardprobe simulate — production_math_rlvr
+ 50 completions across 5 tasks
+
+ 2 critical found
+
+ 1. critical
+ 'Shortcut' strategy scores 0.71
+ A model using the shortcut strategy earns 103% of what a correct
+ answer earns. It will learn to skip computation and take shortcuts
+ because that's easier AND scores higher.
+
+ 2. critical
+ 'Lazy correct' strategy scores only 0.07
+ A correct answer without formatting scores near zero. Your reward
+ function punishes correct-but-unformatted answers more than it
+ punishes wrong-but-formatted ones.
+
+ Strategy scoreboard:
+ perfect          ████████████████████ 1.00
+ correct_verbose  ████████████████████ 1.00
+ shortcut         ██████████████░░░░░░ 0.71  ← problem
+ near_miss        █████░░░░░░░░░░░░░░░ 0.29
+ format_only      █████░░░░░░░░░░░░░░░ 0.29
+ garbage          ███░░░░░░░░░░░░░░░░░ 0.18
+ correct_lazy     █░░░░░░░░░░░░░░░░░░░ 0.07  ← problem
+ ```
+
+ The **strategy scoreboard** shows exactly how your reward function scores different model behaviors. If a lazy or wrong strategy scores close to a correct one, the model will learn the lazy path. You see this in 30 seconds instead of discovering it 3 days into training.
+
+ ---
+
+ ## The Problem
+
+ You write a reward function for RL training. It looks correct. You start training. Days later, the model is gaming the reward — outputting shortcuts, copying format without thinking, or guessing. OpenAI [documented](https://openai.com/index/chain-of-thought-monitoring/) this happening with `exit(0)` and `raise SkipTest`. METR [found](https://metr.org/blog/2025-06-05-recent-reward-hacking/) frontier models monkey-patching their own graders.
+
+ The fix is to test reward functions **before** training, the same way you test code before deploying.
+
+ ---
+
+ ## Install
+
+ ```bash
+ pip install rewardprobe
+ ```
+
+ ---
+
+ ## Three Modes
+
+ ### 1. Quick Check (free, instant, no API key)
+
+ 30 deterministic probes. Catches parser bugs, edge cases, format tricks. Runs in under a second on CPU.
+
+ ```bash
+ rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl
+ ```
+
+ ```
+ rewardprobe — my_reward
+
+ 1 critical, 2 warning found
+
+ 1. critical
+ Correct answer in reasoning section scores 1.0 even when the
+ answer field contains a wrong answer.
+
+ 2. warning
+ Different scores depending on answer tag order.
+
+ 28/30 checks passed.
+ ```
+
+ ### 2. Deep Analysis (needs API key)
+
+ Claude reads your source code, understands what each function does, and generates realistic adversarial completions. Finds bugs that static probes can't.
+
+ ```bash
+ export ANTHROPIC_API_KEY=sk-...
+ rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --deep
+ ```
+
+ This adds:
+ - **Code analysis** — Claude identifies logic bugs by reading your Python code
+ - **Adversarial completions** — wrong-but-plausible model outputs tested against your function
+ - **False positive filtering** — classifies each function (correctness/format/auxiliary) so findings are precise
+
+ ### 3. Simulate (needs API key)
+
+ The flagship feature. Generates diverse completions spanning the full range of what a model might produce during training — from perfect solutions to garbage — and maps the reward landscape.
+
+ ```bash
+ rewardprobe simulate my_reward.py::my_fn --dataset tasks.jsonl
+ ```
+
+ The strategy scoreboard shows you at a glance:
+ - **Green strategies** (perfect, correct_lazy, correct_verbose) — what you WANT the model to learn
+ - **Red strategies** (shortcut, format_only, hedge, garbage) — what you DON'T want
+
+ If a red strategy scores close to or higher than a green one, your reward function has a problem.
+
+ ---
+
+ ## What We Found
+
+ We ran rewardprobe against reward functions from 4 major RL codebases plus 3 non-math domains. Results:
+
+ | Codebase | Domain | Key Finding |
+ |----------|--------|-------------|
+ | **verifiers/gsm8k** (Prime Intellect) | Math | Model can skip reasoning — `correct_lazy` scores 1.0 |
+ | **Open-R1** (HuggingFace) | Math | `first_match` mode lets models hedge with multiple answers |
+ | **verl** (ByteDance) | Math | `format_score` parameter can reward wrong answers |
+ | **willccbb GRPO gist** | Math | Returns 2.0 (outside [0,1]); rejects "42.0" for "42" |
+ | Custom code reward | Code | Off-by-one bugs score 0.83 — substring matching misses logic errors |
+ | Sentiment classifier | Text | Reasoned answers score 0.0, bare labels score 1.0 |
+
+ ---
+
+ ## Works With Any Framework
+
+ Auto-detects your reward function's signature. No configuration.
+
+ ```python
+ # Any of these just work:
+ def my_reward(completion, answer): ...                        # Raw Python
+ def accuracy_reward(completions, solution, **kwargs): ...     # TRL / GRPO
+ def correctness(prompts, completions, answer, **kwargs): ...  # TRL with prompts
+ async def correct_answer(completion, answer): ...             # verifiers
+ def compute_score(solution_str, ground_truth): ...            # ByteDance verl
+ ```
+
+ ```bash
+ rewardprobe test file.py::fn --dataset tasks.jsonl   # Just works
+ rewardprobe test environments/gsm8k.py               # verifiers environments too
+ ```
+
+ ---
+
+ ## GitHub Action
+
+ ```yaml
+ - run: pip install rewardprobe
+ - run: rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --ci
+ ```
+
+ Exit code 1 on critical findings. Add `--deep` with `ANTHROPIC_API_KEY` secret for AI analysis in CI.
+
+ ---
+
+ ## Python API
+
+ ```python
+ from rewardprobe import Probe
+
+ # Quick check
+ report = Probe().test_fn(my_reward, tasks)
+ print(report.passed)  # True / False
+
+ # Deep analysis
+ report = Probe(deep=True).test_fn(my_reward, tasks)
+
+ # Simulate
+ from rewardprobe.simulator import simulate, print_simulation
+ from rewardprobe.tier2.client import get_client
+ from rewardprobe.adapters.auto import auto_adapt
+
+ env = auto_adapt(my_reward, tasks)
+ result = simulate(env, get_client("sonnet"), n_tasks=5)
+ print_simulation(result)
+ ```
+
+ ---
+
+ ## How It Works
+
+ **Quick Check** generates adversarial inputs (empty strings, format tricks, parser exploits, wrong-but-formatted answers) and tests your reward function against them. 30 probes across 6 families, all deterministic, all on CPU.
+
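To make that concrete, here is a toy reward function with exactly the kind of bug Quick Check targets. It is illustrative only and not taken from any audited codebase: it pays for the `<answer>` tags themselves, so a wrong-but-formatted completion still collects half the reward.

```python
import re

def toy_reward(completion: str, answer: str) -> float:
    score = 0.0
    if re.search(r"<answer>.*?</answer>", completion, re.DOTALL):
        score += 0.5  # format reward
    match = re.search(r"<answer>(.*?)</answer>", completion, re.DOTALL)
    if match and match.group(1).strip() == answer.strip():
        score += 0.5  # correctness reward
    return score

toy_reward("<answer>wrong</answer>", "4")  # 0.5: formatting alone earns half the reward
```
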
+ **Deep Analysis** uses Claude to read your reward function's Python source code. It understands what the function checks, identifies logic bugs, and generates realistic wrong completions that a model might produce during training. Each completion is actually run against your function — only real exploits are reported.
+
+ **Simulate** uses Claude to generate 10 diverse completions per task, each representing a different strategy a model might learn (perfect, lazy, shortcut, hedging, garbage, etc). Scores them all against your reward function. The strategy scoreboard shows which behaviors your reward function actually incentivizes.
+
+ ---
+
+ ## What rewardprobe Is NOT
+
+ - **Not a training monitor.** We run *before* training starts.
+ - **Not a formal prover.** We find bugs empirically with concrete inputs.
+ - **Not a guarantee.** A clean report means "we tested these patterns and found nothing." The nastiest reward hacks are novel and environment-specific.
+
+ ---
+
+ ## Contributing
+
+ See [CLAUDE.md](CLAUDE.md) for architecture, how to add attacks, and how the simulator works.
+
+ ```bash
+ git clone https://github.com/rewardprobe/rewardprobe && cd rewardprobe
+ uv sync --extra dev && pytest tests/
+ ```
+
+ Apache 2.0
@@ -0,0 +1,213 @@
+ # rewardprobe
+
+ **Know what your model will learn — before you train.**
+
+ [![PyPI](https://img.shields.io/pypi/v/rewardprobe)](https://pypi.org/project/rewardprobe/)
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue)](LICENSE)
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://python.org)
+
+ ---
+
+ You write a reward function. You're about to spend $10K on a GRPO training run. rewardprobe tells you what the model will actually learn to do:
+
+ ```
+ rewardprobe simulate — production_math_rlvr
+ 50 completions across 5 tasks
+
+ 2 critical found
+
+ 1. critical
+ 'Shortcut' strategy scores 0.71
+ A model using the shortcut strategy earns 103% of what a correct
+ answer earns. It will learn to skip computation and take shortcuts
+ because that's easier AND scores higher.
+
+ 2. critical
+ 'Lazy correct' strategy scores only 0.07
+ A correct answer without formatting scores near zero. Your reward
+ function punishes correct-but-unformatted answers more than it
+ punishes wrong-but-formatted ones.
+
+ Strategy scoreboard:
+ perfect          ████████████████████ 1.00
+ correct_verbose  ████████████████████ 1.00
+ shortcut         ██████████████░░░░░░ 0.71  ← problem
+ near_miss        █████░░░░░░░░░░░░░░░ 0.29
+ format_only      █████░░░░░░░░░░░░░░░ 0.29
+ garbage          ███░░░░░░░░░░░░░░░░░ 0.18
+ correct_lazy     █░░░░░░░░░░░░░░░░░░░ 0.07  ← problem
+ ```
+
+ The **strategy scoreboard** shows exactly how your reward function scores different model behaviors. If a lazy or wrong strategy scores close to a correct one, the model will learn the lazy path. You see this in 30 seconds instead of discovering it 3 days into training.
+
+ ---
+
+ ## The Problem
+
+ You write a reward function for RL training. It looks correct. You start training. Days later, the model is gaming the reward — outputting shortcuts, copying format without thinking, or guessing. OpenAI [documented](https://openai.com/index/chain-of-thought-monitoring/) this happening with `exit(0)` and `raise SkipTest`. METR [found](https://metr.org/blog/2025-06-05-recent-reward-hacking/) frontier models monkey-patching their own graders.
+
+ The fix is to test reward functions **before** training, the same way you test code before deploying.
+
+ ---
+
+ ## Install
+
+ ```bash
+ pip install rewardprobe
+ ```
+
+ ---
+
+ ## Three Modes
+
+ ### 1. Quick Check (free, instant, no API key)
+
+ 30 deterministic probes. Catches parser bugs, edge cases, format tricks. Runs in under a second on CPU.
+
+ ```bash
+ rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl
+ ```
+
+ ```
+ rewardprobe — my_reward
+
+ 1 critical, 2 warning found
+
+ 1. critical
+ Correct answer in reasoning section scores 1.0 even when the
+ answer field contains a wrong answer.
+
+ 2. warning
+ Different scores depending on answer tag order.
+
+ 28/30 checks passed.
+ ```
+
+ ### 2. Deep Analysis (needs API key)
+
+ Claude reads your source code, understands what each function does, and generates realistic adversarial completions. Finds bugs that static probes can't.
+
+ ```bash
+ export ANTHROPIC_API_KEY=sk-...
+ rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --deep
+ ```
+
+ This adds:
+ - **Code analysis** — Claude identifies logic bugs by reading your Python code
+ - **Adversarial completions** — wrong-but-plausible model outputs tested against your function
+ - **False positive filtering** — classifies each function (correctness/format/auxiliary) so findings are precise
+
+ ### 3. Simulate (needs API key)
+
+ The flagship feature. Generates diverse completions spanning the full range of what a model might produce during training — from perfect solutions to garbage — and maps the reward landscape.
+
+ ```bash
+ rewardprobe simulate my_reward.py::my_fn --dataset tasks.jsonl
+ ```
+
+ The strategy scoreboard shows you at a glance:
+ - **Green strategies** (perfect, correct_lazy, correct_verbose) — what you WANT the model to learn
+ - **Red strategies** (shortcut, format_only, hedge, garbage) — what you DON'T want
+
+ If a red strategy scores close to or higher than a green one, your reward function has a problem.
+
+ ---
+
+ ## What We Found
+
+ We ran rewardprobe against reward functions from 4 major RL codebases plus 3 non-math domains. Results:
+
+ | Codebase | Domain | Key Finding |
+ |----------|--------|-------------|
+ | **verifiers/gsm8k** (Prime Intellect) | Math | Model can skip reasoning — `correct_lazy` scores 1.0 |
+ | **Open-R1** (HuggingFace) | Math | `first_match` mode lets models hedge with multiple answers |
+ | **verl** (ByteDance) | Math | `format_score` parameter can reward wrong answers |
+ | **willccbb GRPO gist** | Math | Returns 2.0 (outside [0,1]); rejects "42.0" for "42" |
+ | Custom code reward | Code | Off-by-one bugs score 0.83 — substring matching misses logic errors |
+ | Sentiment classifier | Text | Reasoned answers score 0.0, bare labels score 1.0 |
+
+ ---
+
+ ## Works With Any Framework
+
+ Auto-detects your reward function's signature. No configuration.
+
+ ```python
+ # Any of these just work:
+ def my_reward(completion, answer): ...                        # Raw Python
+ def accuracy_reward(completions, solution, **kwargs): ...     # TRL / GRPO
+ def correctness(prompts, completions, answer, **kwargs): ...  # TRL with prompts
+ async def correct_answer(completion, answer): ...             # verifiers
+ def compute_score(solution_str, ground_truth): ...            # ByteDance verl
+ ```
+
+ ```bash
+ rewardprobe test file.py::fn --dataset tasks.jsonl   # Just works
+ rewardprobe test environments/gsm8k.py               # verifiers environments too
+ ```
+
+ ---
+
+ ## GitHub Action
+
+ ```yaml
+ - run: pip install rewardprobe
+ - run: rewardprobe test my_reward.py::my_fn --dataset tasks.jsonl --ci
+ ```
+
+ Exit code 1 on critical findings. Add `--deep` with `ANTHROPIC_API_KEY` secret for AI analysis in CI.
+
+ ---
+
+ ## Python API
+
+ ```python
+ from rewardprobe import Probe
+
+ # Quick check
+ report = Probe().test_fn(my_reward, tasks)
+ print(report.passed)  # True / False
+
+ # Deep analysis
+ report = Probe(deep=True).test_fn(my_reward, tasks)
+
+ # Simulate
+ from rewardprobe.simulator import simulate, print_simulation
+ from rewardprobe.tier2.client import get_client
+ from rewardprobe.adapters.auto import auto_adapt
+
+ env = auto_adapt(my_reward, tasks)
+ result = simulate(env, get_client("sonnet"), n_tasks=5)
+ print_simulation(result)
+ ```
+
+ ---
+
+ ## How It Works
+
+ **Quick Check** generates adversarial inputs (empty strings, format tricks, parser exploits, wrong-but-formatted answers) and tests your reward function against them. 30 probes across 6 families, all deterministic, all on CPU.
+
+ **Deep Analysis** uses Claude to read your reward function's Python source code. It understands what the function checks, identifies logic bugs, and generates realistic wrong completions that a model might produce during training. Each completion is actually run against your function — only real exploits are reported.
+
+ **Simulate** uses Claude to generate 10 diverse completions per task, each representing a different strategy a model might learn (perfect, lazy, shortcut, hedging, garbage, etc). Scores them all against your reward function. The strategy scoreboard shows which behaviors your reward function actually incentivizes.
+
+ ---
+
+ ## What rewardprobe Is NOT
+
+ - **Not a training monitor.** We run *before* training starts.
+ - **Not a formal prover.** We find bugs empirically with concrete inputs.
+ - **Not a guarantee.** A clean report means "we tested these patterns and found nothing." The nastiest reward hacks are novel and environment-specific.
+
+ ---
+
+ ## Contributing
+
+ See [CLAUDE.md](CLAUDE.md) for architecture, how to add attacks, and how the simulator works.
+
+ ```bash
+ git clone https://github.com/rewardprobe/rewardprobe && cd rewardprobe
+ uv sync --extra dev && pytest tests/
+ ```
+
+ Apache 2.0
@@ -0,0 +1,46 @@
+ name: "rewardprobe"
+ description: "Stress-test RL reward functions before training. Catch reward hacking in CI."
+ branding:
+   icon: "shield"
+   color: "red"
+
+ inputs:
+   target:
+     description: "Path to reward function (file.py::fn) or verifiers environment (env.py)"
+     required: true
+   dataset:
+     description: "Path to JSONL tasks file (required for non-verifiers targets)"
+     required: false
+   python-version:
+     description: "Python version to use"
+     required: false
+     default: "3.12"
+   strict:
+     description: "Fail on warnings too (not just critical findings)"
+     required: false
+     default: "false"
+
+ runs:
+   using: "composite"
+   steps:
+     - name: Set up Python
+       uses: actions/setup-python@v5
+       with:
+         python-version: ${{ inputs.python-version }}
+
+     - name: Install rewardprobe
+       shell: bash
+       run: pip install rewardprobe
+
+     - name: Run rewardprobe
+       shell: bash
+       run: |
+         ARGS="rewardprobe test ${{ inputs.target }}"
+         if [ -n "${{ inputs.dataset }}" ]; then
+           ARGS="$ARGS --dataset ${{ inputs.dataset }}"
+         fi
+         ARGS="$ARGS --ci"
+         if [ "${{ inputs.strict }}" = "true" ]; then
+           ARGS="$ARGS --strict"
+         fi
+         $ARGS
@@ -0,0 +1,10 @@
+ {"prompt": "What is 2+2?", "answer": "4"}
+ {"prompt": "What is 3+3?", "answer": "6"}
+ {"prompt": "What is 5*5?", "answer": "25"}
+ {"prompt": "What is 7+3?", "answer": "10"}
+ {"prompt": "What is 6*7?", "answer": "42"}
+ {"prompt": "What is 9-4?", "answer": "5"}
+ {"prompt": "What is 100/4?", "answer": "25"}
+ {"prompt": "What is 8*2?", "answer": "16"}
+ {"prompt": "What is 15-7?", "answer": "8"}
+ {"prompt": "What is 12/3?", "answer": "4"}