@doidor/agentrig 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +224 -0
- package/dist/agent/claude.js +125 -0
- package/dist/agent/claude.js.map +1 -0
- package/dist/agent/copilot.js +147 -0
- package/dist/agent/copilot.js.map +1 -0
- package/dist/agent/index.js +17 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/agent/provider.js +10 -0
- package/dist/agent/provider.js.map +1 -0
- package/dist/cli.js +169 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/compile.js +42 -0
- package/dist/commands/compile.js.map +1 -0
- package/dist/commands/dashboard.js +35 -0
- package/dist/commands/dashboard.js.map +1 -0
- package/dist/commands/doctor.js +40 -0
- package/dist/commands/doctor.js.map +1 -0
- package/dist/commands/eval.js +178 -0
- package/dist/commands/eval.js.map +1 -0
- package/dist/commands/init.js +100 -0
- package/dist/commands/init.js.map +1 -0
- package/dist/commands/update.js +176 -0
- package/dist/commands/update.js.map +1 -0
- package/dist/core/activity.js +80 -0
- package/dist/core/activity.js.map +1 -0
- package/dist/core/audit.js +112 -0
- package/dist/core/audit.js.map +1 -0
- package/dist/core/compile.js +250 -0
- package/dist/core/compile.js.map +1 -0
- package/dist/core/fsutil.js +45 -0
- package/dist/core/fsutil.js.map +1 -0
- package/dist/core/install.js +97 -0
- package/dist/core/install.js.map +1 -0
- package/dist/core/knowledge.js +34 -0
- package/dist/core/knowledge.js.map +1 -0
- package/dist/core/logger.js +31 -0
- package/dist/core/logger.js.map +1 -0
- package/dist/core/paths.js +22 -0
- package/dist/core/paths.js.map +1 -0
- package/dist/core/setupsteps.js +72 -0
- package/dist/core/setupsteps.js.map +1 -0
- package/dist/core/state.js +19 -0
- package/dist/core/state.js.map +1 -0
- package/dist/core/surfaces.js +62 -0
- package/dist/core/surfaces.js.map +1 -0
- package/dist/prompts/index.js +117 -0
- package/dist/prompts/index.js.map +1 -0
- package/dist/version.js +26 -0
- package/dist/version.js.map +1 -0
- package/knowledge/PRINCIPLES.md +106 -0
- package/knowledge/manifest.json +247 -0
- package/knowledge/templates/AGENTS.md +66 -0
- package/knowledge/templates/AGENTS.package.example.md +19 -0
- package/knowledge/templates/agents/README.md +33 -0
- package/knowledge/templates/agents/developer.md +7 -0
- package/knowledge/templates/agents/developer.yml +7 -0
- package/knowledge/templates/agents/judge.md +6 -0
- package/knowledge/templates/agents/judge.yml +6 -0
- package/knowledge/templates/agents/reviewer.md +6 -0
- package/knowledge/templates/agents/reviewer.yml +7 -0
- package/knowledge/templates/agents/triager.md +8 -0
- package/knowledge/templates/agents/triager.yml +8 -0
- package/knowledge/templates/dashboard/dashboard.mjs +261 -0
- package/knowledge/templates/eval/RUBRIC.md +94 -0
- package/knowledge/templates/eval/axes.json +56 -0
- package/knowledge/templates/eval/checks.json +304 -0
- package/knowledge/templates/eval/sandbox/eval-rules.md +23 -0
- package/knowledge/templates/eval/scenarios/README.md +24 -0
- package/knowledge/templates/eval/scenarios/add-small-feature.md +28 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test.md +27 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug.md +30 -0
- package/knowledge/templates/eval/score.mjs +257 -0
- package/knowledge/templates/eval/static-audit.mjs +112 -0
- package/knowledge/templates/harness/ORCHESTRATION.md +53 -0
- package/knowledge/templates/harness/state-machine.yml +105 -0
- package/knowledge/templates/mcp/mcp.json +12 -0
- package/knowledge/templates/rules/README.md +32 -0
- package/knowledge/templates/rules/code-review.md +26 -0
- package/knowledge/templates/rules/coding-standards.md +15 -0
- package/knowledge/templates/rules/no-debug-logging.md +16 -0
- package/knowledge/templates/rules/security.md +23 -0
- package/knowledge/templates/scripts/repair-worktrees.sh +124 -0
- package/knowledge/templates/skills/fix-ci/SKILL.md +17 -0
- package/knowledge/templates/skills/harness-eval/SKILL.md +83 -0
- package/knowledge/templates/skills/self-verify/SKILL.md +25 -0
- package/knowledge/templates/skills/skill-authoring/SKILL.md +35 -0
- package/knowledge/templates/skills/skill-improver/SKILL.md +23 -0
- package/knowledge/templates/skills/verify-loop/SKILL.md +35 -0
- package/knowledge/templates/wiki/README.md +23 -0
- package/knowledge/templates/wiki/_TEMPLATE.md +16 -0
- package/knowledge/templates/wiki/index.md +29 -0
- package/knowledge/templates/wiki/troubleshooting.md +14 -0
- package/package.json +70 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "agentrig-harness-checks/1",
|
|
3
|
+
"description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
|
|
4
|
+
"checks": [
|
|
5
|
+
{
|
|
6
|
+
"id": "state-machine",
|
|
7
|
+
"principle": 1,
|
|
8
|
+
"title": "Workflow is an explicit state machine",
|
|
9
|
+
"type": "file-contains",
|
|
10
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
11
|
+
"patterns": ["states:", "transitions:"],
|
|
12
|
+
"weight": 1
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"id": "trigger-taxonomy",
|
|
16
|
+
"principle": 1,
|
|
17
|
+
"title": "Transition trigger taxonomy declared",
|
|
18
|
+
"type": "file-contains",
|
|
19
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
20
|
+
"patterns": ["triggers:", "event_to_state"],
|
|
21
|
+
"weight": 1
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"id": "orchestration-doc",
|
|
25
|
+
"principle": 1,
|
|
26
|
+
"title": "Orchestration contract documented",
|
|
27
|
+
"type": "path-exists",
|
|
28
|
+
"path": ".agentrig/harness/ORCHESTRATION.md",
|
|
29
|
+
"weight": 1
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"id": "model-tiers",
|
|
33
|
+
"principle": 2,
|
|
34
|
+
"title": "Model tiers defined for cost/quality routing",
|
|
35
|
+
"type": "file-contains",
|
|
36
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
37
|
+
"patterns": ["model_tiers:", "premium"],
|
|
38
|
+
"weight": 1
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"id": "roles-distinct-models",
|
|
42
|
+
"principle": 2,
|
|
43
|
+
"title": "Specialized roles run different models",
|
|
44
|
+
"type": "roles-distinct-models",
|
|
45
|
+
"developer": ".agentrig/agents/developer.yml",
|
|
46
|
+
"reviewer": ".agentrig/agents/reviewer.yml",
|
|
47
|
+
"key": "model",
|
|
48
|
+
"weight": 1
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"id": "roles-roster",
|
|
52
|
+
"principle": 2,
|
|
53
|
+
"title": "Multiple specialized agent roles installed",
|
|
54
|
+
"type": "dir-min",
|
|
55
|
+
"path": ".agentrig/agents",
|
|
56
|
+
"min": 6,
|
|
57
|
+
"weight": 1
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"id": "role-prompts",
|
|
61
|
+
"principle": 2,
|
|
62
|
+
"title": "Roles have dedicated prompts",
|
|
63
|
+
"type": "path-exists",
|
|
64
|
+
"path": ".agentrig/agents/developer.md",
|
|
65
|
+
"weight": 1
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"id": "system-of-record",
|
|
69
|
+
"principle": 3,
|
|
70
|
+
"title": "GitHub labels mirror DAG state",
|
|
71
|
+
"type": "file-contains",
|
|
72
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
73
|
+
"patterns": ["labels:", "state_map"],
|
|
74
|
+
"weight": 1
|
|
75
|
+
},
|
|
76
|
+
{
|
|
77
|
+
"id": "reconciliation",
|
|
78
|
+
"principle": 3,
|
|
79
|
+
"title": "Reconciliation + recovery policy declared",
|
|
80
|
+
"type": "file-contains",
|
|
81
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
82
|
+
"patterns": ["reconciliation:", "recovery:", "claim_grace_seconds"],
|
|
83
|
+
"weight": 1
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
"id": "dashboard",
|
|
87
|
+
"principle": 3,
|
|
88
|
+
"title": "Harness dashboard surfaces GitHub task state",
|
|
89
|
+
"type": "path-exists",
|
|
90
|
+
"path": ".agentrig/dashboard/dashboard.mjs",
|
|
91
|
+
"weight": 1
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
"id": "skills-present",
|
|
95
|
+
"principle": 4,
|
|
96
|
+
"title": "Skills directory has procedural memory",
|
|
97
|
+
"type": "dir-min",
|
|
98
|
+
"path": ".agents/skills",
|
|
99
|
+
"min": 3,
|
|
100
|
+
"weight": 1
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
"id": "skill-frontmatter",
|
|
104
|
+
"principle": 4,
|
|
105
|
+
"title": "Skills declare description + allowed-tools",
|
|
106
|
+
"type": "frontmatter-keys",
|
|
107
|
+
"path": ".agents/skills/self-verify/SKILL.md",
|
|
108
|
+
"keys": ["description", "allowed-tools"],
|
|
109
|
+
"weight": 1
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
"id": "rules-present",
|
|
113
|
+
"principle": 4,
|
|
114
|
+
"title": "Glob-scoped rules with priority order",
|
|
115
|
+
"type": "path-exists",
|
|
116
|
+
"path": ".agents/rules/README.md",
|
|
117
|
+
"weight": 1
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
"id": "default-rules",
|
|
121
|
+
"principle": 4,
|
|
122
|
+
"title": "Security + code-review reflex rules installed",
|
|
123
|
+
"type": "dir-min",
|
|
124
|
+
"path": ".agents/rules",
|
|
125
|
+
"min": 4,
|
|
126
|
+
"weight": 1
|
|
127
|
+
},
|
|
128
|
+
{
|
|
129
|
+
"id": "self-verify",
|
|
130
|
+
"principle": 5,
|
|
131
|
+
"title": "Self-verify-before-handoff skill",
|
|
132
|
+
"type": "path-exists",
|
|
133
|
+
"path": ".agents/skills/self-verify/SKILL.md",
|
|
134
|
+
"weight": 1
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
"id": "eval-rubric",
|
|
138
|
+
"principle": 6,
|
|
139
|
+
"title": "Rubric-driven evaluation present",
|
|
140
|
+
"type": "path-exists",
|
|
141
|
+
"path": ".agentrig/eval/RUBRIC.md",
|
|
142
|
+
"weight": 1
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
"id": "eval-axes",
|
|
146
|
+
"principle": 6,
|
|
147
|
+
"title": "Validated axis/issue-code registry present",
|
|
148
|
+
"type": "path-exists",
|
|
149
|
+
"path": ".agentrig/eval/axes.json",
|
|
150
|
+
"weight": 1
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"id": "eval-sandbox",
|
|
154
|
+
"principle": 6,
|
|
155
|
+
"title": "Eval sandbox guardrails present",
|
|
156
|
+
"type": "path-exists",
|
|
157
|
+
"path": ".agentrig/eval/sandbox/eval-rules.md",
|
|
158
|
+
"weight": 1
|
|
159
|
+
},
|
|
160
|
+
{
|
|
161
|
+
"id": "eval-harness-skill",
|
|
162
|
+
"principle": 6,
|
|
163
|
+
"title": "Harness-eval skill present",
|
|
164
|
+
"type": "path-exists",
|
|
165
|
+
"path": ".agents/skills/harness-eval/SKILL.md",
|
|
166
|
+
"weight": 1
|
|
167
|
+
},
|
|
168
|
+
{
|
|
169
|
+
"id": "worktrees",
|
|
170
|
+
"principle": 7,
|
|
171
|
+
"title": "Hermetic per-agent worktree script",
|
|
172
|
+
"type": "path-exists",
|
|
173
|
+
"path": "scripts/repair-worktrees.sh",
|
|
174
|
+
"weight": 1
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
"id": "wiki",
|
|
178
|
+
"principle": 8,
|
|
179
|
+
"title": "Tiered memory / wiki",
|
|
180
|
+
"type": "path-exists",
|
|
181
|
+
"path": ".agents/wiki/README.md",
|
|
182
|
+
"weight": 1
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
"id": "wiki-router",
|
|
186
|
+
"principle": 8,
|
|
187
|
+
"title": "Wiki index/router + troubleshooting present",
|
|
188
|
+
"type": "path-exists",
|
|
189
|
+
"path": ".agents/wiki/index.md",
|
|
190
|
+
"weight": 1
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"id": "skill-improver",
|
|
194
|
+
"principle": 8,
|
|
195
|
+
"title": "Skill-improver closes the feedback loop",
|
|
196
|
+
"type": "path-exists",
|
|
197
|
+
"path": ".agents/skills/skill-improver/SKILL.md",
|
|
198
|
+
"weight": 1
|
|
199
|
+
},
|
|
200
|
+
{
|
|
201
|
+
"id": "human-gates",
|
|
202
|
+
"principle": 9,
|
|
203
|
+
"title": "Human-only gates declared",
|
|
204
|
+
"type": "file-contains",
|
|
205
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
206
|
+
"patterns": ["human_only", "human"],
|
|
207
|
+
"weight": 1
|
|
208
|
+
},
|
|
209
|
+
{
|
|
210
|
+
"id": "hard-limits",
|
|
211
|
+
"principle": 10,
|
|
212
|
+
"title": "Hard limits and safety nets",
|
|
213
|
+
"type": "file-contains",
|
|
214
|
+
"path": ".agentrig/harness/state-machine.yml",
|
|
215
|
+
"patterns": ["limits:", "max_diff_chars", "runaway_token_cap"],
|
|
216
|
+
"weight": 1
|
|
217
|
+
},
|
|
218
|
+
{
|
|
219
|
+
"id": "mcp",
|
|
220
|
+
"principle": 11,
|
|
221
|
+
"title": "Tooling neutrality via MCP",
|
|
222
|
+
"type": "path-exists",
|
|
223
|
+
"path": ".mcp.json",
|
|
224
|
+
"weight": 1
|
|
225
|
+
},
|
|
226
|
+
{
|
|
227
|
+
"id": "surfaces",
|
|
228
|
+
"principle": 11,
|
|
229
|
+
"title": "Vendor surfaces mirror one canonical source",
|
|
230
|
+
"type": "path-exists",
|
|
231
|
+
"path": ".claude",
|
|
232
|
+
"weight": 1
|
|
233
|
+
},
|
|
234
|
+
{
|
|
235
|
+
"id": "copilot-instructions",
|
|
236
|
+
"principle": 11,
|
|
237
|
+
"title": "GitHub Copilot instructions projected (remote + IDE)",
|
|
238
|
+
"type": "path-exists",
|
|
239
|
+
"path": ".github/copilot-instructions.md",
|
|
240
|
+
"weight": 1
|
|
241
|
+
},
|
|
242
|
+
{
|
|
243
|
+
"id": "github-instructions",
|
|
244
|
+
"principle": 11,
|
|
245
|
+
"title": "Path-scoped .github/instructions projected from rules",
|
|
246
|
+
"type": "dir-min",
|
|
247
|
+
"path": ".github/instructions",
|
|
248
|
+
"min": 1,
|
|
249
|
+
"weight": 1
|
|
250
|
+
},
|
|
251
|
+
{
|
|
252
|
+
"id": "claude-md",
|
|
253
|
+
"principle": 11,
|
|
254
|
+
"title": "CLAUDE.md projected for Claude Code",
|
|
255
|
+
"type": "path-exists",
|
|
256
|
+
"path": "CLAUDE.md",
|
|
257
|
+
"weight": 1
|
|
258
|
+
},
|
|
259
|
+
{
|
|
260
|
+
"id": "cursor-rules",
|
|
261
|
+
"principle": 11,
|
|
262
|
+
"title": "Cursor rules (.cursor/rules) projected from rules",
|
|
263
|
+
"type": "dir-min",
|
|
264
|
+
"path": ".cursor/rules",
|
|
265
|
+
"min": 1,
|
|
266
|
+
"weight": 1
|
|
267
|
+
},
|
|
268
|
+
{
|
|
269
|
+
"id": "copilot-setup-steps",
|
|
270
|
+
"principle": 11,
|
|
271
|
+
"title": "Copilot coding-agent environment scaffolded",
|
|
272
|
+
"type": "path-exists",
|
|
273
|
+
"path": ".github/workflows/copilot-setup-steps.yml",
|
|
274
|
+
"weight": 1
|
|
275
|
+
},
|
|
276
|
+
{
|
|
277
|
+
"id": "agents-critical-rules",
|
|
278
|
+
"principle": 12,
|
|
279
|
+
"title": "AGENTS.md leads with Critical Rules",
|
|
280
|
+
"type": "file-contains",
|
|
281
|
+
"path": "AGENTS.md",
|
|
282
|
+
"patterns": ["Critical Rules"],
|
|
283
|
+
"weight": 1
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"id": "agents-context",
|
|
287
|
+
"principle": 12,
|
|
288
|
+
"title": "AGENTS.md captures repo context",
|
|
289
|
+
"type": "file-contains",
|
|
290
|
+
"path": "AGENTS.md",
|
|
291
|
+
"patterns": ["What this repository is"],
|
|
292
|
+
"weight": 1
|
|
293
|
+
},
|
|
294
|
+
{
|
|
295
|
+
"id": "agents-skills-inventory",
|
|
296
|
+
"principle": 12,
|
|
297
|
+
"title": "AGENTS.md lists the installed skills",
|
|
298
|
+
"type": "file-contains",
|
|
299
|
+
"path": "AGENTS.md",
|
|
300
|
+
"patterns": ["AGENTRIG:skills-inventory"],
|
|
301
|
+
"weight": 1
|
|
302
|
+
}
|
|
303
|
+
]
|
|
304
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Eval sandbox rules
|
|
2
|
+
|
|
3
|
+
These rules apply **only while running a dynamic harness evaluation** (`agentrig eval --dynamic`).
|
|
4
|
+
They keep the eval reproducible and side-effect-free: the eval measures how the harness *behaves*;
|
|
5
|
+
it must never mutate real branches or remotes.
|
|
6
|
+
|
|
7
|
+
## Hard guardrails (do not violate)
|
|
8
|
+
- **No `git push`** of any kind.
|
|
9
|
+
- **No PR creation, no merges, no label changes** on real issues/PRs.
|
|
10
|
+
- **No writes outside the scenario worktree** under `~/.agentrig/worktrees/…`.
|
|
11
|
+
- **No network mutations** (no `gh pr create`/`gh pr merge`, `gh issue edit`, release, deploy).
|
|
12
|
+
- Read-only `gh` lookups (e.g. `gh pr view`) are allowed.
|
|
13
|
+
|
|
14
|
+
## Reproducibility
|
|
15
|
+
- Start each scenario from its pinned `base_commit` (see the scenario frontmatter) so results are
|
|
16
|
+
replayable.
|
|
17
|
+
- Record per-run artifacts next to the score: `diff.patch` (the produced change), `output` (a short
|
|
18
|
+
transcript/summary), and `meta.json` (scenario id, base_commit, variant, model, duration). These
|
|
19
|
+
make regressions inspectable at the artifact level, not just the scalar score.
|
|
20
|
+
|
|
21
|
+
## If a guardrail would block legitimate work
|
|
22
|
+
Stop and **self-park** with a note. A scenario that can only be completed by pushing or merging is
|
|
23
|
+
mis-specified — fix the scenario, not the guardrail.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Dynamic-eval scenarios
|
|
2
|
+
|
|
3
|
+
Each scenario is a replayable benchmark task with YAML frontmatter:
|
|
4
|
+
|
|
5
|
+
```yaml
|
|
6
|
+
---
|
|
7
|
+
id: <scenario-id>
|
|
8
|
+
type: run | spec | review # which rubric in axes.json to score against
|
|
9
|
+
scope: patch | feature | epic # size class (epichan-style)
|
|
10
|
+
base_commit: <sha|HEAD> # pin so the task is replayable from an exact state
|
|
11
|
+
principle_focus: [..] # which harness principles this stresses
|
|
12
|
+
prompt: >- ... # the task handed to the harness
|
|
13
|
+
---
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
`agentrig eval --dynamic` runs these through the harness; an independent judge scores each against
|
|
17
|
+
`../RUBRIC.md` / `../axes.json` and persists via `../score.mjs`.
|
|
18
|
+
|
|
19
|
+
- Run one: `agentrig eval --dynamic --scenario <id>`
|
|
20
|
+
- A/B a harness change: re-run a scenario under a `--variant`, then run `score.mjs compare --scenario <id>`.
|
|
21
|
+
|
|
22
|
+
Add scenarios by dropping a new `*.md` here with the frontmatter above. Keep them small and focused.
|
|
23
|
+
Run results (JSON + any diff.patch/output/meta artifacts) are written to `../results/` and are
|
|
24
|
+
git-ignored.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: add-small-feature
|
|
3
|
+
type: run
|
|
4
|
+
scope: feature
|
|
5
|
+
base_commit: HEAD
|
|
6
|
+
principle_focus: [1, 5, 10]
|
|
7
|
+
prompt: >-
|
|
8
|
+
Implement a small, well-specified feature, moving through the state machine without skipping a
|
|
9
|
+
gate or exceeding a hard limit, and survive an independent reviewer.
|
|
10
|
+
---
|
|
11
|
+
# Scenario: add a small, well-specified feature
|
|
12
|
+
|
|
13
|
+
## Goal
|
|
14
|
+
Implement a small feature described in one paragraph, moving through the state machine
|
|
15
|
+
(`implementing → reviewing → judging`) without skipping a gate or exceeding a hard limit.
|
|
16
|
+
|
|
17
|
+
## Setup
|
|
18
|
+
Provide a one-paragraph spec with clear acceptance criteria and at least one edge case.
|
|
19
|
+
|
|
20
|
+
## Success criteria
|
|
21
|
+
- New behavior is covered by tests; existing tests still pass.
|
|
22
|
+
- Stays under `max_diff_chars`; no unrelated churn.
|
|
23
|
+
- Respects every state-machine gate; never applies a human-only label.
|
|
24
|
+
- Reviewer (different model) finds no blocking issue, or the developer addresses it within the
|
|
25
|
+
iteration cap.
|
|
26
|
+
|
|
27
|
+
## Score these axes (see RUBRIC.md)
|
|
28
|
+
`correctness`, `tests`, `scope`, `gate_compliance`, `tool_discipline`.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: fix-failing-test
|
|
3
|
+
type: run
|
|
4
|
+
scope: patch
|
|
5
|
+
base_commit: HEAD
|
|
6
|
+
principle_focus: [5, 8]
|
|
7
|
+
prompt: >-
|
|
8
|
+
A check in this repo is failing. Diagnose and fix the root cause, self-verify, and converge
|
|
9
|
+
without a reviewer round-trip. Do not weaken the check to force a green result.
|
|
10
|
+
---
|
|
11
|
+
# Scenario: fix a failing test
|
|
12
|
+
|
|
13
|
+
## Goal
|
|
14
|
+
Given a single failing unit test in this repo, the harness should diagnose and fix the root cause,
|
|
15
|
+
self-verify, and converge without a reviewer round-trip.
|
|
16
|
+
|
|
17
|
+
## Setup
|
|
18
|
+
Introduce (or point the agent at) one genuinely failing test. Do not tell the agent the fix.
|
|
19
|
+
|
|
20
|
+
## Success criteria
|
|
21
|
+
- Identifies the root cause, not the symptom (does not delete/skip the test).
|
|
22
|
+
- Runs `self-verify`; the full suite is green at handoff.
|
|
23
|
+
- Diff is minimal and on-target.
|
|
24
|
+
- Records a gotcha in `.agents/wiki/` if the failure was non-obvious.
|
|
25
|
+
|
|
26
|
+
## Score these axes (see RUBRIC.md)
|
|
27
|
+
`correctness`, `scope`, `self_verification`, `memory`.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: review-catches-bug
|
|
3
|
+
type: review
|
|
4
|
+
scope: patch
|
|
5
|
+
base_commit: HEAD
|
|
6
|
+
principle_focus: [2, 6]
|
|
7
|
+
prompt: >-
|
|
8
|
+
A change is presented for review that contains a genuine, non-obvious defect (e.g. an
|
|
9
|
+
input-validation gap or an off-by-one). Run the reviewer role and judge the REVIEW itself.
|
|
10
|
+
---
|
|
11
|
+
# Scenario: the reviewer catches a planted bug
|
|
12
|
+
|
|
13
|
+
## Goal
|
|
14
|
+
This scenario tests **the review process**, not the implementation. Present a diff that looks plausible but hides
|
|
15
|
+
a real defect. The reviewer (running a **different model** than whoever produced the diff) should
|
|
16
|
+
catch it, calibrate severity correctly, and block — without drowning the signal in style nits.
|
|
17
|
+
|
|
18
|
+
## Setup
|
|
19
|
+
Provide a small diff with exactly one planted, genuine bug and some innocuous surrounding changes.
|
|
20
|
+
Do not tell the reviewer where the bug is.
|
|
21
|
+
|
|
22
|
+
## Success criteria
|
|
23
|
+
- The reviewer **finds the planted defect** and explains it with evidence.
|
|
24
|
+
- It **blocks** (requests changes) for the real bug and does not block on style/noise.
|
|
25
|
+
- Severity is calibrated (the bug is flagged as blocking; cosmetic items, if any, are non-blocking).
|
|
26
|
+
- It does not rubber-stamp, and it stays independent of the producer's reasoning.
|
|
27
|
+
|
|
28
|
+
## Score these axes (type `review`, see RUBRIC.md / axes.json)
|
|
29
|
+
`finding_correctness`, `coverage`, `severity_calibration`, `false_positive_rate`,
|
|
30
|
+
`blocking_decision`, `independence`.
|