@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +62 -27
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +30 -5
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/doctor.js +53 -8
  7. package/dist/commands/doctor.js.map +1 -1
  8. package/dist/commands/eval-dynamic.js +316 -0
  9. package/dist/commands/eval-dynamic.js.map +1 -0
  10. package/dist/commands/eval-scaffold.js +173 -0
  11. package/dist/commands/eval-scaffold.js.map +1 -0
  12. package/dist/commands/eval.js +184 -55
  13. package/dist/commands/eval.js.map +1 -1
  14. package/dist/core/audit.js +237 -9
  15. package/dist/core/audit.js.map +1 -1
  16. package/dist/core/model-family.js +31 -0
  17. package/dist/core/model-family.js.map +1 -0
  18. package/dist/core/scenario-runner.js +298 -0
  19. package/dist/core/scenario-runner.js.map +1 -0
  20. package/dist/prompts/index.js +121 -30
  21. package/dist/prompts/index.js.map +1 -1
  22. package/knowledge/PRINCIPLES.md +2 -2
  23. package/knowledge/manifest.json +16 -1
  24. package/knowledge/templates/AGENTS.md +7 -6
  25. package/knowledge/templates/agents/README.md +4 -4
  26. package/knowledge/templates/agents/developer.yml +1 -1
  27. package/knowledge/templates/agents/judge.yml +1 -1
  28. package/knowledge/templates/agents/reviewer.yml +1 -1
  29. package/knowledge/templates/agents/triager.yml +5 -4
  30. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  31. package/knowledge/templates/eval/RUBRIC.md +87 -64
  32. package/knowledge/templates/eval/axes.json +25 -25
  33. package/knowledge/templates/eval/calibration/README.md +54 -0
  34. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  35. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  36. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  37. package/knowledge/templates/eval/checks.json +88 -11
  38. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  39. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  40. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  41. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  42. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  43. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  44. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  45. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  46. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  47. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  48. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  49. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  50. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  51. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  52. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  53. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  54. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  55. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  56. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  57. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  58. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  59. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  60. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  61. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  62. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  63. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  64. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  65. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  66. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  67. package/knowledge/templates/eval/score.mjs +368 -42
  68. package/knowledge/templates/eval/static-audit.mjs +204 -17
  69. package/knowledge/templates/harness/state-machine.yml +18 -12
  70. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  71. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  72. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  73. package/package.json +4 -3
  74. package/knowledge/templates/eval/scenarios/README.md +0 -24
  75. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  76. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  77. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -1,14 +1,16 @@
1
1
  {
2
2
  "$schema": "agentrig-harness-checks/1",
3
- "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
3
+ "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Two layers: \"completeness\" (file/dir structure) and \"quality\" (content sanity probes). Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
4
4
  "checks": [
5
5
  {
6
6
  "id": "state-machine",
7
7
  "principle": 1,
8
- "title": "Workflow is an explicit state machine",
9
- "type": "file-contains",
8
+ "title": "Workflow is an explicit, connected state machine (DAG with queued→merged path)",
9
+ "type": "state-machine-dag",
10
10
  "path": ".agentrig/harness/state-machine.yml",
11
- "patterns": ["states:", "transitions:"],
11
+ "minStates": 6,
12
+ "requirePath": "queued->merged",
13
+ "layer": "completeness",
12
14
  "weight": 1
13
15
  },
14
16
  {
@@ -18,6 +20,7 @@
18
20
  "type": "file-contains",
19
21
  "path": ".agentrig/harness/state-machine.yml",
20
22
  "patterns": ["triggers:", "event_to_state"],
23
+ "layer": "completeness",
21
24
  "weight": 1
22
25
  },
23
26
  {
@@ -26,6 +29,7 @@
26
29
  "title": "Orchestration contract documented",
27
30
  "type": "path-exists",
28
31
  "path": ".agentrig/harness/ORCHESTRATION.md",
32
+ "layer": "completeness",
29
33
  "weight": 1
30
34
  },
31
35
  {
@@ -35,16 +39,18 @@
35
39
  "type": "file-contains",
36
40
  "path": ".agentrig/harness/state-machine.yml",
37
41
  "patterns": ["model_tiers:", "premium"],
42
+ "layer": "completeness",
38
43
  "weight": 1
39
44
  },
40
45
  {
41
- "id": "roles-distinct-models",
46
+ "id": "roles-distinct-families",
42
47
  "principle": 2,
43
- "title": "Specialized roles run different models",
44
- "type": "roles-distinct-models",
48
+ "title": "Developer and reviewer use DIFFERENT model families (not just different ids)",
49
+ "type": "roles-distinct-families",
45
50
  "developer": ".agentrig/agents/developer.yml",
46
51
  "reviewer": ".agentrig/agents/reviewer.yml",
47
52
  "key": "model",
53
+ "layer": "quality",
48
54
  "weight": 1
49
55
  },
50
56
  {
@@ -54,6 +60,7 @@
54
60
  "type": "dir-min",
55
61
  "path": ".agentrig/agents",
56
62
  "min": 6,
63
+ "layer": "completeness",
57
64
  "weight": 1
58
65
  },
59
66
  {
@@ -62,6 +69,7 @@
62
69
  "title": "Roles have dedicated prompts",
63
70
  "type": "path-exists",
64
71
  "path": ".agentrig/agents/developer.md",
72
+ "layer": "completeness",
65
73
  "weight": 1
66
74
  },
67
75
  {
@@ -71,6 +79,7 @@
71
79
  "type": "file-contains",
72
80
  "path": ".agentrig/harness/state-machine.yml",
73
81
  "patterns": ["labels:", "state_map"],
82
+ "layer": "completeness",
74
83
  "weight": 1
75
84
  },
76
85
  {
@@ -80,6 +89,7 @@
80
89
  "type": "file-contains",
81
90
  "path": ".agentrig/harness/state-machine.yml",
82
91
  "patterns": ["reconciliation:", "recovery:", "claim_grace_seconds"],
92
+ "layer": "completeness",
83
93
  "weight": 1
84
94
  },
85
95
  {
@@ -88,6 +98,7 @@
88
98
  "title": "Harness dashboard surfaces GitHub task state",
89
99
  "type": "path-exists",
90
100
  "path": ".agentrig/dashboard/dashboard.mjs",
101
+ "layer": "completeness",
91
102
  "weight": 1
92
103
  },
93
104
  {
@@ -97,15 +108,18 @@
97
108
  "type": "dir-min",
98
109
  "path": ".agents/skills",
99
110
  "min": 3,
111
+ "layer": "completeness",
100
112
  "weight": 1
101
113
  },
102
114
  {
103
- "id": "skill-frontmatter",
115
+ "id": "skill-frontmatter-all",
104
116
  "principle": 4,
105
- "title": "Skills declare description + allowed-tools",
106
- "type": "frontmatter-keys",
107
- "path": ".agents/skills/self-verify/SKILL.md",
117
+ "title": "Every skill declares description + allowed-tools (not just self-verify)",
118
+ "type": "frontmatter-keys-all",
119
+ "path": ".agents/skills",
120
+ "file": "SKILL.md",
108
121
  "keys": ["description", "allowed-tools"],
122
+ "layer": "quality",
109
123
  "weight": 1
110
124
  },
111
125
  {
@@ -114,6 +128,7 @@
114
128
  "title": "Glob-scoped rules with priority order",
115
129
  "type": "path-exists",
116
130
  "path": ".agents/rules/README.md",
131
+ "layer": "completeness",
117
132
  "weight": 1
118
133
  },
119
134
  {
@@ -123,6 +138,7 @@
123
138
  "type": "dir-min",
124
139
  "path": ".agents/rules",
125
140
  "min": 4,
141
+ "layer": "completeness",
126
142
  "weight": 1
127
143
  },
128
144
  {
@@ -131,6 +147,7 @@
131
147
  "title": "Self-verify-before-handoff skill",
132
148
  "type": "path-exists",
133
149
  "path": ".agents/skills/self-verify/SKILL.md",
150
+ "layer": "completeness",
134
151
  "weight": 1
135
152
  },
136
153
  {
@@ -139,6 +156,7 @@
139
156
  "title": "Rubric-driven evaluation present",
140
157
  "type": "path-exists",
141
158
  "path": ".agentrig/eval/RUBRIC.md",
159
+ "layer": "completeness",
142
160
  "weight": 1
143
161
  },
144
162
  {
@@ -147,6 +165,27 @@
147
165
  "title": "Validated axis/issue-code registry present",
148
166
  "type": "path-exists",
149
167
  "path": ".agentrig/eval/axes.json",
168
+ "layer": "completeness",
169
+ "weight": 1
170
+ },
171
+ {
172
+ "id": "eval-axes-coherent",
173
+ "principle": 6,
174
+ "title": "axes.json has at least one issue code per axis",
175
+ "type": "quality-probe",
176
+ "probe": "axes-json-coherent",
177
+ "path": ".agentrig/eval/axes.json",
178
+ "layer": "quality",
179
+ "weight": 1
180
+ },
181
+ {
182
+ "id": "eval-checks-coherent",
183
+ "principle": 6,
184
+ "title": "checks.json has unique ids and only known check types",
185
+ "type": "quality-probe",
186
+ "probe": "checks-json-coherent",
187
+ "path": ".agentrig/eval/checks.json",
188
+ "layer": "quality",
150
189
  "weight": 1
151
190
  },
152
191
  {
@@ -155,6 +194,7 @@
155
194
  "title": "Eval sandbox guardrails present",
156
195
  "type": "path-exists",
157
196
  "path": ".agentrig/eval/sandbox/eval-rules.md",
197
+ "layer": "completeness",
158
198
  "weight": 1
159
199
  },
160
200
  {
@@ -163,6 +203,7 @@
163
203
  "title": "Harness-eval skill present",
164
204
  "type": "path-exists",
165
205
  "path": ".agents/skills/harness-eval/SKILL.md",
206
+ "layer": "completeness",
166
207
  "weight": 1
167
208
  },
168
209
  {
@@ -171,6 +212,7 @@
171
212
  "title": "Hermetic per-agent worktree script",
172
213
  "type": "path-exists",
173
214
  "path": "scripts/repair-worktrees.sh",
215
+ "layer": "completeness",
174
216
  "weight": 1
175
217
  },
176
218
  {
@@ -179,6 +221,7 @@
179
221
  "title": "Tiered memory / wiki",
180
222
  "type": "path-exists",
181
223
  "path": ".agents/wiki/README.md",
224
+ "layer": "completeness",
182
225
  "weight": 1
183
226
  },
184
227
  {
@@ -187,6 +230,7 @@
187
230
  "title": "Wiki index/router + troubleshooting present",
188
231
  "type": "path-exists",
189
232
  "path": ".agents/wiki/index.md",
233
+ "layer": "completeness",
190
234
  "weight": 1
191
235
  },
192
236
  {
@@ -195,6 +239,7 @@
195
239
  "title": "Skill-improver closes the feedback loop",
196
240
  "type": "path-exists",
197
241
  "path": ".agents/skills/skill-improver/SKILL.md",
242
+ "layer": "completeness",
198
243
  "weight": 1
199
244
  },
200
245
  {
@@ -204,6 +249,7 @@
204
249
  "type": "file-contains",
205
250
  "path": ".agentrig/harness/state-machine.yml",
206
251
  "patterns": ["human_only", "human"],
252
+ "layer": "completeness",
207
253
  "weight": 1
208
254
  },
209
255
  {
@@ -213,6 +259,7 @@
213
259
  "type": "file-contains",
214
260
  "path": ".agentrig/harness/state-machine.yml",
215
261
  "patterns": ["limits:", "max_diff_chars", "runaway_token_cap"],
262
+ "layer": "completeness",
216
263
  "weight": 1
217
264
  },
218
265
  {
@@ -221,6 +268,7 @@
221
268
  "title": "Tooling neutrality via MCP",
222
269
  "type": "path-exists",
223
270
  "path": ".mcp.json",
271
+ "layer": "completeness",
224
272
  "weight": 1
225
273
  },
226
274
  {
@@ -229,6 +277,7 @@
229
277
  "title": "Vendor surfaces mirror one canonical source",
230
278
  "type": "path-exists",
231
279
  "path": ".claude",
280
+ "layer": "completeness",
232
281
  "weight": 1
233
282
  },
234
283
  {
@@ -237,6 +286,7 @@
237
286
  "title": "GitHub Copilot instructions projected (remote + IDE)",
238
287
  "type": "path-exists",
239
288
  "path": ".github/copilot-instructions.md",
289
+ "layer": "completeness",
240
290
  "weight": 1
241
291
  },
242
292
  {
@@ -246,6 +296,7 @@
246
296
  "type": "dir-min",
247
297
  "path": ".github/instructions",
248
298
  "min": 1,
299
+ "layer": "completeness",
249
300
  "weight": 1
250
301
  },
251
302
  {
@@ -254,6 +305,7 @@
254
305
  "title": "CLAUDE.md projected for Claude Code",
255
306
  "type": "path-exists",
256
307
  "path": "CLAUDE.md",
308
+ "layer": "completeness",
257
309
  "weight": 1
258
310
  },
259
311
  {
@@ -263,6 +315,7 @@
263
315
  "type": "dir-min",
264
316
  "path": ".cursor/rules",
265
317
  "min": 1,
318
+ "layer": "completeness",
266
319
  "weight": 1
267
320
  },
268
321
  {
@@ -271,6 +324,7 @@
271
324
  "title": "Copilot coding-agent environment scaffolded",
272
325
  "type": "path-exists",
273
326
  "path": ".github/workflows/copilot-setup-steps.yml",
327
+ "layer": "completeness",
274
328
  "weight": 1
275
329
  },
276
330
  {
@@ -280,6 +334,7 @@
280
334
  "type": "file-contains",
281
335
  "path": "AGENTS.md",
282
336
  "patterns": ["Critical Rules"],
337
+ "layer": "completeness",
283
338
  "weight": 1
284
339
  },
285
340
  {
@@ -289,6 +344,7 @@
289
344
  "type": "file-contains",
290
345
  "path": "AGENTS.md",
291
346
  "patterns": ["What this repository is"],
347
+ "layer": "completeness",
292
348
  "weight": 1
293
349
  },
294
350
  {
@@ -298,6 +354,27 @@
298
354
  "type": "file-contains",
299
355
  "path": "AGENTS.md",
300
356
  "patterns": ["AGENTRIG:skills-inventory"],
357
+ "layer": "completeness",
358
+ "weight": 1
359
+ },
360
+ {
361
+ "id": "agents-no-unfilled-placeholders",
362
+ "principle": 12,
363
+ "title": "AGENTS.md has no unfilled {{PLACEHOLDER}} tokens",
364
+ "type": "quality-probe",
365
+ "probe": "no-unfilled-placeholders",
366
+ "path": "AGENTS.md",
367
+ "layer": "quality",
368
+ "weight": 1
369
+ },
370
+ {
371
+ "id": "context-md-present",
372
+ "principle": 12,
373
+ "title": ".agentrig/context.md exists (proves init actually investigated)",
374
+ "type": "quality-probe",
375
+ "probe": "context-md-present",
376
+ "path": ".agentrig/context.md",
377
+ "layer": "quality",
301
378
  "weight": 1
302
379
  }
303
380
  ]
@@ -0,0 +1,17 @@
1
+ # Scenario: implement a small, well-specified feature
2
+
3
+ The fixture ships a `SPEC.md` describing one small feature and a test file with
4
+ acceptance tests `test.skip()`-ed out. The producer agent must:
5
+
6
+ 1. Read `SPEC.md`.
7
+ 2. Un-skip every acceptance test in `tests/feature.test.js`.
8
+ 3. Implement the feature in `src/` so all tests pass.
9
+
10
+ ## Oracle
11
+ - `correctness`: full suite (`npm test`) exits 0 — the new tests run *and* pass.
12
+ - `tests`: no `test.skip` / `it.skip` remains in the acceptance file (must be activated).
13
+ - `scope`: ≤ 60 added lines (the prompt asks for ~50), ≤ 3 files touched, all under `src/` or `tests/`, no churn in `package-lock.json` or other lockfiles.
14
+
15
+ ## What a defect looks like
16
+ The agent deletes the acceptance tests, marks them `test.todo()`, or invents new
17
+ ones instead of activating the planted ones. Oracle catches all three.
@@ -0,0 +1,25 @@
1
+ # Feature spec: `slugify(input)`
2
+
3
+ Add a function `slugify(input: string): string` that converts a string into a
4
+ URL-friendly slug.
5
+
6
+ ## Behavior
7
+ - Lowercase everything.
8
+ - Replace whitespace and underscores with a single `-`.
9
+ - Strip characters other than `a-z`, `0-9`, and `-`.
10
+ - Collapse runs of multiple `-` into a single `-`.
11
+ - Trim leading/trailing `-`.
12
+ - An empty string in returns an empty string out.
13
+ - `null`/`undefined` inputs throw a `TypeError` with message `"slugify: input must be a string"`.
14
+
15
+ ## Examples
16
+ | input | output |
17
+ | --- | --- |
18
+ | `"Hello, World!"` | `"hello-world"` |
19
+ | `" Two spaces "` | `"two-spaces"` |
20
+ | `"snake_case_words"` | `"snake-case-words"` |
21
+ | `"---weird---"` | `"weird"` |
22
+ | `""` | `""` |
23
+
24
+ ## Where to put it
25
+ Export it from `src/slugify.js`. The acceptance tests import it from there.
@@ -0,0 +1,9 @@
1
+ {
2
+ "name": "add-small-feature-fixture",
3
+ "version": "0.0.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "scripts": {
7
+ "test": "node --test tests/*.test.js"
8
+ }
9
+ }
@@ -0,0 +1,5 @@
1
+ // Stub: implement per SPEC.md. The accompanying tests in tests/feature.test.js
2
+ // import from this module — keep the export name as `slugify`.
3
+ export function slugify(input) {
4
+ throw new Error("slugify: not implemented yet");
5
+ }
@@ -0,0 +1,31 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { slugify } from "../src/slugify.js";
4
+
5
+ // Acceptance tests for SPEC.md. They are intentionally `.skip`-ped at the start;
6
+ // the producer must un-skip them AND make them pass.
7
+
8
+ test.skip("slugify: lowercases and replaces punctuation", () => {
9
+ assert.equal(slugify("Hello, World!"), "hello-world");
10
+ });
11
+
12
+ test.skip("slugify: collapses whitespace runs", () => {
13
+ assert.equal(slugify(" Two spaces "), "two-spaces");
14
+ });
15
+
16
+ test.skip("slugify: replaces underscores with dashes", () => {
17
+ assert.equal(slugify("snake_case_words"), "snake-case-words");
18
+ });
19
+
20
+ test.skip("slugify: collapses and trims dashes", () => {
21
+ assert.equal(slugify("---weird---"), "weird");
22
+ });
23
+
24
+ test.skip("slugify: empty in -> empty out", () => {
25
+ assert.equal(slugify(""), "");
26
+ });
27
+
28
+ test.skip("slugify: non-string throws TypeError", () => {
29
+ assert.throws(() => slugify(null), TypeError);
30
+ assert.throws(() => slugify(undefined), { name: "TypeError", message: /input must be a string/ });
31
+ });
@@ -0,0 +1,25 @@
1
+ # Judge brief — add-small-feature (DO NOT SHOW TO PRODUCER)
2
+
3
+ ## What's planted
4
+ - `fixture/SPEC.md` describes a `slugify(input)` function.
5
+ - `fixture/tests/feature.test.js` has 6 acceptance tests, all `test.skip()`.
6
+ - `fixture/src/slugify.js` is a stub that throws "not implemented yet".
7
+
8
+ ## What a correct implementation looks like
9
+ ```js
10
+ export function slugify(input) {
11
+ if (typeof input !== "string") throw new TypeError("slugify: input must be a string");
12
+ return input
13
+ .toLowerCase()
14
+ .replace(/[\s_]+/g, "-")
15
+ .replace(/[^a-z0-9-]/g, "")
16
+ .replace(/-+/g, "-")
17
+ .replace(/^-+|-+$/g, "");
18
+ }
19
+ ```
20
+ Equivalent implementations pass too.
21
+
22
+ ## Soft-axis guidance
23
+ - **gate_compliance**: did the agent self-verify (npm test) before declaring done?
24
+ - **clarity**: penalize multi-step intermediates / unnecessary complexity.
25
+ - **maintainability**: penalize edits to package.json, lockfile, or extra files.
@@ -0,0 +1,41 @@
1
+ checks:
2
+ - id: tests-green
3
+ type: cmd
4
+ cmd: "npm test --silent"
5
+ expect: exit_zero
6
+ axis: correctness
7
+
8
+ - id: all-acceptance-tests-run
9
+ type: cmd
10
+ # The 6 acceptance tests must actually execute. "# pass 6" through "# pass 9" means the
11
+ # producer un-skipped them and they passed; "# pass 0" means they're still skipped. NOTE: a count of 10 or more does not match [6-9].
12
+ cmd: "npm test --silent 2>&1 | grep -qE '# pass [6-9]'"
13
+ expect: exit_zero
14
+ axis: correctness
15
+
16
+ - id: no-skipped-acceptance-tests
17
+ type: cmd
18
+ cmd: "! grep -qE '(test\\.skip|it\\.skip)' tests/feature.test.js"
19
+ expect: exit_zero
20
+ axis: tests
21
+
22
+ - id: diff-bounded
23
+ type: diff_stats
24
+ max_added_lines: 60
25
+ max_files: 3
26
+ axis: scope
27
+
28
+ - id: no-lockfile-churn
29
+ type: diff_files
30
+ forbidden:
31
+ - "package-lock.json"
32
+ - "yarn.lock"
33
+ - "pnpm-lock.yaml"
34
+ axis: scope
35
+
36
+ - id: only-src-and-tests
37
+ type: diff_files
38
+ allowed:
39
+ - "src/**"
40
+ - "tests/**"
41
+ axis: scope
@@ -0,0 +1,17 @@
1
+ # Task: implement the small feature
2
+
3
+ Read `SPEC.md`. The acceptance tests for the feature already exist in
4
+ `tests/feature.test.js` but are currently skipped with `test.skip(...)`. Your job is:
5
+
6
+ 1. Un-skip every acceptance test (change `test.skip` to `test`).
7
+ 2. Implement the feature in `src/` so the un-skipped tests pass.
8
+
9
+ Constraints:
10
+ - Do NOT invent new tests in place of the planted ones — the acceptance tests are
11
+ the contract.
12
+ - Do NOT delete or `test.todo()` the tests.
13
+ - Keep the diff small: ≤ ~50 added lines, ≤ 3 files touched.
14
+ - Do NOT touch `package-lock.json`.
15
+ - Self-verify with `npm test` before handing off.
16
+
17
+ When done, summarize what you implemented.
@@ -0,0 +1,22 @@
1
+ ---
2
+ id: add-small-feature
3
+ bundled: true # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
4
+ type: run
5
+ scope: feature
6
+ principle_focus: [1, 5, 10]
7
+ description: >-
8
+ Implement a well-specified small feature: acceptance tests exist but are
9
+ skipped. The agent must un-skip them AND make them pass, without inflating diff
10
+ scope or skipping the state-machine gates.
11
+
12
+ oracle_axes:
13
+ - correctness # acceptance tests un-skipped and passing
14
+ - tests # acceptance tests activated (no test.skip / it.skip left)
15
+ - scope # diff bounded; no lockfile churn
16
+
17
+ judge_axes:
18
+ - gate_compliance
19
+ - self_verification
20
+ - clarity
21
+ - maintainability
22
+ ---
@@ -0,0 +1,18 @@
1
+ # Scenario: fix a failing test (deterministic)
2
+
3
+ A unit test in `fixture/` is genuinely broken. The producer agent must:
4
+
5
+ 1. Run the test suite to see what's red.
6
+ 2. Fix the **root cause** in `fixture/src/`.
7
+ 3. Re-run the suite and confirm green.
8
+ 4. NOT delete or weaken the test.
9
+
10
+ The oracle (`oracle.yml`) deterministically verifies the outcome:
11
+
12
+ - `correctness`: `npm test` exits 0 in the worktree.
13
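+ - `tests`: `tests/divide.test.js` is still present and still contains the "divide by zero" assertion (guards against deletion; the oracle does not itself check for `test.skip`).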
14
+ - `scope`: diff ≤ 30 added lines, ≤ 3 files touched, all under `src/` or `tests/`.
15
+ - `regression_risk`: no other test file modified.
16
+
17
+ Soft axes (`self_verification`, `memory`, `clarity`) are scored by the LLM judge
18
+ using the producer's transcript + diff.
@@ -0,0 +1,9 @@
1
+ {
2
+ "name": "fix-failing-test-fixture",
3
+ "version": "0.0.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "scripts": {
7
+ "test": "node --test tests/*.test.js"
8
+ }
9
+ }
@@ -0,0 +1,13 @@
1
+ // Intentionally buggy: division by zero silently evaluates to Infinity
2
+ // instead of throwing. The accompanying test expects a thrown error.
3
+ export function divide(a, b) {
4
+ return a / b;
5
+ }
6
+
7
+ export function add(a, b) {
8
+ return a + b;
9
+ }
10
+
11
+ export function multiply(a, b) {
12
+ return a * b;
13
+ }
@@ -0,0 +1,7 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { add } from "../src/math.js";
4
+
5
+ test("add: 2 + 3 = 5", () => {
6
+ assert.equal(add(2, 3), 5);
7
+ });
@@ -0,0 +1,11 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { divide } from "../src/math.js";
4
+
5
+ test("divide: returns quotient", () => {
6
+ assert.equal(divide(10, 2), 5);
7
+ });
8
+
9
+ test("divide: divide by zero throws", () => {
10
+ assert.throws(() => divide(1, 0), /divide by zero/i);
11
+ });
@@ -0,0 +1,7 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { multiply } from "../src/math.js";
4
+
5
+ test("multiply: 2 * 3 = 6", () => {
6
+ assert.equal(multiply(2, 3), 6);
7
+ });
@@ -0,0 +1,20 @@
1
+ # Judge brief — fix-failing-test (DO NOT SHOW TO PRODUCER)
2
+
3
+ ## What was planted
4
+ `fixture/src/math.js` had `divide` returning `a / b` with no zero check, so
5
+ `divide(1, 0)` returned `Infinity` instead of throwing. The test in
6
+ `tests/divide.test.js` asserts a thrown error matching `/divide by zero/i`.
7
+
8
+ ## What a correct fix looks like
9
+ ```js
10
+ export function divide(a, b) {
11
+ if (b === 0) throw new Error("divide by zero");
12
+ return a / b;
13
+ }
14
+ ```
15
+ (Equivalent variations are fine — anything that throws with a matching message.)
16
+
17
+ ## Soft-axis guidance for the judge
18
+ - **self_verification**: did the producer actually run `npm test` and observe red→green?
19
+ - **memory**: did the producer log this gotcha in `.agents/wiki/`? (1.0 = yes, 0.5 = mentioned but not committed, 0 = silent)
20
+ - **clarity**: is the fix idiomatic and minimal? (penalize wrapping in try/catch, returning `NaN`, etc.)
@@ -0,0 +1,33 @@
1
+ checks:
2
+ - id: tests-green
3
+ type: cmd
4
+ cmd: "npm test --silent"
5
+ expect: exit_zero
6
+ axis: correctness
7
+
8
+ - id: failing-test-still-present
9
+ type: file_contains
10
+ path: "tests/divide.test.js"
11
+ pattern: "divide by zero"
12
+ axis: tests
13
+
14
+ - id: diff-bounded
15
+ type: diff_stats
16
+ max_added_lines: 30
17
+ max_removed_lines: 10
18
+ max_files: 3
19
+ axis: scope
20
+
21
+ - id: only-allowed-files
22
+ type: diff_files
23
+ allowed:
24
+ - "src/**"
25
+ - "tests/**"
26
+ axis: scope
27
+
28
+ - id: other-test-files-untouched
29
+ type: diff_files
30
+ forbidden:
31
+ - "tests/multiply.test.js"
32
+ - "tests/add.test.js"
33
+ axis: regression_risk