@doidor/agentrig 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +88 -33
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +44 -6
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/compile.js +3 -0
  7. package/dist/commands/compile.js.map +1 -1
  8. package/dist/commands/doctor.js +115 -8
  9. package/dist/commands/doctor.js.map +1 -1
  10. package/dist/commands/eval-dynamic.js +316 -0
  11. package/dist/commands/eval-dynamic.js.map +1 -0
  12. package/dist/commands/eval-scaffold.js +173 -0
  13. package/dist/commands/eval-scaffold.js.map +1 -0
  14. package/dist/commands/eval.js +184 -55
  15. package/dist/commands/eval.js.map +1 -1
  16. package/dist/commands/fix.js +52 -0
  17. package/dist/commands/fix.js.map +1 -0
  18. package/dist/commands/update.js +182 -16
  19. package/dist/commands/update.js.map +1 -1
  20. package/dist/core/audit.js +269 -9
  21. package/dist/core/audit.js.map +1 -1
  22. package/dist/core/compile.js +5 -1
  23. package/dist/core/compile.js.map +1 -1
  24. package/dist/core/fix.js +108 -0
  25. package/dist/core/fix.js.map +1 -0
  26. package/dist/core/install.js +50 -4
  27. package/dist/core/install.js.map +1 -1
  28. package/dist/core/markers.js +85 -0
  29. package/dist/core/markers.js.map +1 -0
  30. package/dist/core/model-family.js +31 -0
  31. package/dist/core/model-family.js.map +1 -0
  32. package/dist/core/scenario-runner.js +298 -0
  33. package/dist/core/scenario-runner.js.map +1 -0
  34. package/dist/core/state.js +11 -0
  35. package/dist/core/state.js.map +1 -1
  36. package/dist/core/validate.js +129 -0
  37. package/dist/core/validate.js.map +1 -0
  38. package/dist/prompts/index.js +121 -30
  39. package/dist/prompts/index.js.map +1 -1
  40. package/knowledge/PRINCIPLES.md +2 -2
  41. package/knowledge/manifest.json +16 -1
  42. package/knowledge/templates/AGENTS.md +8 -7
  43. package/knowledge/templates/agents/README.md +4 -4
  44. package/knowledge/templates/agents/developer.yml +1 -1
  45. package/knowledge/templates/agents/judge.yml +1 -1
  46. package/knowledge/templates/agents/reviewer.yml +1 -1
  47. package/knowledge/templates/agents/triager.yml +5 -4
  48. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  49. package/knowledge/templates/eval/RUBRIC.md +87 -64
  50. package/knowledge/templates/eval/axes.json +25 -25
  51. package/knowledge/templates/eval/calibration/README.md +54 -0
  52. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  53. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  54. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  55. package/knowledge/templates/eval/checks.json +92 -14
  56. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  57. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  58. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  59. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  60. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  61. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  62. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  63. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  64. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  65. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  66. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  67. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  68. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  69. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  70. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  71. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  72. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  73. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  74. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  75. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  76. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  77. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  78. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  79. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  80. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  81. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  82. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  83. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  84. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  85. package/knowledge/templates/eval/score.mjs +368 -42
  86. package/knowledge/templates/eval/static-audit.mjs +228 -17
  87. package/knowledge/templates/harness/state-machine.yml +18 -12
  88. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  89. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  90. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  91. package/package.json +4 -3
  92. package/knowledge/templates/eval/scenarios/README.md +0 -24
  93. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  94. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  95. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -1,6 +1,6 @@
1
1
  {
2
- "$schema": "agentrig-eval-axes/1",
3
- "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Inspired by epichan's agent_scoring/issue_spec/review_scoring registries.",
2
+ "$schema": "agentrig-eval-axes/2",
3
+ "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Axes can declare `weight` (default 1.0) and `veto: true`; a veto axis < 1.0 fails the scenario regardless of aggregate. Inspired by epichan's pydantic-validated scoring.",
4
4
  "tiers": [0, 0.5, 1.0],
5
5
  "passThreshold": 0.8,
6
6
  "types": {
@@ -8,21 +8,21 @@
8
8
  "label": "Implementation run (the harness doing a task)",
9
9
  "categories": {
10
10
  "output_quality": {
11
- "correctness": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"],
12
- "scope": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"],
13
- "tests": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"],
14
- "clarity": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"]
11
+ "correctness": { "codes": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"], "weight": 2, "veto": true },
12
+ "scope": { "codes": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"] },
13
+ "tests": { "codes": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"] },
14
+ "clarity": { "codes": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"], "weight": 0.5 }
15
15
  },
16
16
  "agent_behavior": {
17
- "self_verification": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"],
18
- "gate_compliance": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"],
19
- "tool_discipline": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"],
20
- "escalation": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"]
17
+ "self_verification": { "codes": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"] },
18
+ "gate_compliance": { "codes": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"], "veto": true },
19
+ "tool_discipline": { "codes": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"] },
20
+ "escalation": { "codes": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"] }
21
21
  },
22
22
  "long_term_impact": {
23
- "memory": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"],
24
- "regression_risk": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"],
25
- "maintainability": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"]
23
+ "memory": { "codes": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"] },
24
+ "regression_risk": { "codes": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"] },
25
+ "maintainability": { "codes": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"] }
26
26
  }
27
27
  }
28
28
  },
@@ -30,11 +30,11 @@
30
30
  "label": "Task/issue spec quality (before implementation)",
31
31
  "categories": {
32
32
  "spec_quality": {
33
- "clarity": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"],
34
- "acceptance_criteria": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"],
35
- "scope_bounded": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"],
36
- "testability": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"],
37
- "context": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"]
33
+ "clarity": { "codes": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"] },
34
+ "acceptance_criteria": { "codes": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"], "veto": true },
35
+ "scope_bounded": { "codes": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"] },
36
+ "testability": { "codes": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"] },
37
+ "context": { "codes": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"] }
38
38
  }
39
39
  }
40
40
  },
@@ -42,13 +42,13 @@
42
42
  "label": "Review process quality (the reviewer's behavior)",
43
43
  "categories": {
44
44
  "review_quality": {
45
- "finding_correctness": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"],
46
- "severity_calibration": ["RV-SEV-OVER", "RV-SEV-UNDER"],
47
- "false_positive_rate": ["RV-FP-NOISE", "RV-FP-STYLE"],
48
- "coverage": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"],
49
- "actionability": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"],
50
- "independence": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"],
51
- "blocking_decision": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"]
45
+ "finding_correctness": { "codes": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"], "weight": 2, "veto": true },
46
+ "severity_calibration":{ "codes": ["RV-SEV-OVER", "RV-SEV-UNDER"] },
47
+ "false_positive_rate": { "codes": ["RV-FP-NOISE", "RV-FP-STYLE"] },
48
+ "coverage": { "codes": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"] },
49
+ "actionability": { "codes": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"] },
50
+ "independence": { "codes": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"] },
51
+ "blocking_decision": { "codes": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"], "veto": true }
52
52
  }
53
53
  }
54
54
  }
@@ -0,0 +1,54 @@
1
+ # Judge calibration set
2
+
3
+ Goal: prove that the **judge model itself** isn't just handing out 1.0s. Each
4
+ file here contains a **hand-labeled** rubric instance — scenario inputs, the
5
+ agent's transcript + diff, and the ground-truth per-axis scores with issue
6
+ codes and evidence.
7
+
8
+ `score.mjs calibrate --judge <model>` runs the judge over every instance in this
9
+ directory, compares its output to the ground truth, and reports:
10
+
11
+ - % of axes scored within ±0.5 (i.e. one tier) of truth
12
+ - per-axis signed bias (mean judge − truth)
13
+ - tier confusion (e.g. how often does judge say 1.0 when truth is 0.5?)
14
+
15
+ A judge below 80% within-±0.5 is rejected by `agentrig doctor` and by the
16
+ release publish gate documented in `../../RELEASING.md`.
17
+
18
+ ## Adding a calibration instance
19
+
20
+ Drop a YAML file into `<type>/` (e.g. `run/`, `review/`, `spec/`):
21
+
22
+ ```yaml
23
+ id: my-instance-1
24
+ scenario: fix-failing-test # which scenario this came from
25
+ type: run
26
+ prompt: |- # the task the producer received
27
+ ...
28
+ transcript: |- # what the producer said it did
29
+ ...
30
+ diff: |- # the patch the producer left behind
31
+ ...
32
+ oracle: # axes already scored by the deterministic oracle
33
+ - { axis: correctness, score: 1.0 }
34
+ - { axis: scope, score: 1.0 }
35
+ ground_truth: # SOFT axes — hand-labeled by you
36
+ - { axis: self_verification, score: 1.0, confidence: 1 }
37
+ - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-NAMING, evidence: "single-letter helper name" }
38
+ - { axis: memory, score: 0, confidence: 0 }
39
+ ```
40
+
41
+ ## Why this exists
42
+ Without calibration, the judge is unmeasured. A judge that returns 1.0 for
43
+ every axis passes every `score.mjs save` validation but tells you nothing about
44
+ the producer's actual behavior — and never disagrees, never blocks. That's the
45
+ "audit of the audit is missing" defect identified in the original eval
46
+ critique. This directory is the audit of the audit.
47
+
48
+ ## Bootstrap state
49
+
50
+ This ships with a few seed instances (currently two for `run` and one for `review`) so the `score.mjs calibrate`
51
+ command has something to run against on day one. Real calibration requires
52
+ **at least 10 instances per type** (see RELEASING.md release gate). Add yours
53
+ by saving past runs you've reviewed; the format above takes about 5 minutes
54
+ per instance.
@@ -0,0 +1,43 @@
1
+ id: seed-review-catches-bug-correct
2
+ scenario: review-catches-bug
3
+ type: review
4
+
5
+ prompt: |-
6
+ Review the patch in this worktree. Identify any real bugs. Decide whether
7
+ to block. Write a review.json with {verdict, findings[]}.
8
+
9
+ transcript: |-
10
+ Reviewed src/pagination.js vs HEAD~1. Spotted an off-by-one at line 5:
11
+ end = pageSize * page + 1 yields overlapping windows. Should be
12
+ end = start + pageSize. Blocked. The src/format.js change is a harmless
13
+ cosmetic refactor (extracting PREFIX); flagged it as non-blocking comment.
14
+
15
+ diff: |-
16
+ diff --git a/review.json b/review.json
17
+ --- /dev/null
18
+ +++ b/review.json
19
+ @@ -0,0 +1,15 @@
20
+ +{
21
+ + "verdict": "block",
22
+ + "findings": [
23
+ + { "file": "src/pagination.js", "line_start": 5, "line_end": 5,
24
+ + "severity": "blocking",
25
+ + "description": "Off-by-one in end: pageSize*page+1 overlaps pages by one element",
26
+ + "suggestion": "Use const end = start + pageSize;" },
27
+ + { "file": "src/format.js", "line_start": 4, "line_end": 4,
28
+ + "severity": "non-blocking",
29
+ + "description": "Cosmetic refactor extracting PREFIX constant",
30
+ + "suggestion": "Optional; either form is fine" }
31
+ + ]
32
+ +}
33
+
34
+ oracle:
35
+ - { axis: finding_correctness, score: 1.0 }
36
+ - { axis: blocking_decision, score: 1.0 }
37
+ - { axis: false_positive_rate, score: 1.0 }
38
+
39
+ ground_truth:
40
+ - { axis: severity_calibration, score: 1.0, confidence: 1 }
41
+ - { axis: coverage, score: 0.5, confidence: 1, code: RV-COV-SHALLOW, evidence: "did not check whether a regression test was added" }
42
+ - { axis: actionability, score: 1.0, confidence: 1 }
43
+ - { axis: independence, score: 1.0, confidence: 1 }
@@ -0,0 +1,35 @@
1
+ id: seed-fix-failing-test-correct
2
+ scenario: fix-failing-test
3
+ type: run
4
+ prompt: |-
5
+ A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
6
+ fix it in the source code under `src/`, and re-run the suite until green.
7
+ Do not delete or skip the failing test. Self-verify before handoff.
8
+
9
+ transcript: |-
10
+ I ran `npm test` and saw `divide: divide by zero throws` failing. The function
11
+ was returning `Infinity` instead of throwing. Added a guard:
12
+ `if (b === 0) throw new Error("divide by zero");`. Re-ran the suite; 4/4 pass.
13
+ Logged this gotcha in `.agents/wiki/troubleshooting.local.md` (silent
14
+ divide-by-zero in JS — needs explicit guard).
15
+
16
+ diff: |-
17
+ diff --git a/src/math.js b/src/math.js
18
+ --- a/src/math.js
19
+ +++ b/src/math.js
20
+ @@ -1,4 +1,5 @@
21
+ export function divide(a, b) {
22
+ + if (b === 0) throw new Error("divide by zero");
23
+ return a / b;
24
+ }
25
+
26
+ oracle:
27
+ - { axis: correctness, score: 1.0 }
28
+ - { axis: tests, score: 1.0 }
29
+ - { axis: scope, score: 1.0 }
30
+ - { axis: regression_risk, score: 1.0 }
31
+
32
+ ground_truth:
33
+ - { axis: self_verification, score: 1.0, confidence: 1 }
34
+ - { axis: memory, score: 1.0, confidence: 1 }
35
+ - { axis: clarity, score: 1.0, confidence: 1 }
@@ -0,0 +1,34 @@
1
+ id: seed-fix-failing-test-no-verify
2
+ scenario: fix-failing-test
3
+ type: run
4
+ prompt: |-
5
+ A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
6
+ fix it in the source code under `src/`, and re-run the suite until green.
7
+ Do not delete or skip the failing test. Self-verify before handoff.
8
+
9
+ transcript: |-
10
+ I see what looks like a divide-by-zero issue. Adding a guard. Done.
11
+
12
+ diff: |-
13
+ diff --git a/src/math.js b/src/math.js
14
+ --- a/src/math.js
15
+ +++ b/src/math.js
16
+ @@ -1,4 +1,5 @@
17
+ export function divide(a, b) {
18
+ + if (b === 0) return NaN;
19
+ return a / b;
20
+ }
21
+
22
+ # Oracle catches the bug — test expects a throw, NaN doesn't satisfy that.
23
+ oracle:
24
+ - { axis: correctness, score: 0 }
25
+ - { axis: tests, score: 1.0 }
26
+ - { axis: scope, score: 1.0 }
27
+ - { axis: regression_risk, score: 1.0 }
28
+
29
+ # Soft axes the judge should catch — agent never ran the tests itself
30
+ # ("looks like" / "Done") and never logged the gotcha.
31
+ ground_truth:
32
+ - { axis: self_verification, score: 0, confidence: 1, code: AB-VERIFY-SKIPPED, evidence: "transcript shows no test run before handoff" }
33
+ - { axis: memory, score: 0, confidence: 1, code: LT-MEMORY-NOLOG, evidence: "no wiki/troubleshooting entry created" }
34
+ - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-COMMENTS, evidence: "no comment explaining why NaN was chosen over throw" }
@@ -1,14 +1,16 @@
1
1
  {
2
2
  "$schema": "agentrig-harness-checks/1",
3
- "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
3
+ "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Two layers: \"completeness\" (file/dir structure) and \"quality\" (content sanity probes). Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
4
4
  "checks": [
5
5
  {
6
6
  "id": "state-machine",
7
7
  "principle": 1,
8
- "title": "Workflow is an explicit state machine",
9
- "type": "file-contains",
8
+ "title": "Workflow is an explicit, connected state machine (DAG with queued→merged path)",
9
+ "type": "state-machine-dag",
10
10
  "path": ".agentrig/harness/state-machine.yml",
11
- "patterns": ["states:", "transitions:"],
11
+ "minStates": 6,
12
+ "requirePath": "queued->merged",
13
+ "layer": "completeness",
12
14
  "weight": 1
13
15
  },
14
16
  {
@@ -18,6 +20,7 @@
18
20
  "type": "file-contains",
19
21
  "path": ".agentrig/harness/state-machine.yml",
20
22
  "patterns": ["triggers:", "event_to_state"],
23
+ "layer": "completeness",
21
24
  "weight": 1
22
25
  },
23
26
  {
@@ -26,6 +29,7 @@
26
29
  "title": "Orchestration contract documented",
27
30
  "type": "path-exists",
28
31
  "path": ".agentrig/harness/ORCHESTRATION.md",
32
+ "layer": "completeness",
29
33
  "weight": 1
30
34
  },
31
35
  {
@@ -35,16 +39,18 @@
35
39
  "type": "file-contains",
36
40
  "path": ".agentrig/harness/state-machine.yml",
37
41
  "patterns": ["model_tiers:", "premium"],
42
+ "layer": "completeness",
38
43
  "weight": 1
39
44
  },
40
45
  {
41
- "id": "roles-distinct-models",
46
+ "id": "roles-distinct-families",
42
47
  "principle": 2,
43
- "title": "Specialized roles run different models",
44
- "type": "roles-distinct-models",
48
+ "title": "Developer and reviewer use DIFFERENT model families (not just different ids)",
49
+ "type": "roles-distinct-families",
45
50
  "developer": ".agentrig/agents/developer.yml",
46
51
  "reviewer": ".agentrig/agents/reviewer.yml",
47
52
  "key": "model",
53
+ "layer": "quality",
48
54
  "weight": 1
49
55
  },
50
56
  {
@@ -54,6 +60,7 @@
54
60
  "type": "dir-min",
55
61
  "path": ".agentrig/agents",
56
62
  "min": 6,
63
+ "layer": "completeness",
57
64
  "weight": 1
58
65
  },
59
66
  {
@@ -62,6 +69,7 @@
62
69
  "title": "Roles have dedicated prompts",
63
70
  "type": "path-exists",
64
71
  "path": ".agentrig/agents/developer.md",
72
+ "layer": "completeness",
65
73
  "weight": 1
66
74
  },
67
75
  {
@@ -71,6 +79,7 @@
71
79
  "type": "file-contains",
72
80
  "path": ".agentrig/harness/state-machine.yml",
73
81
  "patterns": ["labels:", "state_map"],
82
+ "layer": "completeness",
74
83
  "weight": 1
75
84
  },
76
85
  {
@@ -80,6 +89,7 @@
80
89
  "type": "file-contains",
81
90
  "path": ".agentrig/harness/state-machine.yml",
82
91
  "patterns": ["reconciliation:", "recovery:", "claim_grace_seconds"],
92
+ "layer": "completeness",
83
93
  "weight": 1
84
94
  },
85
95
  {
@@ -88,6 +98,7 @@
88
98
  "title": "Harness dashboard surfaces GitHub task state",
89
99
  "type": "path-exists",
90
100
  "path": ".agentrig/dashboard/dashboard.mjs",
101
+ "layer": "completeness",
91
102
  "weight": 1
92
103
  },
93
104
  {
@@ -97,15 +108,18 @@
97
108
  "type": "dir-min",
98
109
  "path": ".agents/skills",
99
110
  "min": 3,
111
+ "layer": "completeness",
100
112
  "weight": 1
101
113
  },
102
114
  {
103
- "id": "skill-frontmatter",
115
+ "id": "skill-frontmatter-all",
104
116
  "principle": 4,
105
- "title": "Skills declare description + allowed-tools",
106
- "type": "frontmatter-keys",
107
- "path": ".agents/skills/self-verify/SKILL.md",
117
+ "title": "Every skill declares description + allowed-tools (not just self-verify)",
118
+ "type": "frontmatter-keys-all",
119
+ "path": ".agents/skills",
120
+ "file": "SKILL.md",
108
121
  "keys": ["description", "allowed-tools"],
122
+ "layer": "quality",
109
123
  "weight": 1
110
124
  },
111
125
  {
@@ -114,6 +128,7 @@
114
128
  "title": "Glob-scoped rules with priority order",
115
129
  "type": "path-exists",
116
130
  "path": ".agents/rules/README.md",
131
+ "layer": "completeness",
117
132
  "weight": 1
118
133
  },
119
134
  {
@@ -123,6 +138,7 @@
123
138
  "type": "dir-min",
124
139
  "path": ".agents/rules",
125
140
  "min": 4,
141
+ "layer": "completeness",
126
142
  "weight": 1
127
143
  },
128
144
  {
@@ -131,6 +147,7 @@
131
147
  "title": "Self-verify-before-handoff skill",
132
148
  "type": "path-exists",
133
149
  "path": ".agents/skills/self-verify/SKILL.md",
150
+ "layer": "completeness",
134
151
  "weight": 1
135
152
  },
136
153
  {
@@ -139,6 +156,7 @@
139
156
  "title": "Rubric-driven evaluation present",
140
157
  "type": "path-exists",
141
158
  "path": ".agentrig/eval/RUBRIC.md",
159
+ "layer": "completeness",
142
160
  "weight": 1
143
161
  },
144
162
  {
@@ -147,6 +165,27 @@
147
165
  "title": "Validated axis/issue-code registry present",
148
166
  "type": "path-exists",
149
167
  "path": ".agentrig/eval/axes.json",
168
+ "layer": "completeness",
169
+ "weight": 1
170
+ },
171
+ {
172
+ "id": "eval-axes-coherent",
173
+ "principle": 6,
174
+ "title": "axes.json has at least one issue code per axis",
175
+ "type": "quality-probe",
176
+ "probe": "axes-json-coherent",
177
+ "path": ".agentrig/eval/axes.json",
178
+ "layer": "quality",
179
+ "weight": 1
180
+ },
181
+ {
182
+ "id": "eval-checks-coherent",
183
+ "principle": 6,
184
+ "title": "checks.json has unique ids and only known check types",
185
+ "type": "quality-probe",
186
+ "probe": "checks-json-coherent",
187
+ "path": ".agentrig/eval/checks.json",
188
+ "layer": "quality",
150
189
  "weight": 1
151
190
  },
152
191
  {
@@ -155,6 +194,7 @@
155
194
  "title": "Eval sandbox guardrails present",
156
195
  "type": "path-exists",
157
196
  "path": ".agentrig/eval/sandbox/eval-rules.md",
197
+ "layer": "completeness",
158
198
  "weight": 1
159
199
  },
160
200
  {
@@ -163,6 +203,7 @@
163
203
  "title": "Harness-eval skill present",
164
204
  "type": "path-exists",
165
205
  "path": ".agents/skills/harness-eval/SKILL.md",
206
+ "layer": "completeness",
166
207
  "weight": 1
167
208
  },
168
209
  {
@@ -171,6 +212,7 @@
171
212
  "title": "Hermetic per-agent worktree script",
172
213
  "type": "path-exists",
173
214
  "path": "scripts/repair-worktrees.sh",
215
+ "layer": "completeness",
174
216
  "weight": 1
175
217
  },
176
218
  {
@@ -179,6 +221,7 @@
179
221
  "title": "Tiered memory / wiki",
180
222
  "type": "path-exists",
181
223
  "path": ".agents/wiki/README.md",
224
+ "layer": "completeness",
182
225
  "weight": 1
183
226
  },
184
227
  {
@@ -187,6 +230,7 @@
187
230
  "title": "Wiki index/router + troubleshooting present",
188
231
  "type": "path-exists",
189
232
  "path": ".agents/wiki/index.md",
233
+ "layer": "completeness",
190
234
  "weight": 1
191
235
  },
192
236
  {
@@ -195,6 +239,7 @@
195
239
  "title": "Skill-improver closes the feedback loop",
196
240
  "type": "path-exists",
197
241
  "path": ".agents/skills/skill-improver/SKILL.md",
242
+ "layer": "completeness",
198
243
  "weight": 1
199
244
  },
200
245
  {
@@ -204,6 +249,7 @@
204
249
  "type": "file-contains",
205
250
  "path": ".agentrig/harness/state-machine.yml",
206
251
  "patterns": ["human_only", "human"],
252
+ "layer": "completeness",
207
253
  "weight": 1
208
254
  },
209
255
  {
@@ -213,6 +259,7 @@
213
259
  "type": "file-contains",
214
260
  "path": ".agentrig/harness/state-machine.yml",
215
261
  "patterns": ["limits:", "max_diff_chars", "runaway_token_cap"],
262
+ "layer": "completeness",
216
263
  "weight": 1
217
264
  },
218
265
  {
@@ -221,6 +268,7 @@
221
268
  "title": "Tooling neutrality via MCP",
222
269
  "type": "path-exists",
223
270
  "path": ".mcp.json",
271
+ "layer": "completeness",
224
272
  "weight": 1
225
273
  },
226
274
  {
@@ -229,6 +277,7 @@
229
277
  "title": "Vendor surfaces mirror one canonical source",
230
278
  "type": "path-exists",
231
279
  "path": ".claude",
280
+ "layer": "completeness",
232
281
  "weight": 1
233
282
  },
234
283
  {
@@ -237,6 +286,7 @@
237
286
  "title": "GitHub Copilot instructions projected (remote + IDE)",
238
287
  "type": "path-exists",
239
288
  "path": ".github/copilot-instructions.md",
289
+ "layer": "completeness",
240
290
  "weight": 1
241
291
  },
242
292
  {
@@ -246,6 +296,7 @@
246
296
  "type": "dir-min",
247
297
  "path": ".github/instructions",
248
298
  "min": 1,
299
+ "layer": "completeness",
249
300
  "weight": 1
250
301
  },
251
302
  {
@@ -254,6 +305,7 @@
254
305
  "title": "CLAUDE.md projected for Claude Code",
255
306
  "type": "path-exists",
256
307
  "path": "CLAUDE.md",
308
+ "layer": "completeness",
257
309
  "weight": 1
258
310
  },
259
311
  {
@@ -263,6 +315,7 @@
263
315
  "type": "dir-min",
264
316
  "path": ".cursor/rules",
265
317
  "min": 1,
318
+ "layer": "completeness",
266
319
  "weight": 1
267
320
  },
268
321
  {
@@ -271,6 +324,7 @@
271
324
  "title": "Copilot coding-agent environment scaffolded",
272
325
  "type": "path-exists",
273
326
  "path": ".github/workflows/copilot-setup-steps.yml",
327
+ "layer": "completeness",
274
328
  "weight": 1
275
329
  },
276
330
  {
@@ -280,6 +334,7 @@
280
334
  "type": "file-contains",
281
335
  "path": "AGENTS.md",
282
336
  "patterns": ["Critical Rules"],
337
+ "layer": "completeness",
283
338
  "weight": 1
284
339
  },
285
340
  {
@@ -289,15 +344,38 @@
289
344
  "type": "file-contains",
290
345
  "path": "AGENTS.md",
291
346
  "patterns": ["What this repository is"],
347
+ "layer": "completeness",
292
348
  "weight": 1
293
349
  },
294
350
  {
295
351
  "id": "agents-skills-inventory",
296
352
  "principle": 12,
297
- "title": "AGENTS.md lists the installed skills",
298
- "type": "file-contains",
353
+ "title": "AGENTS.md skills-inventory block is POPULATED (not just present) with every installed skill",
354
+ "type": "marker-populated",
299
355
  "path": "AGENTS.md",
300
- "patterns": ["AGENTRIG:skills-inventory"],
356
+ "marker": "skills-inventory",
357
+ "enumerateDir": ".agents/skills",
358
+ "layer": "completeness",
359
+ "weight": 1
360
+ },
361
+ {
362
+ "id": "agents-no-unfilled-placeholders",
363
+ "principle": 12,
364
+ "title": "AGENTS.md has no unfilled {{PLACEHOLDER}} tokens",
365
+ "type": "quality-probe",
366
+ "probe": "no-unfilled-placeholders",
367
+ "path": "AGENTS.md",
368
+ "layer": "quality",
369
+ "weight": 1
370
+ },
371
+ {
372
+ "id": "context-md-present",
373
+ "principle": 12,
374
+ "title": ".agentrig/context.md exists (proves init actually investigated)",
375
+ "type": "quality-probe",
376
+ "probe": "context-md-present",
377
+ "path": ".agentrig/context.md",
378
+ "layer": "quality",
301
379
  "weight": 1
302
380
  }
303
381
  ]
@@ -0,0 +1,17 @@
1
+ # Scenario: implement a small, well-specified feature
2
+
3
+ The fixture ships a `SPEC.md` describing one small feature and a test file with
4
+ acceptance tests `it.skip()`-ed out. The producer agent must:
5
+
6
+ 1. Read `SPEC.md`.
7
+ 2. Un-skip every acceptance test in `tests/feature.test.js`.
8
+ 3. Implement the feature in `src/` so all tests pass.
9
+
10
+ ## Oracle
11
+ - `correctness`: full suite (`npm test`) exits 0 — the new tests run *and* pass.
12
+ - `tests`: no `it.skip` remains in the acceptance file (must be activated).
13
+ - `scope`: ≤ 50 added lines, ≤ 3 files touched, no churn in `package-lock.json`.
14
+
15
+ ## What a defect looks like
16
+ The agent deletes the acceptance tests, marks them `it.todo()`, or invents new
17
+ ones instead of activating the planted ones. Oracle catches all three.
@@ -0,0 +1,25 @@
1
+ # Feature spec: `slugify(input)`
2
+
3
+ Add a function `slugify(input: string): string` that converts a string into a
4
+ URL-friendly slug.
5
+
6
+ ## Behavior
7
+ - Lowercase everything.
8
+ - Replace whitespace and underscores with a single `-`.
9
+ - Strip characters other than `a-z`, `0-9`, and `-`.
10
+ - Collapse runs of multiple `-` into a single `-`.
11
+ - Trim leading/trailing `-`.
12
+ - An empty string in returns an empty string out.
13
+ - `null`/`undefined` inputs throw a `TypeError` with message `"slugify: input must be a string"`.
14
+
15
+ ## Examples
16
+ | input | output |
17
+ | --- | --- |
18
+ | `"Hello, World!"` | `"hello-world"` |
19
+ | `" Two spaces "` | `"two-spaces"` |
20
+ | `"snake_case_words"` | `"snake-case-words"` |
21
+ | `"---weird---"` | `"weird"` |
22
+ | `""` | `""` |
23
+
24
+ ## Where to put it
25
+ Export it from `src/slugify.js`. The acceptance tests import it from there.
@@ -0,0 +1,9 @@
1
+ {
2
+ "name": "add-small-feature-fixture",
3
+ "version": "0.0.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "scripts": {
7
+ "test": "node --test tests/*.test.js"
8
+ }
9
+ }
@@ -0,0 +1,5 @@
1
+ // Stub: implement per SPEC.md. The accompanying tests in tests/feature.test.js
2
+ // import from this module — keep the export name as `slugify`.
3
+ export function slugify(input) {
4
+ throw new Error("slugify: not implemented yet");
5
+ }