npm - @doidor/agentrig - Versions diffs - 0.9.0 → 0.11.0 - Mend

@doidor/agentrig 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

package/knowledge/templates/eval/axes.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "$schema": "agentrig-eval-axes/1",
-  "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Inspired by epichan's agent_scoring/issue_spec/review_scoring registries.",
+  "$schema": "agentrig-eval-axes/2",
+  "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Axes can declare `weight` (default 1.0) and `veto: true`; a veto axis < 1.0 fails the scenario regardless of aggregate. Inspired by epichan's pydantic-validated scoring.",
   "tiers": [0, 0.5, 1.0],
   "passThreshold": 0.8,
   "types": {
@@ -8,21 +8,21 @@
       "label": "Implementation run (the harness doing a task)",
       "categories": {
         "output_quality": {
-          "correctness": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"],
-          "scope": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"],
-          "tests": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"],
-          "clarity": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"]
+          "correctness": { "codes": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"], "weight": 2, "veto": true },
+          "scope":       { "codes": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"] },
+          "tests":       { "codes": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"] },
+          "clarity":     { "codes": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"], "weight": 0.5 }
         },
         "agent_behavior": {
-          "self_verification": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"],
-          "gate_compliance": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"],
-          "tool_discipline": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"],
-          "escalation": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"]
+          "self_verification": { "codes": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"] },
+          "gate_compliance":   { "codes": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"], "veto": true },
+          "tool_discipline":   { "codes": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"] },
+          "escalation":        { "codes": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"] }
         },
         "long_term_impact": {
-          "memory": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"],
-          "regression_risk": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"],
-          "maintainability": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"]
+          "memory":          { "codes": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"] },
+          "regression_risk": { "codes": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"] },
+          "maintainability": { "codes": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"] }
         }
       }
     },
@@ -30,11 +30,11 @@
       "label": "Task/issue spec quality (before implementation)",
       "categories": {
         "spec_quality": {
-          "clarity": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"],
-          "acceptance_criteria": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"],
-          "scope_bounded": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"],
-          "testability": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"],
-          "context": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"]
+          "clarity":             { "codes": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"] },
+          "acceptance_criteria": { "codes": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"], "veto": true },
+          "scope_bounded":       { "codes": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"] },
+          "testability":         { "codes": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"] },
+          "context":             { "codes": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"] }
         }
       }
     },
@@ -42,13 +42,13 @@
       "label": "Review process quality (the reviewer's behavior)",
       "categories": {
         "review_quality": {
-          "finding_correctness": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"],
-          "severity_calibration": ["RV-SEV-OVER", "RV-SEV-UNDER"],
-          "false_positive_rate": ["RV-FP-NOISE", "RV-FP-STYLE"],
-          "coverage": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"],
-          "actionability": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"],
-          "independence": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"],
-          "blocking_decision": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"]
+          "finding_correctness": { "codes": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"], "weight": 2, "veto": true },
+          "severity_calibration":{ "codes": ["RV-SEV-OVER", "RV-SEV-UNDER"] },
+          "false_positive_rate": { "codes": ["RV-FP-NOISE", "RV-FP-STYLE"] },
+          "coverage":            { "codes": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"] },
+          "actionability":       { "codes": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"] },
+          "independence":        { "codes": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"] },
+          "blocking_decision":   { "codes": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"], "veto": true }
         }
       }
     }

package/knowledge/templates/eval/calibration/README.md ADDED Viewed

@@ -0,0 +1,54 @@
+# Judge calibration set
+Goal: prove that the **judge model itself** isn't just handing out 1.0s. Each
+file here contains a **hand-labeled** rubric instance — scenario inputs, the
+agent's transcript + diff, and the ground-truth per-axis scores with issue
+codes and evidence.
+`score.mjs calibrate --judge <model>` runs the judge over every instance in this
+directory, compares its output to the ground truth, and reports:
+- % of axes scored within ±0.5 (i.e. at most one tier away) of the ground truth
+- per-axis signed bias (mean judge − truth)
+- tier confusion (e.g. how often does judge say 1.0 when truth is 0.5?)
+A judge below 80% within-±0.5 is rejected by `agentrig doctor` and by the
+release publish gate documented in `../../RELEASING.md`.
+## Adding a calibration instance
+Drop a YAML file into `<type>/` (e.g. `run/`, `review/`, `spec/`):
+```yaml
+id: my-instance-1
+scenario: fix-failing-test       # which scenario this came from
+type: run
+prompt: |-                        # the task the producer received
+  ...
+transcript: |-                    # what the producer said it did
+  ...
+diff: |-                          # the patch the producer left behind
+  ...
+oracle:                           # axes already scored by the deterministic oracle
+  - { axis: correctness, score: 1.0 }
+  - { axis: scope, score: 1.0 }
+ground_truth:                     # SOFT axes — hand-labeled by you
+  - { axis: self_verification, score: 1.0, confidence: 1 }
+  - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-NAMING, evidence: "single-letter helper name" }
+  - { axis: memory, score: 0,   confidence: 0 }
+```
+## Why this exists
+Without calibration, the judge is unmeasured. A judge that returns 1.0 for
+every axis passes every `score.mjs save` validation but tells you nothing about
+the producer's actual behavior — and never disagrees, never blocks. That's the
+"the audit of the audit is missing" defect identified in the original eval
+critique. This directory is the audit of the audit.
+## Bootstrap state
+This ships with **at least one** seed instance per type so the `score.mjs calibrate`
+command has something to run against on day one. Real calibration requires
+**at least 10 instances per type** (see RELEASING.md release gate). Add yours
+by saving past runs you've reviewed; the format above takes about 5 minutes
+per instance.

package/knowledge/templates/eval/calibration/review/seed-correct.yml ADDED Viewed

@@ -0,0 +1,43 @@
+id: seed-review-catches-bug-correct
+scenario: review-catches-bug
+type: review
+prompt: |-
+  Review the patch in this worktree. Identify any real bugs. Decide whether
+  to block. Write a review.json with {verdict, findings[]}.
+transcript: |-
+  Reviewed src/pagination.js vs HEAD~1. Spotted an off-by-one at line 5:
+  end = pageSize * page + 1 yields overlapping windows. Should be
+  end = start + pageSize. Blocked. The src/format.js change is a harmless
+  cosmetic refactor (extracting PREFIX); flagged it as non-blocking comment.
+diff: |-
+  diff --git a/review.json b/review.json
+  --- /dev/null
+  +++ b/review.json
+  @@ -0,0 +1,15 @@
+  +{
+  +  "verdict": "block",
+  +  "findings": [
+  +    { "file": "src/pagination.js", "line_start": 5, "line_end": 5,
+  +      "severity": "blocking",
+  +      "description": "Off-by-one in end: pageSize*page+1 overlaps pages by one element",
+  +      "suggestion": "Use const end = start + pageSize;" },
+  +    { "file": "src/format.js", "line_start": 4, "line_end": 4,
+  +      "severity": "non-blocking",
+  +      "description": "Cosmetic refactor extracting PREFIX constant",
+  +      "suggestion": "Optional; either form is fine" }
+  +  ]
+  +}
+oracle:
+  - { axis: finding_correctness, score: 1.0 }
+  - { axis: blocking_decision, score: 1.0 }
+  - { axis: false_positive_rate, score: 1.0 }
+ground_truth:
+  - { axis: severity_calibration, score: 1.0, confidence: 1 }
+  - { axis: coverage, score: 0.5, confidence: 1, code: RV-COV-SHALLOW, evidence: "did not check whether a regression test was added" }
+  - { axis: actionability, score: 1.0, confidence: 1 }
+  - { axis: independence, score: 1.0, confidence: 1 }

package/knowledge/templates/eval/calibration/run/seed-correct.yml ADDED Viewed

@@ -0,0 +1,35 @@
+id: seed-fix-failing-test-correct
+scenario: fix-failing-test
+type: run
+prompt: |-
+  A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
+  fix it in the source code under `src/`, and re-run the suite until green.
+  Do not delete or skip the failing test. Self-verify before handoff.
+transcript: |-
+  I ran `npm test` and saw `divide: divide by zero throws` failing. The function
+  was returning `Infinity` instead of throwing. Added a guard:
+  `if (b === 0) throw new Error("divide by zero");`. Re-ran the suite; 4/4 pass.
+  Logged this gotcha in `.agents/wiki/troubleshooting.local.md` (JS division by
+  zero silently returns Infinity/NaN instead of throwing — needs explicit guard).
+diff: |-
+  diff --git a/src/math.js b/src/math.js
+  --- a/src/math.js
+  +++ b/src/math.js
+  @@ -1,4 +1,5 @@
+   export function divide(a, b) {
+  +  if (b === 0) throw new Error("divide by zero");
+     return a / b;
+   }
+oracle:
+  - { axis: correctness, score: 1.0 }
+  - { axis: tests, score: 1.0 }
+  - { axis: scope, score: 1.0 }
+  - { axis: regression_risk, score: 1.0 }
+ground_truth:
+  - { axis: self_verification, score: 1.0, confidence: 1 }
+  - { axis: memory, score: 1.0, confidence: 1 }
+  - { axis: clarity, score: 1.0, confidence: 1 }

package/knowledge/templates/eval/calibration/run/seed-no-verify.yml ADDED Viewed

@@ -0,0 +1,34 @@
+id: seed-fix-failing-test-no-verify
+scenario: fix-failing-test
+type: run
+prompt: |-
+  A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
+  fix it in the source code under `src/`, and re-run the suite until green.
+  Do not delete or skip the failing test. Self-verify before handoff.
+transcript: |-
+  I see what looks like a divide-by-zero issue. Adding a guard. Done.
+diff: |-
+  diff --git a/src/math.js b/src/math.js
+  --- a/src/math.js
+  +++ b/src/math.js
+  @@ -1,4 +1,5 @@
+   export function divide(a, b) {
+  +  if (b === 0) return NaN;
+     return a / b;
+   }
+# Oracle catches the bug — test expects a throw, NaN doesn't satisfy that.
+oracle:
+  - { axis: correctness, score: 0 }
+  - { axis: tests, score: 1.0 }
+  - { axis: scope, score: 1.0 }
+  - { axis: regression_risk, score: 1.0 }
+# Soft axes the judge should catch — agent never ran the tests itself
+# ("looks like" / "Done") and never logged the gotcha.
+ground_truth:
+  - { axis: self_verification, score: 0, confidence: 1, code: AB-VERIFY-SKIPPED, evidence: "transcript shows no test run before handoff" }
+  - { axis: memory, score: 0, confidence: 1, code: LT-MEMORY-NOLOG, evidence: "no wiki/troubleshooting entry created" }
+  - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-COMMENTS, evidence: "no comment explaining why NaN was chosen over throw" }

package/knowledge/templates/eval/checks.json CHANGED Viewed

@@ -1,14 +1,16 @@
 {
   "$schema": "agentrig-harness-checks/1",
-  "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
+  "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Two layers: \"completeness\" (file/dir structure) and \"quality\" (content sanity probes). Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
   "checks": [
     {
       "id": "state-machine",
       "principle": 1,
-      "title": "Workflow is an explicit state machine",
-      "type": "file-contains",
+      "title": "Workflow is an explicit, connected state machine (DAG with queued→merged path)",
+      "type": "state-machine-dag",
       "path": ".agentrig/harness/state-machine.yml",
-      "patterns": ["states:", "transitions:"],
+      "minStates": 6,
+      "requirePath": "queued->merged",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -18,6 +20,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["triggers:", "event_to_state"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -26,6 +29,7 @@
       "title": "Orchestration contract documented",
       "type": "path-exists",
       "path": ".agentrig/harness/ORCHESTRATION.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -35,16 +39,18 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["model_tiers:", "premium"],
+      "layer": "completeness",
       "weight": 1
     },
     {
-      "id": "roles-distinct-models",
+      "id": "roles-distinct-families",
       "principle": 2,
-      "title": "Specialized roles run different models",
-      "type": "roles-distinct-models",
+      "title": "Developer and reviewer use DIFFERENT model families (not just different ids)",
+      "type": "roles-distinct-families",
       "developer": ".agentrig/agents/developer.yml",
       "reviewer": ".agentrig/agents/reviewer.yml",
       "key": "model",
+      "layer": "quality",
       "weight": 1
     },
     {
@@ -54,6 +60,7 @@
       "type": "dir-min",
       "path": ".agentrig/agents",
       "min": 6,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -62,6 +69,7 @@
       "title": "Roles have dedicated prompts",
       "type": "path-exists",
       "path": ".agentrig/agents/developer.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -71,6 +79,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["labels:", "state_map"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -80,6 +89,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["reconciliation:", "recovery:", "claim_grace_seconds"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -88,6 +98,7 @@
       "title": "Harness dashboard surfaces GitHub task state",
       "type": "path-exists",
       "path": ".agentrig/dashboard/dashboard.mjs",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -97,15 +108,18 @@
       "type": "dir-min",
       "path": ".agents/skills",
       "min": 3,
+      "layer": "completeness",
       "weight": 1
     },
     {
-      "id": "skill-frontmatter",
+      "id": "skill-frontmatter-all",
       "principle": 4,
-      "title": "Skills declare description + allowed-tools",
-      "type": "frontmatter-keys",
-      "path": ".agents/skills/self-verify/SKILL.md",
+      "title": "Every skill declares description + allowed-tools (not just self-verify)",
+      "type": "frontmatter-keys-all",
+      "path": ".agents/skills",
+      "file": "SKILL.md",
       "keys": ["description", "allowed-tools"],
+      "layer": "quality",
       "weight": 1
     },
     {
@@ -114,6 +128,7 @@
       "title": "Glob-scoped rules with priority order",
       "type": "path-exists",
       "path": ".agents/rules/README.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -123,6 +138,7 @@
       "type": "dir-min",
       "path": ".agents/rules",
       "min": 4,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -131,6 +147,7 @@
       "title": "Self-verify-before-handoff skill",
       "type": "path-exists",
       "path": ".agents/skills/self-verify/SKILL.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -139,6 +156,7 @@
       "title": "Rubric-driven evaluation present",
       "type": "path-exists",
       "path": ".agentrig/eval/RUBRIC.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -147,6 +165,27 @@
       "title": "Validated axis/issue-code registry present",
       "type": "path-exists",
       "path": ".agentrig/eval/axes.json",
+      "layer": "completeness",
+      "weight": 1
+    },
+    {
+      "id": "eval-axes-coherent",
+      "principle": 6,
+      "title": "axes.json has at least one issue code per axis",
+      "type": "quality-probe",
+      "probe": "axes-json-coherent",
+      "path": ".agentrig/eval/axes.json",
+      "layer": "quality",
+      "weight": 1
+    },
+    {
+      "id": "eval-checks-coherent",
+      "principle": 6,
+      "title": "checks.json has unique ids and only known check types",
+      "type": "quality-probe",
+      "probe": "checks-json-coherent",
+      "path": ".agentrig/eval/checks.json",
+      "layer": "quality",
       "weight": 1
     },
     {
@@ -155,6 +194,7 @@
       "title": "Eval sandbox guardrails present",
       "type": "path-exists",
       "path": ".agentrig/eval/sandbox/eval-rules.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -163,6 +203,7 @@
       "title": "Harness-eval skill present",
       "type": "path-exists",
       "path": ".agents/skills/harness-eval/SKILL.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -171,6 +212,7 @@
       "title": "Hermetic per-agent worktree script",
       "type": "path-exists",
       "path": "scripts/repair-worktrees.sh",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -179,6 +221,7 @@
       "title": "Tiered memory / wiki",
       "type": "path-exists",
       "path": ".agents/wiki/README.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -187,6 +230,7 @@
       "title": "Wiki index/router + troubleshooting present",
       "type": "path-exists",
       "path": ".agents/wiki/index.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -195,6 +239,7 @@
       "title": "Skill-improver closes the feedback loop",
       "type": "path-exists",
       "path": ".agents/skills/skill-improver/SKILL.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -204,6 +249,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["human_only", "human"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -213,6 +259,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["limits:", "max_diff_chars", "runaway_token_cap"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -221,6 +268,7 @@
       "title": "Tooling neutrality via MCP",
       "type": "path-exists",
       "path": ".mcp.json",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -229,6 +277,7 @@
       "title": "Vendor surfaces mirror one canonical source",
       "type": "path-exists",
       "path": ".claude",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -237,6 +286,7 @@
       "title": "GitHub Copilot instructions projected (remote + IDE)",
       "type": "path-exists",
       "path": ".github/copilot-instructions.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -246,6 +296,7 @@
       "type": "dir-min",
       "path": ".github/instructions",
       "min": 1,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -254,6 +305,7 @@
       "title": "CLAUDE.md projected for Claude Code",
       "type": "path-exists",
       "path": "CLAUDE.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -263,6 +315,7 @@
       "type": "dir-min",
       "path": ".cursor/rules",
       "min": 1,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -271,6 +324,7 @@
       "title": "Copilot coding-agent environment scaffolded",
       "type": "path-exists",
       "path": ".github/workflows/copilot-setup-steps.yml",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -280,6 +334,7 @@
       "type": "file-contains",
       "path": "AGENTS.md",
       "patterns": ["Critical Rules"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -289,15 +344,38 @@
       "type": "file-contains",
       "path": "AGENTS.md",
       "patterns": ["What this repository is"],
+      "layer": "completeness",
       "weight": 1
     },
     {
       "id": "agents-skills-inventory",
       "principle": 12,
-      "title": "AGENTS.md lists the installed skills",
-      "type": "file-contains",
+      "title": "AGENTS.md skills-inventory block is POPULATED (not just present) with every installed skill",
+      "type": "marker-populated",
       "path": "AGENTS.md",
-      "patterns": ["AGENTRIG:skills-inventory"],
+      "marker": "skills-inventory",
+      "enumerateDir": ".agents/skills",
+      "layer": "completeness",
+      "weight": 1
+    },
+    {
+      "id": "agents-no-unfilled-placeholders",
+      "principle": 12,
+      "title": "AGENTS.md has no unfilled {{PLACEHOLDER}} tokens",
+      "type": "quality-probe",
+      "probe": "no-unfilled-placeholders",
+      "path": "AGENTS.md",
+      "layer": "quality",
+      "weight": 1
+    },
+    {
+      "id": "context-md-present",
+      "principle": 12,
+      "title": ".agentrig/context.md exists (proves init actually investigated)",
+      "type": "quality-probe",
+      "probe": "context-md-present",
+      "path": ".agentrig/context.md",
+      "layer": "quality",
       "weight": 1
     }
   ]

package/knowledge/templates/eval/scenarios/add-small-feature/README.md ADDED Viewed

@@ -0,0 +1,17 @@
+# Scenario: implement a small, well-specified feature
+The fixture ships a `SPEC.md` describing one small feature and a test file with
+acceptance tests `it.skip()`-ed out. The producer agent must:
+1. Read `SPEC.md`.
+2. Un-skip every acceptance test in `tests/feature.test.js`.
+3. Implement the feature in `src/` so all tests pass.
+## Oracle
+- `correctness`: full suite (`npm test`) exits 0 — the new tests run *and* pass.
+- `tests`: no `it.skip` remains in the acceptance file (must be activated).
+- `scope`: ≤ 50 added lines, ≤ 3 files touched, no churn in `package-lock.json`.
+## What a defect looks like
+The agent deletes the acceptance tests, marks them `it.todo()`, or invents new
+ones instead of activating the planted ones. Oracle catches all three.

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md ADDED Viewed

@@ -0,0 +1,25 @@
+# Feature spec: `slugify(input)`
+Add a function `slugify(input: string): string` that converts a string into a
+URL-friendly slug.
+## Behavior
+- Lowercase everything.
+- Replace whitespace and underscores with a single `-`.
+- Strip characters other than `a-z`, `0-9`, and `-`.
+- Collapse runs of multiple `-` into a single `-`.
+- Trim leading/trailing `-`.
+- An empty string input returns an empty string.
+- `null`/`undefined` inputs throw a `TypeError` with message `"slugify: input must be a string"`.
+## Examples
+| input | output |
+| --- | --- |
+| `"Hello, World!"` | `"hello-world"` |
+| `"  Two   spaces  "` | `"two-spaces"` |
+| `"snake_case_words"` | `"snake-case-words"` |
+| `"---weird---"` | `"weird"` |
+| `""` | `""` |
+## Where to put it
+Export it from `src/slugify.js`. The acceptance tests import it from there.

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "add-small-feature-fixture",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "test": "node --test tests/*.test.js"
+  }
+}

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js ADDED Viewed

@@ -0,0 +1,5 @@
+// Intentionally unimplemented stub: build the feature described in SPEC.md here.
+// tests/feature.test.js imports from this module — keep the export name `slugify`.
+export function slugify(input) {
+  throw new Error("slugify: not implemented yet");
+}