npm - @doidor/agentrig - Versions diffs - 0.9.0 → 0.10.0 - Mend

@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

package/knowledge/templates/eval/checks.json CHANGED Viewed

@@ -1,14 +1,16 @@
 {
   "$schema": "agentrig-harness-checks/1",
-  "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
+  "description": "Deterministic harness audit checks. Each maps a principle to a structural check scored 0 / 0.5 / 1.0. Two layers: \"completeness\" (file/dir structure) and \"quality\" (content sanity probes). Consumed by both `agentrig eval --static` and `node .agentrig/eval/static-audit.mjs`.",
   "checks": [
     {
       "id": "state-machine",
       "principle": 1,
-      "title": "Workflow is an explicit state machine",
-      "type": "file-contains",
+      "title": "Workflow is an explicit, connected state machine (DAG with queued→merged path)",
+      "type": "state-machine-dag",
       "path": ".agentrig/harness/state-machine.yml",
-      "patterns": ["states:", "transitions:"],
+      "minStates": 6,
+      "requirePath": "queued->merged",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -18,6 +20,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["triggers:", "event_to_state"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -26,6 +29,7 @@
       "title": "Orchestration contract documented",
       "type": "path-exists",
       "path": ".agentrig/harness/ORCHESTRATION.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -35,16 +39,18 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["model_tiers:", "premium"],
+      "layer": "completeness",
       "weight": 1
     },
     {
-      "id": "roles-distinct-models",
+      "id": "roles-distinct-families",
       "principle": 2,
-      "title": "Specialized roles run different models",
-      "type": "roles-distinct-models",
+      "title": "Developer and reviewer use DIFFERENT model families (not just different ids)",
+      "type": "roles-distinct-families",
       "developer": ".agentrig/agents/developer.yml",
       "reviewer": ".agentrig/agents/reviewer.yml",
       "key": "model",
+      "layer": "quality",
       "weight": 1
     },
     {
@@ -54,6 +60,7 @@
       "type": "dir-min",
       "path": ".agentrig/agents",
       "min": 6,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -62,6 +69,7 @@
       "title": "Roles have dedicated prompts",
       "type": "path-exists",
       "path": ".agentrig/agents/developer.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -71,6 +79,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["labels:", "state_map"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -80,6 +89,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["reconciliation:", "recovery:", "claim_grace_seconds"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -88,6 +98,7 @@
       "title": "Harness dashboard surfaces GitHub task state",
       "type": "path-exists",
       "path": ".agentrig/dashboard/dashboard.mjs",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -97,15 +108,18 @@
       "type": "dir-min",
       "path": ".agents/skills",
       "min": 3,
+      "layer": "completeness",
       "weight": 1
     },
     {
-      "id": "skill-frontmatter",
+      "id": "skill-frontmatter-all",
       "principle": 4,
-      "title": "Skills declare description + allowed-tools",
-      "type": "frontmatter-keys",
-      "path": ".agents/skills/self-verify/SKILL.md",
+      "title": "Every skill declares description + allowed-tools (not just self-verify)",
+      "type": "frontmatter-keys-all",
+      "path": ".agents/skills",
+      "file": "SKILL.md",
       "keys": ["description", "allowed-tools"],
+      "layer": "quality",
       "weight": 1
     },
     {
@@ -114,6 +128,7 @@
       "title": "Glob-scoped rules with priority order",
       "type": "path-exists",
       "path": ".agents/rules/README.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -123,6 +138,7 @@
       "type": "dir-min",
       "path": ".agents/rules",
       "min": 4,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -131,6 +147,7 @@
       "title": "Self-verify-before-handoff skill",
       "type": "path-exists",
       "path": ".agents/skills/self-verify/SKILL.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -139,6 +156,7 @@
       "title": "Rubric-driven evaluation present",
       "type": "path-exists",
       "path": ".agentrig/eval/RUBRIC.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -147,6 +165,27 @@
       "title": "Validated axis/issue-code registry present",
       "type": "path-exists",
       "path": ".agentrig/eval/axes.json",
+      "layer": "completeness",
+      "weight": 1
+    },
+    {
+      "id": "eval-axes-coherent",
+      "principle": 6,
+      "title": "axes.json has at least one issue code per axis",
+      "type": "quality-probe",
+      "probe": "axes-json-coherent",
+      "path": ".agentrig/eval/axes.json",
+      "layer": "quality",
+      "weight": 1
+    },
+    {
+      "id": "eval-checks-coherent",
+      "principle": 6,
+      "title": "checks.json has unique ids and only known check types",
+      "type": "quality-probe",
+      "probe": "checks-json-coherent",
+      "path": ".agentrig/eval/checks.json",
+      "layer": "quality",
       "weight": 1
     },
     {
@@ -155,6 +194,7 @@
       "title": "Eval sandbox guardrails present",
       "type": "path-exists",
       "path": ".agentrig/eval/sandbox/eval-rules.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -163,6 +203,7 @@
       "title": "Harness-eval skill present",
       "type": "path-exists",
       "path": ".agents/skills/harness-eval/SKILL.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -171,6 +212,7 @@
       "title": "Hermetic per-agent worktree script",
       "type": "path-exists",
       "path": "scripts/repair-worktrees.sh",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -179,6 +221,7 @@
       "title": "Tiered memory / wiki",
       "type": "path-exists",
       "path": ".agents/wiki/README.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -187,6 +230,7 @@
       "title": "Wiki index/router + troubleshooting present",
       "type": "path-exists",
       "path": ".agents/wiki/index.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -195,6 +239,7 @@
       "title": "Skill-improver closes the feedback loop",
       "type": "path-exists",
       "path": ".agents/skills/skill-improver/SKILL.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -204,6 +249,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["human_only", "human"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -213,6 +259,7 @@
       "type": "file-contains",
       "path": ".agentrig/harness/state-machine.yml",
       "patterns": ["limits:", "max_diff_chars", "runaway_token_cap"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -221,6 +268,7 @@
       "title": "Tooling neutrality via MCP",
       "type": "path-exists",
       "path": ".mcp.json",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -229,6 +277,7 @@
       "title": "Vendor surfaces mirror one canonical source",
       "type": "path-exists",
       "path": ".claude",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -237,6 +286,7 @@
       "title": "GitHub Copilot instructions projected (remote + IDE)",
       "type": "path-exists",
       "path": ".github/copilot-instructions.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -246,6 +296,7 @@
       "type": "dir-min",
       "path": ".github/instructions",
       "min": 1,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -254,6 +305,7 @@
       "title": "CLAUDE.md projected for Claude Code",
       "type": "path-exists",
       "path": "CLAUDE.md",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -263,6 +315,7 @@
       "type": "dir-min",
       "path": ".cursor/rules",
       "min": 1,
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -271,6 +324,7 @@
       "title": "Copilot coding-agent environment scaffolded",
       "type": "path-exists",
       "path": ".github/workflows/copilot-setup-steps.yml",
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -280,6 +334,7 @@
       "type": "file-contains",
       "path": "AGENTS.md",
       "patterns": ["Critical Rules"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -289,6 +344,7 @@
       "type": "file-contains",
       "path": "AGENTS.md",
       "patterns": ["What this repository is"],
+      "layer": "completeness",
       "weight": 1
     },
     {
@@ -298,6 +354,27 @@
       "type": "file-contains",
       "path": "AGENTS.md",
       "patterns": ["AGENTRIG:skills-inventory"],
+      "layer": "completeness",
+      "weight": 1
+    },
+    {
+      "id": "agents-no-unfilled-placeholders",
+      "principle": 12,
+      "title": "AGENTS.md has no unfilled {{PLACEHOLDER}} tokens",
+      "type": "quality-probe",
+      "probe": "no-unfilled-placeholders",
+      "path": "AGENTS.md",
+      "layer": "quality",
+      "weight": 1
+    },
+    {
+      "id": "context-md-present",
+      "principle": 12,
+      "title": ".agentrig/context.md exists (proves init actually investigated)",
+      "type": "quality-probe",
+      "probe": "context-md-present",
+      "path": ".agentrig/context.md",
+      "layer": "quality",
       "weight": 1
     }
   ]

package/knowledge/templates/eval/scenarios/add-small-feature/README.md ADDED Viewed

@@ -0,0 +1,17 @@
+# Scenario: implement a small, well-specified feature
+The fixture ships a `SPEC.md` describing one small feature and a test file with
+acceptance tests `test.skip()`-ed out. The producer agent must:
+1. Read `SPEC.md`.
+2. Un-skip every acceptance test in `tests/feature.test.js`.
+3. Implement the feature in `src/` so all tests pass.
+## Oracle
+- `correctness`: full suite (`npm test`) exits 0 — the new tests run *and* pass.
+- `tests`: no `test.skip` (or `it.skip`) remains in the acceptance file (must be activated).
+- `scope`: ≤ ~50 added lines (the oracle's hard cap is 60), ≤ 3 files touched, no churn in `package-lock.json` or other lockfiles.
+## What a defect looks like
+The agent deletes the acceptance tests, marks them `test.todo()`, or invents new
+ones instead of activating the planted ones. Oracle catches all three.

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md ADDED Viewed

@@ -0,0 +1,25 @@
+# Feature spec: `slugify(input)`
+Add a function `slugify(input: string): string` that converts a string into a
+URL-friendly slug.
+## Behavior
+- Lowercase everything.
+- Replace whitespace and underscores with a single `-`.
+- Strip characters other than `a-z`, `0-9`, and `-`.
+- Collapse runs of multiple `-` into a single `-`.
+- Trim leading/trailing `-`.
+- An empty string in returns an empty string out.
+- `null`/`undefined` inputs throw a `TypeError` with message `"slugify: input must be a string"`.
+## Examples
+| input | output |
+| --- | --- |
+| `"Hello, World!"` | `"hello-world"` |
+| `"  Two   spaces  "` | `"two-spaces"` |
+| `"snake_case_words"` | `"snake-case-words"` |
+| `"---weird---"` | `"weird"` |
+| `""` | `""` |
+## Where to put it
+Export it from `src/slugify.js`. The acceptance tests import it from there.

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "add-small-feature-fixture",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "test": "node --test tests/*.test.js"
+  }
+}

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js ADDED Viewed

@@ -0,0 +1,5 @@
+// Stub: implement per SPEC.md. The accompanying tests in tests/feature.test.js
+// import from this module — keep the export name as `slugify`.
+export function slugify(input) {
+  throw new Error("slugify: not implemented yet");
+}

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js ADDED Viewed

@@ -0,0 +1,31 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { slugify } from "../src/slugify.js";
+// Acceptance tests for SPEC.md. They are intentionally `.skip`-ped at the start;
+// the producer must un-skip them AND make them pass.
+test.skip("slugify: lowercases and replaces punctuation", () => {
+  assert.equal(slugify("Hello, World!"), "hello-world");
+});
+test.skip("slugify: collapses whitespace runs", () => {
+  assert.equal(slugify("  Two   spaces  "), "two-spaces");
+});
+test.skip("slugify: replaces underscores with dashes", () => {
+  assert.equal(slugify("snake_case_words"), "snake-case-words");
+});
+test.skip("slugify: collapses and trims dashes", () => {
+  assert.equal(slugify("---weird---"), "weird");
+});
+test.skip("slugify: empty in -> empty out", () => {
+  assert.equal(slugify(""), "");
+});
+test.skip("slugify: non-string throws TypeError", () => {
+  assert.throws(() => slugify(null), TypeError);
+  assert.throws(() => slugify(undefined), { name: "TypeError", message: /input must be a string/ });
+});

package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md ADDED Viewed

@@ -0,0 +1,25 @@
+# Judge brief — add-small-feature (DO NOT SHOW TO PRODUCER)
+## What's planted
+- `fixture/SPEC.md` describes a `slugify(input)` function.
+- `fixture/tests/feature.test.js` has 6 acceptance tests, all `test.skip()`.
+- `fixture/src/slugify.js` is a stub that throws "not implemented yet".
+## What a correct implementation looks like
+```js
+export function slugify(input) {
+  if (typeof input !== "string") throw new TypeError("slugify: input must be a string");
+  return input
+    .toLowerCase()
+    .replace(/[\s_]+/g, "-")
+    .replace(/[^a-z0-9-]/g, "")
+    .replace(/-+/g, "-")
+    .replace(/^-+|-+$/g, "");
+}
+```
+Equivalent implementations pass too.
+## Soft-axis guidance
+- **gate_compliance**: did the agent self-verify (npm test) before declaring done?
+- **clarity**: penalize multi-step intermediates / unnecessary complexity.
+- **maintainability**: penalize edits to package.json, lockfile, or extra files.

package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml ADDED Viewed

@@ -0,0 +1,41 @@
+checks:
+  - id: tests-green
+    type: cmd
+    cmd: "npm test --silent"
+    expect: exit_zero
+    axis: correctness
+  - id: all-acceptance-tests-run
+    type: cmd
+    # The 6 acceptance tests must actually execute. "pass 6" through "pass 9" means the
+    # producer un-skipped them and they passed; "pass 0" means they're still skipped.
+    cmd: "npm test --silent 2>&1 | grep -qE '# pass [6-9]'"
+    expect: exit_zero
+    axis: correctness
+  - id: no-skipped-acceptance-tests
+    type: cmd
+    cmd: "! grep -qE '(test\\.skip|it\\.skip)' tests/feature.test.js"
+    expect: exit_zero
+    axis: tests
+  - id: diff-bounded
+    type: diff_stats
+    max_added_lines: 60
+    max_files: 3
+    axis: scope
+  - id: no-lockfile-churn
+    type: diff_files
+    forbidden:
+      - "package-lock.json"
+      - "yarn.lock"
+      - "pnpm-lock.yaml"
+    axis: scope
+  - id: only-src-and-tests
+    type: diff_files
+    allowed:
+      - "src/**"
+      - "tests/**"
+    axis: scope

package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md ADDED Viewed

@@ -0,0 +1,17 @@
+# Task: implement the small feature
+Read `SPEC.md`. The acceptance tests for the feature already exist in
+`tests/feature.test.js` but are currently skipped with `test.skip(...)`. Your job is:
+1. Un-skip every acceptance test (change `test.skip` to `test`).
+2. Implement the feature in `src/` so the un-skipped tests pass.
+Constraints:
+- Do NOT invent new tests in place of the planted ones — the acceptance tests are
+  the contract.
+- Do NOT delete or `test.todo()` the tests.
+- Keep the diff small: ≤ ~50 added lines, ≤ 3 files touched.
+- Do NOT touch `package-lock.json`.
+- Self-verify with `npm test` before handing off.
+When done, summarize what you implemented.

package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml ADDED Viewed

@@ -0,0 +1,22 @@
+---
+id: add-small-feature
+bundled: true   # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
+type: run
+scope: feature
+principle_focus: [1, 5, 10]
+description: >-
+  Implement a well-specified small feature: an acceptance test exists but is
+  skipped. The agent must un-skip it AND make it pass, without inflating diff
+  scope or skipping the state-machine gates.
+oracle_axes:
+  - correctness       # full suite green; all acceptance tests run and pass
+  - tests             # no skipped acceptance tests remain
+  - scope             # diff bounded; no lockfile churn
+judge_axes:
+  - gate_compliance
+  - self_verification
+  - clarity
+  - maintainability
+---

package/knowledge/templates/eval/scenarios/fix-failing-test/README.md ADDED Viewed

@@ -0,0 +1,18 @@
+# Scenario: fix a failing test (deterministic)
+A unit test in `fixture/` is genuinely broken. The producer agent must:
+1. Run the test suite to see what's red.
+2. Fix the **root cause** in `fixture/src/`.
+3. Re-run the suite and confirm green.
+4. NOT delete or weaken the test.
+The oracle (`oracle.yml`) deterministically verifies the outcome:
+- `correctness`: `npm test` exits 0 in the worktree.
+- `tests`: `tests/divide.test.js` still contains the `divide by zero` test (guards against deletion; a `test.skip` is not detected by the oracle itself).
+- `scope`: diff ≤ 30 added lines, ≤ 3 files touched, all under `src/` or `tests/`.
+- `regression_risk`: no other test file modified.
+Soft axes (`self_verification`, `memory`, `clarity`) are scored by the LLM judge
+using the producer's transcript + diff.

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "fix-failing-test-fixture",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "test": "node --test tests/*.test.js"
+  }
+}

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js ADDED Viewed

@@ -0,0 +1,13 @@
+// Intentionally buggy: integer division-by-zero is silently turned into Infinity
+// instead of throwing. The accompanying test expects a thrown error.
+export function divide(a, b) {
+  return a / b;
+}
+export function add(a, b) {
+  return a + b;
+}
+export function multiply(a, b) {
+  return a * b;
+}

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js ADDED Viewed

@@ -0,0 +1,7 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { add } from "../src/math.js";
+test("add: 2 + 3 = 5", () => {
+  assert.equal(add(2, 3), 5);
+});

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js ADDED Viewed

@@ -0,0 +1,11 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { divide } from "../src/math.js";
+test("divide: returns quotient", () => {
+  assert.equal(divide(10, 2), 5);
+});
+test("divide: divide by zero throws", () => {
+  assert.throws(() => divide(1, 0), /divide by zero/i);
+});

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js ADDED Viewed

@@ -0,0 +1,7 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { multiply } from "../src/math.js";
+test("multiply: 2 * 3 = 6", () => {
+  assert.equal(multiply(2, 3), 6);
+});

package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md ADDED Viewed

@@ -0,0 +1,20 @@
+# Judge brief — fix-failing-test (DO NOT SHOW TO PRODUCER)
+## What was planted
+`fixture/src/math.js` had `divide` returning `a / b` with no zero check, so
+`divide(1, 0)` returned `Infinity` instead of throwing. The test in
+`tests/divide.test.js` asserts a thrown error matching `/divide by zero/i`.
+## What a correct fix looks like
+```js
+export function divide(a, b) {
+  if (b === 0) throw new Error("divide by zero");
+  return a / b;
+}
+```
+(Equivalent variations are fine — anything that throws with a matching message.)
+## Soft-axis guidance for the judge
+- **self_verification**: did the producer actually run `npm test` and observe red→green?
+- **memory**: did the producer log this gotcha in `.agents/wiki/`?  (1.0 = yes, 0.5 = mentioned but not committed, 0 = silent)
+- **clarity**: is the fix idiomatic and minimal? (penalize wrapping in try/catch, returning `NaN`, etc.)

package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml ADDED Viewed

@@ -0,0 +1,33 @@
+checks:
+  - id: tests-green
+    type: cmd
+    cmd: "npm test --silent"
+    expect: exit_zero
+    axis: correctness
+  - id: failing-test-still-present
+    type: file_contains
+    path: "tests/divide.test.js"
+    pattern: "divide by zero"
+    axis: tests
+  - id: diff-bounded
+    type: diff_stats
+    max_added_lines: 30
+    max_removed_lines: 10
+    max_files: 3
+    axis: scope
+  - id: only-allowed-files
+    type: diff_files
+    allowed:
+      - "src/**"
+      - "tests/**"
+    axis: scope
+  - id: other-test-files-untouched
+    type: diff_files
+    forbidden:
+      - "tests/multiply.test.js"
+      - "tests/add.test.js"
+    axis: regression_risk