npm - @doidor/agentrig - Versions diffs - 0.9.0 → 0.10.0 - Mend

@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

package/knowledge/templates/agents/triager.yml CHANGED Viewed

@@ -1,8 +1,9 @@
 # Triager role (principle 2, 9). Moves `ingested` tasks to `queued`: recommend labels/assignees,
-# size the work, and gate on human approval for low-reversibility calls. Uses a fast, cheap model
-# on purpose — triage is high-volume and should not burn a premium tier.
+# size the work, and gate on human approval for low-reversibility calls. Currently uses a top-tier
+# GPT model at user request — a cheap tier (gpt-5-mini, claude-haiku-4.5) would suffice for
+# routine routing and would be a sensible cost optimization if triage volume scales up.
 role: triager
-model: gpt-5-mini
-model_tier: low
+model: gpt-5.5
+model_tier: high
 allowed_tools: [read, grep, glob, bash]
 prompt: agents/triager.md

package/knowledge/templates/dashboard/dashboard.mjs CHANGED Viewed

@@ -147,7 +147,8 @@ const tasks = loadTasks(stateLabels);
 const data = {
   generatedAt: new Date().toISOString(),
   repo: repoRoot,
-  harnessScore: audit?.harnessScore ?? null,
+  installCompleteness: audit?.installCompleteness ?? null,
+  qualityProbes: audit?.qualityProbes ?? null,
   principles: audit?.principles ?? [],
   roster,
   tasks,
@@ -178,8 +179,12 @@ function renderTerminal(d) {
   console.log(`\n${bold("AgentRig — harness dashboard")}  ${dim(d.repo)}`);
   console.log(rule);
-  const scoreColor = d.harnessScore == null ? dim : d.harnessScore >= 80 ? green : d.harnessScore >= 50 ? yellow : red;
-  console.log(`${bold("Harness Score")}  ${scoreColor(d.harnessScore == null ? "n/a" : d.harnessScore + "%")}`);
+  const scoreColor = d.installCompleteness == null ? dim : d.installCompleteness >= 80 ? green : d.installCompleteness >= 50 ? yellow : red;
+  console.log(`${bold("Install Completeness")}  ${scoreColor(d.installCompleteness == null ? "n/a" : d.installCompleteness + "%")}`);
+  if (d.qualityProbes != null) {
+    const qColor = d.qualityProbes >= 80 ? green : d.qualityProbes >= 50 ? yellow : red;
+    console.log(`${bold("Quality Probes")}        ${qColor(d.qualityProbes + "%")}`);
+  }
   if (d.principles.length) {
     const weak = d.principles.filter((p) => p.score < 1).map((p) => `P${p.principle} ${(p.score * 100).toFixed(0)}%`);
     console.log(dim(`  weak principles: ${weak.length ? weak.join(", ") : "none — all full credit"}`));
@@ -227,7 +232,8 @@ function renderTerminal(d) {
 function renderHtml(d) {
   const esc = (s) => String(s).replace(/[&<>]/g, (m) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[m]));
-  const scoreClass = d.harnessScore == null ? "na" : d.harnessScore >= 80 ? "good" : d.harnessScore >= 50 ? "warn" : "bad";
+  const scoreClass = d.installCompleteness == null ? "na" : d.installCompleteness >= 80 ? "good" : d.installCompleteness >= 50 ? "warn" : "bad";
+  const qualityClass = d.qualityProbes == null ? "na" : d.qualityProbes >= 80 ? "good" : d.qualityProbes >= 50 ? "warn" : "bad";
   const rosterRows = d.roster.map((a) => `<tr><td>${esc(a.role)}</td><td>${esc(a.model || "?")}</td><td>${esc(a.tier || "")}</td></tr>`).join("");
   let tasksHtml;
   if (!d.tasks.available) {
@@ -252,7 +258,8 @@ table{border-collapse:collapse;width:100%}td,th{text-align:left;padding:.25rem .
 </style></head><body>
 <h1>AgentRig — harness dashboard</h1>
 <p class="muted">${esc(d.repo)} · generated ${esc(d.generatedAt)}</p>
-<h2>Harness Score</h2><p class="score ${scoreClass}">${d.harnessScore == null ? "n/a" : d.harnessScore + "%"}</p>
+<h2>Install Completeness</h2><p class="score ${scoreClass}">${d.installCompleteness == null ? "n/a" : d.installCompleteness + "%"}</p>
+${d.qualityProbes != null ? `<h2>Quality Probes</h2><p class="score ${qualityClass}">${d.qualityProbes}%</p>` : ""}
 <h2>Agents (${d.roster.length})</h2><table><tr><th>Role</th><th>Model</th><th>Tier</th></tr>${rosterRows}</table>
 <h2>Tasks</h2>${tasksHtml}
 <h2>Evals</h2>${evalRows ? `<table><tr><th></th><th>Scenario</th><th>Score</th><th>Judge</th></tr>${evalRows}</table><p class="muted">overall ${d.evals.overall.toFixed(2)}</p>` : '<p class="muted">No dynamic eval runs yet.</p>'}

package/knowledge/templates/eval/RUBRIC.md CHANGED Viewed

@@ -1,94 +1,117 @@
 # Harness evaluation rubric (principle 6)
-Two layers. **Layer A** is deterministic and model-free; **Layer B** is an independent,
-model-judged behavioral eval. Both write to `.agentrig/eval/results/` via `score.mjs`
-(never hand-edit JSON). The machine-readable rubric registry lives in
-[`axes.json`](./axes.json) — `score.mjs` validates every score against it.
+Three layers. Each makes a different, **bounded** claim — don't over-read what any one of them proves.
+| Layer | What it actually proves | What it does NOT prove | Cost |
+|---|---|---|---|
+| **A1 — install completeness** | every canonical artifact is present and minimally well-formed | the artifacts *work*, or that agents respect them | ~1 second, no model |
+| **A2 — quality probes** | content sanity (parseable YAML/JSON, no unfilled `{{PLACEHOLDER}}`, distinct model **families**, every skill has frontmatter, axes have issue codes) | semantic quality of the content | ~1 second, no model |
+| **B — dynamic behavioral eval** | how the harness *changes agent behavior* on fixed fixtures — verified by deterministic oracles for hard axes + an independent judge for soft axes, with paired sign-test lift vs a baseline | absolute "is this agent good" — only relative to baseline | minutes to hours, real model spend |
+All three persist results under `.agentrig/eval/results/` via `score.mjs`. **Never hand-edit** the JSON.
+The schema is validated on read (`schemaVersion: 2`) and on write — invalid records are quarantined
+into `results/_legacy/`.
 ---
-## Layer A — Static harness audit
-Scored automatically by `checks.json`. Each check maps to a principle and earns **0 / 0.5 / 1.0**.
-The aggregate is the **Harness Score** (0–100%). Run:
+## Layer A1 + A2 — static audit (`agentrig eval --static`)
+Scored from `checks.json`. Each check earns **0 / 0.5 / 1.0** and carries a `layer` field
+(`completeness` vs `quality`). Two aggregate scores:
+- **Install Completeness** — was every canonical artifact installed where the manifest said it should be?
+- **Quality Probes** — does the content of those artifacts pass cheap sanity checks?
 ```bash
-node .agentrig/eval/static-audit.mjs           # or: agentrig eval --static
+node .agentrig/eval/static-audit.mjs            # human report (both layers)
+node .agentrig/eval/static-audit.mjs --json     # machine-readable
+node .agentrig/eval/static-audit.mjs --min 80   # exit non-zero if completeness < 80%
 ```
-Treat any principle scoring < 1.0 as a missing/weak artifact to fix.
+A1 is what CI gates on (`--min`). A2 surfaces drift but doesn't fail the build — it's diagnostic.
 ---
-## Layer B — Dynamic behavioral eval
+## Layer B — dynamic behavioral eval (`agentrig eval --dynamic`)
-For each scenario, run the task through the harness, then have an **independent judge model**
-(different from the producer) score the result. Scoring is **strict 3-tier: 0 / 0.5 / 1.0**.
+For each scenario:
-Three rules, enforced by `score.mjs` against `axes.json`:
-1. **Issue code required.** Any axis < 1.0 (and observed) must carry an issue code **from that
-   axis's bounded registry** plus a one-line **evidence** string. Invented codes are rejected.
-2. **Confidence-gated.** An axis you couldn't observe is scored `na` (confidence 0) and excluded
-   from rollups — partial observability never contaminates the total.
-3. **Rollups are recomputed from axes.** Category and aggregate scores come from the axis data, not
-   from anything the judge asserts.
+1. **Seed** a throwaway worktree from `scenarios/<id>/fixture/`.
+2. **Producer** (one model, runs in the worktree) executes `prompt.md`. Stage the harness or not, per `--variant`.
+3. **Oracle** (`scenarios/<id>/oracle.yml`) runs deterministic checks (commands, diff stats, file presence) → hard-axis scores.
+4. **Judge** (a *different model family*, runs in its own cwd with prompt+diff+transcript+oracle but **NOT** the producer's worktree or reasoning) scores soft axes against `axes.json`.
+5. **Save** via `score.mjs save` — validated against the rubric registry.
-### Multi-rubric lifecycle
-The eval covers the whole lifecycle, not just the final patch. Three rubric **types**, linked by the
-same `--task` id so you get a spec → run → review view:
+### Producer/judge isolation
+- The producer and the judge are **separate `provider.startConversation()` calls**. The judge never sees the producer's reasoning trace.
+- `score.mjs save` rejects a record where the producer and judge share a **model family** (e.g. both `claude-*`). Override with `--allow-same-family` — and the override is recorded in the result so reviewers can spot lazy single-model setups.
+- The judge writes scores via a JSON file (`<artifactsDir>/<scenario>.trial<N>.judge.json`), not free-form text. The orchestrator reads + validates it against `axes.json`.
-| `--type` | What it scores | Categories |
-|----------|----------------|------------|
-| `spec`   | task/issue spec quality (before work) | spec_quality (clarity, acceptance_criteria, scope_bounded, testability, context) |
-| `run`    | the implementation run | output_quality, agent_behavior, long_term_impact |
-| `review` | the reviewer's own behavior | review_quality (finding_correctness, severity_calibration, false_positive_rate, coverage, actionability, independence, blocking_decision) |
+### Rubric rules (enforced by `score.mjs`)
+1. **Strict 3-tier** scores: `0` / `0.5` / `1.0`.
+2. **Issue code required.** Any axis < 1.0 with `confidence > 0` must carry an issue code from that axis's bounded registry plus a one-line evidence string.
+3. **Confidence-gated.** An axis you couldn't observe is `=na` (confidence 0) and excluded from rollups.
+4. **Weighted aggregation.** Each axis may declare an optional `weight` (default 1) and an optional `veto: true` flag. The aggregate is a weighted mean of observed axes.
+5. **Pass rule:** `aggregate ≥ passThreshold` **AND** no observed axis at 0 **AND** no veto axis < 1.0. Veto fails are surfaced in the `failReason` field.
-### `run` axes (the most common)
-- **Output Quality** — `correctness`, `scope`, `tests`, `clarity`
-- **Agent Behavior** — `self_verification`, `gate_compliance`, `tool_discipline`, `escalation`
-- **Long-Term Impact** — `memory`, `regression_risk`, `maintainability`
+### Lifecycle types
+| `--type` | Categories | Veto axes |
+|---|---|---|
+| `spec` | `spec_quality` (5 axes: `clarity`, `acceptance_criteria`, `scope_bounded`, `testability`, `context`) | `acceptance_criteria` |
+| `run` | `output_quality`, `agent_behavior`, `long_term_impact` (11 axes total) | `correctness`, `gate_compliance` |
+| `review` | `review_quality` (7 axes) | `finding_correctness`, `blocking_decision` |
-See `axes.json` for the full per-axis issue-code registries (e.g. `OQ-SCOPE-CHURN`,
-`AB-VERIFY-REDHANDOFF`, `LT-REGRESS-LIKELY`).
+### Multi-trial + statistical lift (`--n` + `compare --baseline`)
+Single-trial verdicts are coin flips. The eval requires `n ≥ 3` paired trials for any verdict
+other than `INCONCLUSIVE`:
-### Saving and reading scores
 ```bash
-# Save one rubric (any axis < 1.0 needs CODE:evidence; use `=na` for unobserved axes)
-node .agentrig/eval/score.mjs save --type run --task add-small-feature \
-  --scenario add-small-feature --judge <model> [--variant v2] [--run RID] \
-  --axis 'correctness=1.0' \
-  --axis 'scope=0.5:OQ-SCOPE-CHURN:left package-lock churn in the diff' \
-  --axis 'tests=na'
-node .agentrig/eval/score.mjs report                     # latest per scenario/variant + per-axis means
-node .agentrig/eval/score.mjs compare --scenario <id>    # A/B variants side by side
+# Run both variants 5 times each.
+agentrig eval --dynamic --variant harness  --n 5
+agentrig eval --dynamic --variant baseline --n 5
+# Paired sign test, median delta, p-value:
+node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
 ```
-### A/B variant evaluation
-Run the **same scenario** under different harness versions (a prompt/skill/rule change) and save each
-under a `--variant`. `score.mjs compare` puts them side by side. **A change that lowers the score is
-a regression even if it "feels" better.** For deeper diffing, keep each run's `diff.patch` /
-`output` artifacts next to the score (see the `harness-eval` skill).
+Verdicts:
+- **HELPS**  — p < 0.05, median delta > 0.05
+- **HURTS**  — p < 0.05, median delta < -0.05
+- **INCONCLUSIVE** — n < 3, or p ≥ 0.05, or |median delta| ≤ 0.05
-### Harness lift — does it actually help? (with vs without)
-Prove the harness earns its keep in *your* repo by comparing a harness-on run to a harness-off
-baseline:
+### Sandboxing
+Run dynamic evals under [`sandbox/eval-rules.md`](sandbox/eval-rules.md): the producer works in a
+throwaway worktree under `$TMPDIR/agentrig-eval/<runId>/<scenario>/` and **must not push, open PRs,
+or merge** — the eval measures behavior, it must not mutate real branches.
+---
+## Calibrating the judge (`calibration/`)
+A judge that always returns 1.0 passes every `score.mjs save` validation but tells you nothing.
+The `calibration/` directory holds **hand-labeled** rubric instances (scenario inputs + transcript +
+diff + ground-truth axes). `score.mjs calibrate --judge <model>` runs your judge over them and
+reports % agreement (within ±0.5 tier) and signed bias.
 ```bash
-agentrig eval --dynamic --scenario <id> --variant harness    # harness ON
-agentrig eval --dynamic --scenario <id> --variant baseline   # bare agent, no AGENTS.md/rules/skills
-node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
+# After your judge wrote scores to /tmp/judge-out.json:
+node .agentrig/eval/score.mjs calibrate \
+  --judge gpt-5.5 --instance .agentrig/eval/calibration/run/seed-correct.yml \
+  --judge-scores /tmp/judge-out.json
+node .agentrig/eval/score.mjs calibrate --report
 ```
-`compare --baseline` prints the per-axis and aggregate **delta** and a `HELPS`/`HURTS` verdict. A
-positive aggregate delta means installing AgentRig improved agent behavior here.
-### Threshold
-A scenario passes if its aggregate ≥ **0.8** (`passThreshold` in `axes.json`) with no observed axis
-at 0.
+`agentrig doctor` reads the calibration rollup and flags any judge below **80% agreement**. See
+[`calibration/README.md`](calibration/README.md) for the format and how to add more instances.
 ---
-## Sandboxing
-Run dynamic evals under the guardrails in [`sandbox/eval-rules.md`](./sandbox/eval-rules.md): the
-agent works in a throwaway worktree and must **not push, open PRs, or merge** — the eval measures
-behavior, it must not mutate real branches.
+## When to run what
+| When | What |
+|---|---|
+| Every PR | A1 + A2 via `eval --static` (CI gate at `--min 80` or higher) |
+| Nightly on main | Layer B with `--n 5` × `harness` and `baseline`, then `compare --baseline baseline` |
+| Before releasing AgentRig | `score.mjs calibrate --report` ≥ 80% for default judge |
+| When prompts/skills/rules change | Manual `eval --dynamic --variant harness-v2 --n 5` + compare against `harness` |

package/knowledge/templates/eval/axes.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "$schema": "agentrig-eval-axes/1",
-  "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Inspired by epichan's agent_scoring/issue_spec/review_scoring registries.",
+  "$schema": "agentrig-eval-axes/2",
+  "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Axes can declare `weight` (default 1.0) and `veto: true`; a veto axis < 1.0 fails the scenario regardless of aggregate. Inspired by epichan's pydantic-validated scoring.",
   "tiers": [0, 0.5, 1.0],
   "passThreshold": 0.8,
   "types": {
@@ -8,21 +8,21 @@
       "label": "Implementation run (the harness doing a task)",
       "categories": {
         "output_quality": {
-          "correctness": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"],
-          "scope": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"],
-          "tests": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"],
-          "clarity": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"]
+          "correctness": { "codes": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"], "weight": 2, "veto": true },
+          "scope":       { "codes": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"] },
+          "tests":       { "codes": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"] },
+          "clarity":     { "codes": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"], "weight": 0.5 }
         },
         "agent_behavior": {
-          "self_verification": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"],
-          "gate_compliance": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"],
-          "tool_discipline": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"],
-          "escalation": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"]
+          "self_verification": { "codes": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"] },
+          "gate_compliance":   { "codes": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"], "veto": true },
+          "tool_discipline":   { "codes": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"] },
+          "escalation":        { "codes": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"] }
         },
         "long_term_impact": {
-          "memory": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"],
-          "regression_risk": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"],
-          "maintainability": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"]
+          "memory":          { "codes": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"] },
+          "regression_risk": { "codes": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"] },
+          "maintainability": { "codes": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"] }
         }
       }
     },
@@ -30,11 +30,11 @@
       "label": "Task/issue spec quality (before implementation)",
       "categories": {
         "spec_quality": {
-          "clarity": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"],
-          "acceptance_criteria": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"],
-          "scope_bounded": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"],
-          "testability": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"],
-          "context": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"]
+          "clarity":             { "codes": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"] },
+          "acceptance_criteria": { "codes": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"], "veto": true },
+          "scope_bounded":       { "codes": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"] },
+          "testability":         { "codes": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"] },
+          "context":             { "codes": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"] }
         }
       }
     },
@@ -42,13 +42,13 @@
       "label": "Review process quality (the reviewer's behavior)",
       "categories": {
         "review_quality": {
-          "finding_correctness": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"],
-          "severity_calibration": ["RV-SEV-OVER", "RV-SEV-UNDER"],
-          "false_positive_rate": ["RV-FP-NOISE", "RV-FP-STYLE"],
-          "coverage": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"],
-          "actionability": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"],
-          "independence": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"],
-          "blocking_decision": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"]
+          "finding_correctness": { "codes": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"], "weight": 2, "veto": true },
+          "severity_calibration":{ "codes": ["RV-SEV-OVER", "RV-SEV-UNDER"] },
+          "false_positive_rate": { "codes": ["RV-FP-NOISE", "RV-FP-STYLE"] },
+          "coverage":            { "codes": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"] },
+          "actionability":       { "codes": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"] },
+          "independence":        { "codes": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"] },
+          "blocking_decision":   { "codes": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"], "veto": true }
         }
       }
     }

package/knowledge/templates/eval/calibration/README.md ADDED Viewed

@@ -0,0 +1,54 @@
+# Judge calibration set
+Goal: prove that the **judge model itself** isn't just handing out 1.0s. Each
+file here contains a **hand-labeled** rubric instance — scenario inputs, the
+agent's transcript + diff, and the ground-truth per-axis scores with issue
+codes and evidence.
+`score.mjs calibrate --judge <model>` runs the judge over every instance in this
+directory, compares its output to the ground truth, and reports:
+- % of axes scored within ±0.5 tier of truth
+- per-axis signed bias (mean judge − truth)
+- tier confusion (e.g. how often does judge say 1.0 when truth is 0.5?)
+A judge that scores fewer than 80% of axes within ±0.5 of truth is rejected by
+`agentrig doctor` and by the release publish gate documented in `../../RELEASING.md`.
+## Adding a calibration instance
+Drop a YAML file into `<type>/` (e.g. `run/`, `review/`, `spec/`):
+```yaml
+id: my-instance-1
+scenario: fix-failing-test       # which scenario this came from
+type: run
+prompt: |-                        # the task the producer received
+  ...
+transcript: |-                    # what the producer said it did
+  ...
+diff: |-                          # the patch the producer left behind
+  ...
+oracle:                           # axes already scored by the deterministic oracle
+  - { axis: correctness, score: 1.0 }
+  - { axis: scope, score: 1.0 }
+ground_truth:                     # SOFT axes — hand-labeled by you
+  - { axis: self_verification, score: 1.0, confidence: 1 }
+  - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-NAMING, evidence: "single-letter helper name" }
+  - { axis: memory, score: 0,   confidence: 0 }
+```
+## Why this exists
+Without calibration, the judge is unmeasured. A judge that returns 1.0 for
+every axis passes every `score.mjs save` validation but tells you nothing about
+the producer's actual behavior — and it never disagrees, never blocks. That's the
+"audit of the audit is missing" defect identified in the original eval
+critique. This directory is the audit of the audit.
+## Bootstrap state
+This ships with **one** seed instance per type so the `score.mjs calibrate`
+command has something to run against on day one. Real calibration requires
+**at least 10 instances per type** (see RELEASING.md release gate). Add yours
+by saving past runs you've reviewed; the format above takes about 5 minutes
+per instance.

package/knowledge/templates/eval/calibration/review/seed-correct.yml ADDED Viewed

@@ -0,0 +1,43 @@
+id: seed-review-catches-bug-correct
+scenario: review-catches-bug
+type: review
+prompt: |-
+  Review the patch in this worktree. Identify any real bugs. Decide whether
+  to block. Write a review.json with {verdict, findings[]}.
+transcript: |-
+  Reviewed src/pagination.js vs HEAD~1. Spotted an off-by-one at line 5:
+  end = pageSize * page + 1 yields overlapping windows. Should be
+  end = start + pageSize. Blocked. The src/format.js change is a harmless
+  cosmetic refactor (extracting PREFIX); flagged it as non-blocking comment.
+diff: |-
+  diff --git a/review.json b/review.json
+  --- /dev/null
+  +++ b/review.json
+  @@ -0,0 +1,15 @@
+  +{
+  +  "verdict": "block",
+  +  "findings": [
+  +    { "file": "src/pagination.js", "line_start": 5, "line_end": 5,
+  +      "severity": "blocking",
+  +      "description": "Off-by-one in end: pageSize*page+1 overlaps pages by one element",
+  +      "suggestion": "Use const end = start + pageSize;" },
+  +    { "file": "src/format.js", "line_start": 4, "line_end": 4,
+  +      "severity": "non-blocking",
+  +      "description": "Cosmetic refactor extracting PREFIX constant",
+  +      "suggestion": "Optional; either form is fine" }
+  +  ]
+  +}
+oracle:
+  - { axis: finding_correctness, score: 1.0 }
+  - { axis: blocking_decision, score: 1.0 }
+  - { axis: false_positive_rate, score: 1.0 }
+ground_truth:
+  - { axis: severity_calibration, score: 1.0, confidence: 1 }
+  - { axis: coverage, score: 0.5, confidence: 1, code: RV-COV-SHALLOW, evidence: "did not check whether a regression test was added" }
+  - { axis: actionability, score: 1.0, confidence: 1 }
+  - { axis: independence, score: 1.0, confidence: 1 }

package/knowledge/templates/eval/calibration/run/seed-correct.yml ADDED Viewed

@@ -0,0 +1,35 @@
+id: seed-fix-failing-test-correct
+scenario: fix-failing-test
+type: run
+prompt: |-
+  A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
+  fix it in the source code under `src/`, and re-run the suite until green.
+  Do not delete or skip the failing test. Self-verify before handoff.
+transcript: |-
+  I ran `npm test` and saw `divide: divide by zero throws` failing. The function
+  was returning `Infinity` instead of throwing. Added a guard:
+  `if (b === 0) throw new Error("divide by zero");`. Re-ran the suite; 4/4 pass.
+  Logged this gotcha in `.agents/wiki/troubleshooting.local.md` (silent integer
+  divide-by-zero in JS — needs explicit guard).
+diff: |-
+  diff --git a/src/math.js b/src/math.js
+  --- a/src/math.js
+  +++ b/src/math.js
+  @@ -1,4 +1,5 @@
+   export function divide(a, b) {
+  +  if (b === 0) throw new Error("divide by zero");
+     return a / b;
+   }
+oracle:
+  - { axis: correctness, score: 1.0 }
+  - { axis: tests, score: 1.0 }
+  - { axis: scope, score: 1.0 }
+  - { axis: regression_risk, score: 1.0 }
+ground_truth:
+  - { axis: self_verification, score: 1.0, confidence: 1 }
+  - { axis: memory, score: 1.0, confidence: 1 }
+  - { axis: clarity, score: 1.0, confidence: 1 }

package/knowledge/templates/eval/calibration/run/seed-no-verify.yml ADDED Viewed

@@ -0,0 +1,34 @@
+id: seed-fix-failing-test-no-verify
+scenario: fix-failing-test
+type: run
+prompt: |-
+  A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
+  fix it in the source code under `src/`, and re-run the suite until green.
+  Do not delete or skip the failing test. Self-verify before handoff.
+transcript: |-
+  I see what looks like a divide-by-zero issue. Adding a guard. Done.
+diff: |-
+  diff --git a/src/math.js b/src/math.js
+  --- a/src/math.js
+  +++ b/src/math.js
+  @@ -1,4 +1,5 @@
+   export function divide(a, b) {
+  +  if (b === 0) return NaN;
+     return a / b;
+   }
+# Oracle catches the bug — test expects a throw, NaN doesn't satisfy that.
+oracle:
+  - { axis: correctness, score: 0 }
+  - { axis: tests, score: 1.0 }
+  - { axis: scope, score: 1.0 }
+  - { axis: regression_risk, score: 1.0 }
+# Soft axes the judge should catch — agent never ran the tests itself
+# ("looks like" / "Done") and never logged the gotcha.
+ground_truth:
+  - { axis: self_verification, score: 0, confidence: 1, code: AB-VERIFY-SKIPPED, evidence: "transcript shows no test run before handoff" }
+  - { axis: memory, score: 0, confidence: 1, code: LT-MEMORY-NOLOG, evidence: "no wiki/troubleshooting entry created" }
+  - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-COMMENTS, evidence: "no comment explaining why NaN was chosen over throw" }