@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +62 -27
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +30 -5
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/doctor.js +53 -8
  7. package/dist/commands/doctor.js.map +1 -1
  8. package/dist/commands/eval-dynamic.js +316 -0
  9. package/dist/commands/eval-dynamic.js.map +1 -0
  10. package/dist/commands/eval-scaffold.js +173 -0
  11. package/dist/commands/eval-scaffold.js.map +1 -0
  12. package/dist/commands/eval.js +184 -55
  13. package/dist/commands/eval.js.map +1 -1
  14. package/dist/core/audit.js +237 -9
  15. package/dist/core/audit.js.map +1 -1
  16. package/dist/core/model-family.js +31 -0
  17. package/dist/core/model-family.js.map +1 -0
  18. package/dist/core/scenario-runner.js +298 -0
  19. package/dist/core/scenario-runner.js.map +1 -0
  20. package/dist/prompts/index.js +121 -30
  21. package/dist/prompts/index.js.map +1 -1
  22. package/knowledge/PRINCIPLES.md +2 -2
  23. package/knowledge/manifest.json +16 -1
  24. package/knowledge/templates/AGENTS.md +7 -6
  25. package/knowledge/templates/agents/README.md +4 -4
  26. package/knowledge/templates/agents/developer.yml +1 -1
  27. package/knowledge/templates/agents/judge.yml +1 -1
  28. package/knowledge/templates/agents/reviewer.yml +1 -1
  29. package/knowledge/templates/agents/triager.yml +5 -4
  30. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  31. package/knowledge/templates/eval/RUBRIC.md +87 -64
  32. package/knowledge/templates/eval/axes.json +25 -25
  33. package/knowledge/templates/eval/calibration/README.md +54 -0
  34. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  35. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  36. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  37. package/knowledge/templates/eval/checks.json +88 -11
  38. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  39. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  40. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  41. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  42. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  43. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  44. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  45. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  46. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  47. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  48. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  49. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  50. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  51. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  52. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  53. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  54. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  55. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  56. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  57. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  58. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  59. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  60. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  61. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  62. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  63. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  64. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  65. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  66. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  67. package/knowledge/templates/eval/score.mjs +368 -42
  68. package/knowledge/templates/eval/static-audit.mjs +204 -17
  69. package/knowledge/templates/harness/state-machine.yml +18 -12
  70. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  71. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  72. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  73. package/package.json +4 -3
  74. package/knowledge/templates/eval/scenarios/README.md +0 -24
  75. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  76. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  77. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -1,8 +1,9 @@
1
1
  # Triager role (principle 2, 9). Moves `ingested` tasks to `queued`: recommend labels/assignees,
2
- # size the work, and gate on human approval for low-reversibility calls. Uses a fast, cheap model
3
- # on purpose — triage is high-volume and should not burn a premium tier.
2
+ # size the work, and gate on human approval for low-reversibility calls. Currently uses a top-tier
3
+ # GPT model at user request — a cheap tier (gpt-5-mini, claude-haiku-4.5) would suffice for
4
+ # routine routing and would be a sensible cost optimization if triage volume scales up.
4
5
  role: triager
5
- model: gpt-5-mini
6
- model_tier: low
6
+ model: gpt-5.5
7
+ model_tier: high
7
8
  allowed_tools: [read, grep, glob, bash]
8
9
  prompt: agents/triager.md
@@ -147,7 +147,8 @@ const tasks = loadTasks(stateLabels);
147
147
  const data = {
148
148
  generatedAt: new Date().toISOString(),
149
149
  repo: repoRoot,
150
- harnessScore: audit?.harnessScore ?? null,
150
+ installCompleteness: audit?.installCompleteness ?? null,
151
+ qualityProbes: audit?.qualityProbes ?? null,
151
152
  principles: audit?.principles ?? [],
152
153
  roster,
153
154
  tasks,
@@ -178,8 +179,12 @@ function renderTerminal(d) {
178
179
  console.log(`\n${bold("AgentRig — harness dashboard")} ${dim(d.repo)}`);
179
180
  console.log(rule);
180
181
 
181
- const scoreColor = d.harnessScore == null ? dim : d.harnessScore >= 80 ? green : d.harnessScore >= 50 ? yellow : red;
182
- console.log(`${bold("Harness Score")} ${scoreColor(d.harnessScore == null ? "n/a" : d.harnessScore + "%")}`);
182
+ const scoreColor = d.installCompleteness == null ? dim : d.installCompleteness >= 80 ? green : d.installCompleteness >= 50 ? yellow : red;
183
+ console.log(`${bold("Install Completeness")} ${scoreColor(d.installCompleteness == null ? "n/a" : d.installCompleteness + "%")}`);
184
+ if (d.qualityProbes != null) {
185
+ const qColor = d.qualityProbes >= 80 ? green : d.qualityProbes >= 50 ? yellow : red;
186
+ console.log(`${bold("Quality Probes")} ${qColor(d.qualityProbes + "%")}`);
187
+ }
183
188
  if (d.principles.length) {
184
189
  const weak = d.principles.filter((p) => p.score < 1).map((p) => `P${p.principle} ${(p.score * 100).toFixed(0)}%`);
185
190
  console.log(dim(` weak principles: ${weak.length ? weak.join(", ") : "none — all full credit"}`));
@@ -227,7 +232,8 @@ function renderTerminal(d) {
227
232
 
228
233
  function renderHtml(d) {
229
234
  const esc = (s) => String(s).replace(/[&<>]/g, (m) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[m]));
230
- const scoreClass = d.harnessScore == null ? "na" : d.harnessScore >= 80 ? "good" : d.harnessScore >= 50 ? "warn" : "bad";
235
+ const scoreClass = d.installCompleteness == null ? "na" : d.installCompleteness >= 80 ? "good" : d.installCompleteness >= 50 ? "warn" : "bad";
236
+ const qualityClass = d.qualityProbes == null ? "na" : d.qualityProbes >= 80 ? "good" : d.qualityProbes >= 50 ? "warn" : "bad";
231
237
  const rosterRows = d.roster.map((a) => `<tr><td>${esc(a.role)}</td><td>${esc(a.model || "?")}</td><td>${esc(a.tier || "")}</td></tr>`).join("");
232
238
  let tasksHtml;
233
239
  if (!d.tasks.available) {
@@ -252,7 +258,8 @@ table{border-collapse:collapse;width:100%}td,th{text-align:left;padding:.25rem .
252
258
  </style></head><body>
253
259
  <h1>AgentRig — harness dashboard</h1>
254
260
  <p class="muted">${esc(d.repo)} · generated ${esc(d.generatedAt)}</p>
255
- <h2>Harness Score</h2><p class="score ${scoreClass}">${d.harnessScore == null ? "n/a" : d.harnessScore + "%"}</p>
261
+ <h2>Install Completeness</h2><p class="score ${scoreClass}">${d.installCompleteness == null ? "n/a" : d.installCompleteness + "%"}</p>
262
+ ${d.qualityProbes != null ? `<h2>Quality Probes</h2><p class="score ${qualityClass}">${d.qualityProbes}%</p>` : ""}
256
263
  <h2>Agents (${d.roster.length})</h2><table><tr><th>Role</th><th>Model</th><th>Tier</th></tr>${rosterRows}</table>
257
264
  <h2>Tasks</h2>${tasksHtml}
258
265
  <h2>Evals</h2>${evalRows ? `<table><tr><th></th><th>Scenario</th><th>Score</th><th>Judge</th></tr>${evalRows}</table><p class="muted">overall ${d.evals.overall.toFixed(2)}</p>` : '<p class="muted">No dynamic eval runs yet.</p>'}
@@ -1,94 +1,117 @@
1
1
  # Harness evaluation rubric (principle 6)
2
2
 
3
- Two layers. **Layer A** is deterministic and model-free; **Layer B** is an independent,
4
- model-judged behavioral eval. Both write to `.agentrig/eval/results/` via `score.mjs`
5
- (never hand-edit JSON). The machine-readable rubric registry lives in
6
- [`axes.json`](./axes.json) — `score.mjs` validates every score against it.
3
+ Three layers. Each makes a different, **bounded** claim — don't over-read what any one of them proves.
4
+
5
+ | Layer | What it actually proves | What it does NOT prove | Cost |
6
+ |---|---|---|---|
7
+ | **A1 — install completeness** | every canonical artifact is present and minimally well-formed | the artifacts *work*, or that agents respect them | ~1 second, no model |
8
+ | **A2 — quality probes** | content sanity (parseable YAML/JSON, no unfilled `{{PLACEHOLDER}}`, distinct model **families**, every skill has frontmatter, axes have issue codes) | semantic quality of the content | ~1 second, no model |
9
+ | **B — dynamic behavioral eval** | how the harness *changes agent behavior* on fixed fixtures — verified by deterministic oracles for hard axes + an independent judge for soft axes, with paired sign-test lift vs a baseline | absolute "is this agent good" — only relative to baseline | minutes to hours, real model spend |
10
+
11
+ All three persist results under `.agentrig/eval/results/` via `score.mjs`. **Never hand-edit** the JSON.
12
+ The schema is validated on read (`schemaVersion: 2`) and on write — invalid records are quarantined
13
+ into `results/_legacy/`.
7
14
 
8
15
  ---
9
16
 
10
- ## Layer A — Static harness audit
11
- Scored automatically by `checks.json`. Each check maps to a principle and earns **0 / 0.5 / 1.0**.
12
- The aggregate is the **Harness Score** (0–100%). Run:
17
+ ## Layer A1 + A2 — static audit (`agentrig eval --static`)
18
+
19
+ Scored from `checks.json`. Each check earns **0 / 0.5 / 1.0** and carries a `layer` field
20
+ (`completeness` vs `quality`). Two aggregate scores:
21
+
22
+ - **Install Completeness** — was every canonical artifact installed where the manifest said it should be?
23
+ - **Quality Probes** — does the content of those artifacts pass cheap sanity checks?
13
24
 
14
25
  ```bash
15
- node .agentrig/eval/static-audit.mjs # or: agentrig eval --static
26
+ node .agentrig/eval/static-audit.mjs # human report (both layers)
27
+ node .agentrig/eval/static-audit.mjs --json # machine-readable
28
+ node .agentrig/eval/static-audit.mjs --min 80 # exit non-zero if completeness < 80%
16
29
  ```
17
30
 
18
- Treat any principle scoring < 1.0 as a missing/weak artifact to fix.
31
+ A1 is what CI gates on (`--min`). A2 surfaces drift but doesn't fail the build — it's diagnostic.
19
32
 
20
33
  ---
21
34
 
22
- ## Layer B — Dynamic behavioral eval
35
+ ## Layer B — dynamic behavioral eval (`agentrig eval --dynamic`)
23
36
 
24
- For each scenario, run the task through the harness, then have an **independent judge model**
25
- (different from the producer) score the result. Scoring is **strict 3-tier: 0 / 0.5 / 1.0**.
37
+ For each scenario:
26
38
 
27
- Three rules, enforced by `score.mjs` against `axes.json`:
28
- 1. **Issue code required.** Any axis < 1.0 (and observed) must carry an issue code **from that
29
- axis's bounded registry** plus a one-line **evidence** string. Invented codes are rejected.
30
- 2. **Confidence-gated.** An axis you couldn't observe is scored `na` (confidence 0) and excluded
31
- from rollups — partial observability never contaminates the total.
32
- 3. **Rollups are recomputed from axes.** Category and aggregate scores come from the axis data, not
33
- from anything the judge asserts.
39
+ 1. **Seed** a throwaway worktree from `scenarios/<id>/fixture/`.
40
+ 2. **Producer** (one model, runs in the worktree) executes `prompt.md`. Stage the harness or not, per `--variant`.
41
+ 3. **Oracle** (`scenarios/<id>/oracle.yml`) runs deterministic checks (commands, diff stats, file presence) → hard-axis scores.
42
+ 4. **Judge** (a *different model family*, runs in its own cwd with prompt+diff+transcript+oracle but **NOT** the producer's worktree or reasoning) scores soft axes against `axes.json`.
43
+ 5. **Save** via `score.mjs save` — validated against the rubric registry.
34
44
 
35
- ### Multi-rubric lifecycle
36
- The eval covers the whole lifecycle, not just the final patch. Three rubric **types**, linked by the
37
- same `--task` id so you get a spec → run → review view:
45
+ ### Producer/judge isolation
46
+ - The producer and the judge are **separate `provider.startConversation()` calls**. The judge never sees the producer's reasoning trace.
47
+ - `score.mjs save` rejects a record where the producer and judge share a **model family** (e.g. both `claude-*`). Override with `--allow-same-family` — and the override is recorded in the result so reviewers can spot lazy single-model setups.
48
+ - The judge writes scores via a JSON file (`<artifactsDir>/<scenario>.trial<N>.judge.json`), not free-form text. The orchestrator reads + validates it against `axes.json`.
38
49
 
39
- | `--type` | What it scores | Categories |
40
- |----------|----------------|------------|
41
- | `spec` | task/issue spec quality (before work) | spec_quality (clarity, acceptance_criteria, scope_bounded, testability, context) |
42
- | `run` | the implementation run | output_quality, agent_behavior, long_term_impact |
43
- | `review` | the reviewer's own behavior | review_quality (finding_correctness, severity_calibration, false_positive_rate, coverage, actionability, independence, blocking_decision) |
50
+ ### Rubric rules (enforced by `score.mjs`)
51
+ 1. **Strict 3-tier** scores: `0` / `0.5` / `1.0`.
52
+ 2. **Issue code required.** Any axis < 1.0 with `confidence > 0` must carry an issue code from that axis's bounded registry plus a one-line evidence string.
53
+ 3. **Confidence-gated.** An axis you couldn't observe is `=na` (confidence 0) and excluded from rollups.
54
+ 4. **Weighted aggregation.** Axes carry an optional `weight` (default 1) and `veto: true`. The aggregate is a weighted mean of observed axes.
55
+ 5. **Pass rule:** `aggregate ≥ passThreshold` **AND** no observed axis at 0 **AND** no veto axis < 1.0. Veto fails are surfaced in the `failReason` field.
44
56
 
45
- ### `run` axes (the most common)
46
- - **Output Quality** — `correctness`, `scope`, `tests`, `clarity`
47
- - **Agent Behavior** — `self_verification`, `gate_compliance`, `tool_discipline`, `escalation`
48
- - **Long-Term Impact** — `memory`, `regression_risk`, `maintainability`
57
+ ### Lifecycle types
58
+ | `--type` | Categories | Veto axes |
59
+ |---|---|---|
60
+ | `spec` | `clarity`, `acceptance_criteria`, `scope_bounded`, `testability`, `context` | `acceptance_criteria` |
61
+ | `run` | `output_quality`, `agent_behavior`, `long_term_impact` (11 axes total) | `correctness`, `gate_compliance` |
62
+ | `review` | `review_quality` (7 axes) | `finding_correctness`, `blocking_decision` |
49
63
 
50
- See `axes.json` for the full per-axis issue-code registries (e.g. `OQ-SCOPE-CHURN`,
51
- `AB-VERIFY-REDHANDOFF`, `LT-REGRESS-LIKELY`).
64
+ ### Multi-trial + statistical lift (`--n` + `compare --baseline`)
65
+
66
+ Single-trial verdicts are coin flips. The eval requires `n ≥ 3` paired trials for any verdict
67
+ other than `INCONCLUSIVE`:
52
68
 
53
- ### Saving and reading scores
54
69
  ```bash
55
- # Save one rubric (any axis < 1.0 needs CODE:evidence; use `=na` for unobserved axes)
56
- node .agentrig/eval/score.mjs save --type run --task add-small-feature \
57
- --scenario add-small-feature --judge <model> [--variant v2] [--run RID] \
58
- --axis 'correctness=1.0' \
59
- --axis 'scope=0.5:OQ-SCOPE-CHURN:left package-lock churn in the diff' \
60
- --axis 'tests=na'
61
-
62
- node .agentrig/eval/score.mjs report # latest per scenario/variant + per-axis means
63
- node .agentrig/eval/score.mjs compare --scenario <id> # A/B variants side by side
70
+ # Run both variants 5 times each.
71
+ agentrig eval --dynamic --variant harness --n 5
72
+ agentrig eval --dynamic --variant baseline --n 5
73
+
74
+ # Paired sign test, median delta, p-value:
75
+ node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
64
76
  ```
65
77
 
66
- ### A/B variant evaluation
67
- Run the **same scenario** under different harness versions (a prompt/skill/rule change) and save each
68
- under a `--variant`. `score.mjs compare` puts them side by side. **A change that lowers the score is
69
- a regression even if it "feels" better.** For deeper diffing, keep each run's `diff.patch` /
70
- `output` artifacts next to the score (see the `harness-eval` skill).
78
+ Verdicts:
79
+ - **HELPS** — p < 0.05, median delta > 0.05
80
+ - **HURTS** — p < 0.05, median delta < -0.05
81
+ - **INCONCLUSIVE** — n < 3, or p ≥ 0.05, or |median delta| < 0.05
71
82
 
72
- ### Harness lift — does it actually help? (with vs without)
73
- Prove the harness earns its keep in *your* repo by comparing a harness-on run to a harness-off
74
- baseline:
83
+ ### Sandboxing
84
+ Run dynamic evals under [`sandbox/eval-rules.md`](sandbox/eval-rules.md): the producer works in a
85
+ throwaway worktree under `$TMPDIR/agentrig-eval/<runId>/<scenario>/` and **must not push, open PRs,
86
+ or merge** — the eval measures behavior, it must not mutate real branches.
87
+
88
+ ---
89
+
90
+ ## Calibrating the judge (`calibration/`)
91
+
92
+ A judge that always returns 1.0 passes every `score.mjs save` validation but tells you nothing.
93
+ The `calibration/` directory holds **hand-labeled** rubric instances (scenario inputs + transcript +
94
+ diff + ground-truth axes). `score.mjs calibrate --judge <model>` runs your judge over them and
95
+ reports % agreement (within ±0.5 tier) and signed bias.
75
96
 
76
97
  ```bash
77
- agentrig eval --dynamic --scenario <id> --variant harness # harness ON
78
- agentrig eval --dynamic --scenario <id> --variant baseline # bare agent, no AGENTS.md/rules/skills
79
- node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
98
+ # After your judge wrote scores to /tmp/judge-out.json:
99
+ node .agentrig/eval/score.mjs calibrate \
100
+ --judge gpt-5.5 --instance .agentrig/eval/calibration/run/seed-correct.yml \
101
+ --judge-scores /tmp/judge-out.json
102
+ node .agentrig/eval/score.mjs calibrate --report
80
103
  ```
81
104
 
82
- `compare --baseline` prints the per-axis and aggregate **delta** and a `HELPS`/`HURTS` verdict. A
83
- positive aggregate delta means installing AgentRig improved agent behavior here.
84
-
85
- ### Threshold
86
- A scenario passes if its aggregate ≥ **0.8** (`passThreshold` in `axes.json`) with no observed axis
87
- at 0.
105
+ `agentrig doctor` reads the calibration rollup and flags any judge below **80% agreement**. See
106
+ [`calibration/README.md`](calibration/README.md) for the format and how to add more instances.
88
107
 
89
108
  ---
90
109
 
91
- ## Sandboxing
92
- Run dynamic evals under the guardrails in [`sandbox/eval-rules.md`](./sandbox/eval-rules.md): the
93
- agent works in a throwaway worktree and must **not push, open PRs, or merge** — the eval measures
94
- behavior, it must not mutate real branches.
110
+ ## When to run what
111
+
112
+ | When | What |
113
+ |---|---|
114
+ | Every PR | A1 + A2 via `eval --static` (CI gate at `--min 80` or higher) |
115
+ | Nightly on main | Layer B with `--n 5` × `harness` and `baseline`, then `compare --baseline baseline` |
116
+ | Before releasing AgentRig | `score.mjs calibrate --report` ≥ 80% for default judge |
117
+ | When prompts/skills/rules change | Manual `eval --dynamic --variant harness-v2 --n 5` + compare against `harness` |
@@ -1,6 +1,6 @@
1
1
  {
2
- "$schema": "agentrig-eval-axes/1",
3
- "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Inspired by epichan's agent_scoring/issue_spec/review_scoring registries.",
2
+ "$schema": "agentrig-eval-axes/2",
3
+ "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Axes can declare `weight` (default 1.0) and `veto: true`; a veto axis < 1.0 fails the scenario regardless of aggregate. Inspired by epichan's pydantic-validated scoring.",
4
4
  "tiers": [0, 0.5, 1.0],
5
5
  "passThreshold": 0.8,
6
6
  "types": {
@@ -8,21 +8,21 @@
8
8
  "label": "Implementation run (the harness doing a task)",
9
9
  "categories": {
10
10
  "output_quality": {
11
- "correctness": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"],
12
- "scope": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"],
13
- "tests": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"],
14
- "clarity": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"]
11
+ "correctness": { "codes": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"], "weight": 2, "veto": true },
12
+ "scope": { "codes": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"] },
13
+ "tests": { "codes": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"] },
14
+ "clarity": { "codes": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"], "weight": 0.5 }
15
15
  },
16
16
  "agent_behavior": {
17
- "self_verification": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"],
18
- "gate_compliance": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"],
19
- "tool_discipline": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"],
20
- "escalation": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"]
17
+ "self_verification": { "codes": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"] },
18
+ "gate_compliance": { "codes": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"], "veto": true },
19
+ "tool_discipline": { "codes": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"] },
20
+ "escalation": { "codes": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"] }
21
21
  },
22
22
  "long_term_impact": {
23
- "memory": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"],
24
- "regression_risk": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"],
25
- "maintainability": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"]
23
+ "memory": { "codes": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"] },
24
+ "regression_risk": { "codes": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"] },
25
+ "maintainability": { "codes": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"] }
26
26
  }
27
27
  }
28
28
  },
@@ -30,11 +30,11 @@
30
30
  "label": "Task/issue spec quality (before implementation)",
31
31
  "categories": {
32
32
  "spec_quality": {
33
- "clarity": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"],
34
- "acceptance_criteria": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"],
35
- "scope_bounded": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"],
36
- "testability": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"],
37
- "context": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"]
33
+ "clarity": { "codes": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"] },
34
+ "acceptance_criteria": { "codes": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"], "veto": true },
35
+ "scope_bounded": { "codes": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"] },
36
+ "testability": { "codes": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"] },
37
+ "context": { "codes": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"] }
38
38
  }
39
39
  }
40
40
  },
@@ -42,13 +42,13 @@
42
42
  "label": "Review process quality (the reviewer's behavior)",
43
43
  "categories": {
44
44
  "review_quality": {
45
- "finding_correctness": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"],
46
- "severity_calibration": ["RV-SEV-OVER", "RV-SEV-UNDER"],
47
- "false_positive_rate": ["RV-FP-NOISE", "RV-FP-STYLE"],
48
- "coverage": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"],
49
- "actionability": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"],
50
- "independence": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"],
51
- "blocking_decision": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"]
45
+ "finding_correctness": { "codes": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"], "weight": 2, "veto": true },
46
+ "severity_calibration":{ "codes": ["RV-SEV-OVER", "RV-SEV-UNDER"] },
47
+ "false_positive_rate": { "codes": ["RV-FP-NOISE", "RV-FP-STYLE"] },
48
+ "coverage": { "codes": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"] },
49
+ "actionability": { "codes": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"] },
50
+ "independence": { "codes": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"] },
51
+ "blocking_decision": { "codes": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"], "veto": true }
52
52
  }
53
53
  }
54
54
  }
@@ -0,0 +1,54 @@
1
+ # Judge calibration set
2
+
3
+ Goal: prove that the **judge model itself** isn't just handing out 1.0s. Each
4
+ file here contains a **hand-labeled** rubric instance — scenario inputs, the
5
+ agent's transcript + diff, and the ground-truth per-axis scores with issue
6
+ codes and evidence.
7
+
8
+ `score.mjs calibrate --judge <model>` runs the judge over every instance in this
9
+ directory, compares its output to the ground truth, and reports:
10
+
11
+ - % of axes scored within ±0.5 tier of truth
12
+ - per-axis signed bias (mean judge − truth)
13
+ - tier confusion (e.g. how often does judge say 1.0 when truth is 0.5?)
14
+
15
+ A judge below 80% within-±0.5 is rejected by `agentrig doctor` and by the
16
+ release publish gate documented in `../../RELEASING.md`.
17
+
18
+ ## Adding a calibration instance
19
+
20
+ Drop a YAML file into `<type>/` (e.g. `run/`, `review/`, `spec/`):
21
+
22
+ ```yaml
23
+ id: my-instance-1
24
+ scenario: fix-failing-test # which scenario this came from
25
+ type: run
26
+ prompt: |- # the task the producer received
27
+ ...
28
+ transcript: |- # what the producer said it did
29
+ ...
30
+ diff: |- # the patch the producer left behind
31
+ ...
32
+ oracle: # axes already scored by the deterministic oracle
33
+ - { axis: correctness, score: 1.0 }
34
+ - { axis: scope, score: 1.0 }
35
+ ground_truth: # SOFT axes — hand-labeled by you
36
+ - { axis: self_verification, score: 1.0, confidence: 1 }
37
+ - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-NAMING, evidence: "single-letter helper name" }
38
+ - { axis: memory, score: 0, confidence: 0 }
39
+ ```
40
+
41
+ ## Why this exists
42
+ Without calibration, the judge is unmeasured. A judge that returns 1.0 for
43
+ every axis passes every `score.mjs save` validation but tells you nothing about
44
+ the producer's actual behavior — and never disagrees, never blocks. That's the
45
+ "the audit of the audit is missing" defect identified in the original eval
46
+ critique. This directory is the audit of the audit.
47
+
48
+ ## Bootstrap state
49
+
50
+ This ships with a few seed instances (two for `run`, one for `review`) so the `score.mjs calibrate`
51
+ command has something to run against on day one. Real calibration requires
52
+ **at least 10 instances per type** (see RELEASING.md release gate). Add yours
53
+ by saving past runs you've reviewed; the format above takes about 5 minutes
54
+ per instance.
@@ -0,0 +1,43 @@
1
+ id: seed-review-catches-bug-correct
2
+ scenario: review-catches-bug
3
+ type: review
4
+
5
+ prompt: |-
6
+ Review the patch in this worktree. Identify any real bugs. Decide whether
7
+ to block. Write a review.json with {verdict, findings[]}.
8
+
9
+ transcript: |-
10
+ Reviewed src/pagination.js vs HEAD~1. Spotted an off-by-one at line 5:
11
+ end = pageSize * page + 1 yields overlapping windows. Should be
12
+ end = start + pageSize. Blocked. The src/format.js change is a harmless
13
+ cosmetic refactor (extracting PREFIX); flagged it as non-blocking comment.
14
+
15
+ diff: |-
16
+ diff --git a/review.json b/review.json
17
+ --- /dev/null
18
+ +++ b/review.json
19
+ @@ -0,0 +1,15 @@
20
+ +{
21
+ + "verdict": "block",
22
+ + "findings": [
23
+ + { "file": "src/pagination.js", "line_start": 5, "line_end": 5,
24
+ + "severity": "blocking",
25
+ + "description": "Off-by-one in end: pageSize*page+1 overlaps pages by one element",
26
+ + "suggestion": "Use const end = start + pageSize;" },
27
+ + { "file": "src/format.js", "line_start": 4, "line_end": 4,
28
+ + "severity": "non-blocking",
29
+ + "description": "Cosmetic refactor extracting PREFIX constant",
30
+ + "suggestion": "Optional; either form is fine" }
31
+ + ]
32
+ +}
33
+
34
+ oracle:
35
+ - { axis: finding_correctness, score: 1.0 }
36
+ - { axis: blocking_decision, score: 1.0 }
37
+ - { axis: false_positive_rate, score: 1.0 }
38
+
39
+ ground_truth:
40
+ - { axis: severity_calibration, score: 1.0, confidence: 1 }
41
+ - { axis: coverage, score: 0.5, confidence: 1, code: RV-COV-SHALLOW, evidence: "did not check whether a regression test was added" }
42
+ - { axis: actionability, score: 1.0, confidence: 1 }
43
+ - { axis: independence, score: 1.0, confidence: 1 }
@@ -0,0 +1,35 @@
1
+ id: seed-fix-failing-test-correct
2
+ scenario: fix-failing-test
3
+ type: run
4
+ prompt: |-
5
+ A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
6
+ fix it in the source code under `src/`, and re-run the suite until green.
7
+ Do not delete or skip the failing test. Self-verify before handoff.
8
+
9
+ transcript: |-
10
+ I ran `npm test` and saw `divide: divide by zero throws` failing. The function
11
+ was returning `Infinity` instead of throwing. Added a guard:
12
+ `if (b === 0) throw new Error("divide by zero");`. Re-ran the suite; 4/4 pass.
13
+ Logged this gotcha in `.agents/wiki/troubleshooting.local.md` (silent integer
14
+ divide-by-zero in JS — needs explicit guard).
15
+
16
+ diff: |-
17
+ diff --git a/src/math.js b/src/math.js
18
+ --- a/src/math.js
19
+ +++ b/src/math.js
20
+ @@ -1,4 +1,5 @@
21
+ export function divide(a, b) {
22
+ + if (b === 0) throw new Error("divide by zero");
23
+ return a / b;
24
+ }
25
+
26
+ oracle:
27
+ - { axis: correctness, score: 1.0 }
28
+ - { axis: tests, score: 1.0 }
29
+ - { axis: scope, score: 1.0 }
30
+ - { axis: regression_risk, score: 1.0 }
31
+
32
+ ground_truth:
33
+ - { axis: self_verification, score: 1.0, confidence: 1 }
34
+ - { axis: memory, score: 1.0, confidence: 1 }
35
+ - { axis: clarity, score: 1.0, confidence: 1 }
@@ -0,0 +1,34 @@
1
+ id: seed-fix-failing-test-no-verify
2
+ scenario: fix-failing-test
3
+ type: run
4
+ prompt: |-
5
+ A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
6
+ fix it in the source code under `src/`, and re-run the suite until green.
7
+ Do not delete or skip the failing test. Self-verify before handoff.
8
+
9
+ transcript: |-
10
+ I see what looks like a divide-by-zero issue. Adding a guard. Done.
11
+
12
+ diff: |-
13
+ diff --git a/src/math.js b/src/math.js
14
+ --- a/src/math.js
15
+ +++ b/src/math.js
16
+ @@ -1,4 +1,5 @@
17
+ export function divide(a, b) {
18
+ + if (b === 0) return NaN;
19
+ return a / b;
20
+ }
21
+
22
+ # Oracle catches the bug — test expects a throw, NaN doesn't satisfy that.
23
+ oracle:
24
+ - { axis: correctness, score: 0 }
25
+ - { axis: tests, score: 1.0 }
26
+ - { axis: scope, score: 1.0 }
27
+ - { axis: regression_risk, score: 1.0 }
28
+
29
+ # Soft axes the judge should catch — agent never ran the tests itself
30
+ # ("looks like" / "Done") and never logged the gotcha.
31
+ ground_truth:
32
+ - { axis: self_verification, score: 0, confidence: 1, code: AB-VERIFY-SKIPPED, evidence: "transcript shows no test run before handoff" }
33
+ - { axis: memory, score: 0, confidence: 1, code: LT-MEMORY-NOLOG, evidence: "no wiki/troubleshooting entry created" }
34
+ - { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-COMMENTS, evidence: "no comment explaining why NaN was chosen over throw" }