@doidor/agentrig 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -27
- package/dist/agent/copilot.js +46 -5
- package/dist/agent/copilot.js.map +1 -1
- package/dist/cli.js +30 -5
- package/dist/cli.js.map +1 -1
- package/dist/commands/doctor.js +53 -8
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/eval-dynamic.js +316 -0
- package/dist/commands/eval-dynamic.js.map +1 -0
- package/dist/commands/eval-scaffold.js +173 -0
- package/dist/commands/eval-scaffold.js.map +1 -0
- package/dist/commands/eval.js +184 -55
- package/dist/commands/eval.js.map +1 -1
- package/dist/core/audit.js +237 -9
- package/dist/core/audit.js.map +1 -1
- package/dist/core/model-family.js +31 -0
- package/dist/core/model-family.js.map +1 -0
- package/dist/core/scenario-runner.js +298 -0
- package/dist/core/scenario-runner.js.map +1 -0
- package/dist/prompts/index.js +121 -30
- package/dist/prompts/index.js.map +1 -1
- package/knowledge/PRINCIPLES.md +2 -2
- package/knowledge/manifest.json +16 -1
- package/knowledge/templates/AGENTS.md +7 -6
- package/knowledge/templates/agents/README.md +4 -4
- package/knowledge/templates/agents/developer.yml +1 -1
- package/knowledge/templates/agents/judge.yml +1 -1
- package/knowledge/templates/agents/reviewer.yml +1 -1
- package/knowledge/templates/agents/triager.yml +5 -4
- package/knowledge/templates/dashboard/dashboard.mjs +12 -5
- package/knowledge/templates/eval/RUBRIC.md +87 -64
- package/knowledge/templates/eval/axes.json +25 -25
- package/knowledge/templates/eval/calibration/README.md +54 -0
- package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
- package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
- package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
- package/knowledge/templates/eval/checks.json +88 -11
- package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
- package/knowledge/templates/eval/score.mjs +368 -42
- package/knowledge/templates/eval/static-audit.mjs +204 -17
- package/knowledge/templates/harness/state-machine.yml +18 -12
- package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
- package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
- package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
- package/package.json +4 -3
- package/knowledge/templates/eval/scenarios/README.md +0 -24
- package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
- package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
- package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# Triager role (principle 2, 9). Moves `ingested` tasks to `queued`: recommend labels/assignees,
|
|
2
|
-
# size the work, and gate on human approval for low-reversibility calls.
|
|
3
|
-
#
|
|
2
|
+
# size the work, and gate on human approval for low-reversibility calls. Currently uses a top-tier
|
|
3
|
+
# GPT model at user request — a cheap tier (gpt-5-mini, claude-haiku-4.5) would suffice for
|
|
4
|
+
# routine routing and would be a sensible cost optimization if triage volume scales up.
|
|
4
5
|
role: triager
|
|
5
|
-
model: gpt-5
|
|
6
|
-
model_tier:
|
|
6
|
+
model: gpt-5.5
|
|
7
|
+
model_tier: high
|
|
7
8
|
allowed_tools: [read, grep, glob, bash]
|
|
8
9
|
prompt: agents/triager.md
|
|
@@ -147,7 +147,8 @@ const tasks = loadTasks(stateLabels);
|
|
|
147
147
|
const data = {
|
|
148
148
|
generatedAt: new Date().toISOString(),
|
|
149
149
|
repo: repoRoot,
|
|
150
|
-
|
|
150
|
+
installCompleteness: audit?.installCompleteness ?? null,
|
|
151
|
+
qualityProbes: audit?.qualityProbes ?? null,
|
|
151
152
|
principles: audit?.principles ?? [],
|
|
152
153
|
roster,
|
|
153
154
|
tasks,
|
|
@@ -178,8 +179,12 @@ function renderTerminal(d) {
|
|
|
178
179
|
console.log(`\n${bold("AgentRig — harness dashboard")} ${dim(d.repo)}`);
|
|
179
180
|
console.log(rule);
|
|
180
181
|
|
|
181
|
-
const scoreColor = d.
|
|
182
|
-
console.log(`${bold("
|
|
182
|
+
const scoreColor = d.installCompleteness == null ? dim : d.installCompleteness >= 80 ? green : d.installCompleteness >= 50 ? yellow : red;
|
|
183
|
+
console.log(`${bold("Install Completeness")} ${scoreColor(d.installCompleteness == null ? "n/a" : d.installCompleteness + "%")}`);
|
|
184
|
+
if (d.qualityProbes != null) {
|
|
185
|
+
const qColor = d.qualityProbes >= 80 ? green : d.qualityProbes >= 50 ? yellow : red;
|
|
186
|
+
console.log(`${bold("Quality Probes")} ${qColor(d.qualityProbes + "%")}`);
|
|
187
|
+
}
|
|
183
188
|
if (d.principles.length) {
|
|
184
189
|
const weak = d.principles.filter((p) => p.score < 1).map((p) => `P${p.principle} ${(p.score * 100).toFixed(0)}%`);
|
|
185
190
|
console.log(dim(` weak principles: ${weak.length ? weak.join(", ") : "none — all full credit"}`));
|
|
@@ -227,7 +232,8 @@ function renderTerminal(d) {
|
|
|
227
232
|
|
|
228
233
|
function renderHtml(d) {
|
|
229
234
|
// Escape the HTML-significant characters (&, <, >) so interpolated values render as text, not markup.
// Each character must map to its entity; an identity map would leave the output unescaped.
const esc = (s) => String(s).replace(/[&<>]/g, (m) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[m]));
|
|
230
|
-
const scoreClass = d.
|
|
235
|
+
const scoreClass = d.installCompleteness == null ? "na" : d.installCompleteness >= 80 ? "good" : d.installCompleteness >= 50 ? "warn" : "bad";
|
|
236
|
+
const qualityClass = d.qualityProbes == null ? "na" : d.qualityProbes >= 80 ? "good" : d.qualityProbes >= 50 ? "warn" : "bad";
|
|
231
237
|
const rosterRows = d.roster.map((a) => `<tr><td>${esc(a.role)}</td><td>${esc(a.model || "?")}</td><td>${esc(a.tier || "")}</td></tr>`).join("");
|
|
232
238
|
let tasksHtml;
|
|
233
239
|
if (!d.tasks.available) {
|
|
@@ -252,7 +258,8 @@ table{border-collapse:collapse;width:100%}td,th{text-align:left;padding:.25rem .
|
|
|
252
258
|
</style></head><body>
|
|
253
259
|
<h1>AgentRig — harness dashboard</h1>
|
|
254
260
|
<p class="muted">${esc(d.repo)} · generated ${esc(d.generatedAt)}</p>
|
|
255
|
-
<h2>
|
|
261
|
+
<h2>Install Completeness</h2><p class="score ${scoreClass}">${d.installCompleteness == null ? "n/a" : d.installCompleteness + "%"}</p>
|
|
262
|
+
${d.qualityProbes != null ? `<h2>Quality Probes</h2><p class="score ${qualityClass}">${d.qualityProbes}%</p>` : ""}
|
|
256
263
|
<h2>Agents (${d.roster.length})</h2><table><tr><th>Role</th><th>Model</th><th>Tier</th></tr>${rosterRows}</table>
|
|
257
264
|
<h2>Tasks</h2>${tasksHtml}
|
|
258
265
|
<h2>Evals</h2>${evalRows ? `<table><tr><th></th><th>Scenario</th><th>Score</th><th>Judge</th></tr>${evalRows}</table><p class="muted">overall ${d.evals.overall.toFixed(2)}</p>` : '<p class="muted">No dynamic eval runs yet.</p>'}
|
|
@@ -1,94 +1,117 @@
|
|
|
1
1
|
# Harness evaluation rubric (principle 6)
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
Three layers. Each makes a different, **bounded** claim — don't over-read what any one of them proves.
|
|
4
|
+
|
|
5
|
+
| Layer | What it actually proves | What it does NOT prove | Cost |
|
|
6
|
+
|---|---|---|---|
|
|
7
|
+
| **A1 — install completeness** | every canonical artifact is present and minimally well-formed | the artifacts *work*, or that agents respect them | ~1 second, no model |
|
|
8
|
+
| **A2 — quality probes** | content sanity (parseable YAML/JSON, no unfilled `{{PLACEHOLDER}}`, distinct model **families**, every skill has frontmatter, axes have issue codes) | semantic quality of the content | ~1 second, no model |
|
|
9
|
+
| **B — dynamic behavioral eval** | how the harness *changes agent behavior* on fixed fixtures — verified by deterministic oracles for hard axes + an independent judge for soft axes, with paired sign-test lift vs a baseline | absolute "is this agent good" — only relative to baseline | minutes to hours, real model spend |
|
|
10
|
+
|
|
11
|
+
All three persist results under `.agentrig/eval/results/` via `score.mjs`. **Never hand-edit** the JSON.
|
|
12
|
+
The schema is validated on read (`schemaVersion: 2`) and on write — invalid records are quarantined
|
|
13
|
+
into `results/_legacy/`.
|
|
7
14
|
|
|
8
15
|
---
|
|
9
16
|
|
|
10
|
-
## Layer
|
|
11
|
-
|
|
12
|
-
|
|
17
|
+
## Layer A1 + A2 — static audit (`agentrig eval --static`)
|
|
18
|
+
|
|
19
|
+
Scored from `checks.json`. Each check earns **0 / 0.5 / 1.0** and carries a `layer` field
|
|
20
|
+
(`completeness` vs `quality`). Two aggregate scores:
|
|
21
|
+
|
|
22
|
+
- **Install Completeness** — was every canonical artifact installed where the manifest said it should be?
|
|
23
|
+
- **Quality Probes** — does the content of those artifacts pass cheap sanity checks?
|
|
13
24
|
|
|
14
25
|
```bash
|
|
15
|
-
node .agentrig/eval/static-audit.mjs
|
|
26
|
+
node .agentrig/eval/static-audit.mjs # human report (both layers)
|
|
27
|
+
node .agentrig/eval/static-audit.mjs --json # machine-readable
|
|
28
|
+
node .agentrig/eval/static-audit.mjs --min 80 # exit non-zero if completeness < 80%
|
|
16
29
|
```
|
|
17
30
|
|
|
18
|
-
|
|
31
|
+
A1 is what CI gates on (`--min`). A2 surfaces drift but doesn't fail the build — it's diagnostic.
|
|
19
32
|
|
|
20
33
|
---
|
|
21
34
|
|
|
22
|
-
## Layer B —
|
|
35
|
+
## Layer B — dynamic behavioral eval (`agentrig eval --dynamic`)
|
|
23
36
|
|
|
24
|
-
For each scenario
|
|
25
|
-
(different from the producer) score the result. Scoring is **strict 3-tier: 0 / 0.5 / 1.0**.
|
|
37
|
+
For each scenario:
|
|
26
38
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
3. **Rollups are recomputed from axes.** Category and aggregate scores come from the axis data, not
|
|
33
|
-
from anything the judge asserts.
|
|
39
|
+
1. **Seed** a throwaway worktree from `scenarios/<id>/fixture/`.
|
|
40
|
+
2. **Producer** (one model, runs in the worktree) executes `prompt.md`. Stage the harness or not, per `--variant`.
|
|
41
|
+
3. **Oracle** (`scenarios/<id>/oracle.yml`) runs deterministic checks (commands, diff stats, file presence) → hard-axis scores.
|
|
42
|
+
4. **Judge** (a *different model family*, runs in its own cwd with prompt+diff+transcript+oracle but **NOT** the producer's worktree or reasoning) scores soft axes against `axes.json`.
|
|
43
|
+
5. **Save** via `score.mjs save` — validated against the rubric registry.
|
|
34
44
|
|
|
35
|
-
###
|
|
36
|
-
The
|
|
37
|
-
|
|
45
|
+
### Producer/judge isolation
|
|
46
|
+
- The producer and the judge are **separate `provider.startConversation()` calls**. The judge never sees the producer's reasoning trace.
|
|
47
|
+
- `score.mjs save` rejects a record where the producer and judge share a **model family** (e.g. both `claude-*`). Override with `--allow-same-family` — and the override is recorded in the result so reviewers can spot lazy single-model setups.
|
|
48
|
+
- The judge writes scores via a JSON file (`<artifactsDir>/<scenario>.trial<N>.judge.json`), not free-form text. The orchestrator reads + validates it against `axes.json`.
|
|
38
49
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
50
|
+
### Rubric rules (enforced by `score.mjs`)
|
|
51
|
+
1. **Strict 3-tier** scores: `0` / `0.5` / `1.0`.
|
|
52
|
+
2. **Issue code required.** Any axis < 1.0 with `confidence > 0` must carry an issue code from that axis's bounded registry plus a one-line evidence string.
|
|
53
|
+
3. **Confidence-gated.** An axis you couldn't observe is `=na` (confidence 0) and excluded from rollups.
|
|
54
|
+
4. **Weighted aggregation.** Each axis may declare an optional `weight` (default 1) and an optional `veto: true` flag. The aggregate is a weighted mean of observed axes.
|
|
55
|
+
5. **Pass rule:** `aggregate ≥ passThreshold` **AND** no observed axis at 0 **AND** no veto axis < 1.0. Veto fails are surfaced in the `failReason` field.
|
|
44
56
|
|
|
45
|
-
###
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
57
|
+
### Lifecycle types
|
|
58
|
+
| `--type` | Categories | Veto axes |
|
|
59
|
+
|---|---|---|
|
|
60
|
+
| `spec` | `spec_quality` (5 axes: `clarity`, `acceptance_criteria`, `scope_bounded`, `testability`, `context`) | `acceptance_criteria` |
|
|
61
|
+
| `run` | `output_quality`, `agent_behavior`, `long_term_impact` (11 axes total) | `correctness`, `gate_compliance` |
|
|
62
|
+
| `review` | `review_quality` (7 axes) | `finding_correctness`, `blocking_decision` |
|
|
49
63
|
|
|
50
|
-
|
|
51
|
-
|
|
64
|
+
### Multi-trial + statistical lift (`--n` + `compare --baseline`)
|
|
65
|
+
|
|
66
|
+
Single-trial verdicts are coin flips. The eval requires `n ≥ 3` paired trials for any verdict
|
|
67
|
+
other than `INCONCLUSIVE`:
|
|
52
68
|
|
|
53
|
-
### Saving and reading scores
|
|
54
69
|
```bash
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
node .agentrig/eval/score.mjs report # latest per scenario/variant + per-axis means
|
|
63
|
-
node .agentrig/eval/score.mjs compare --scenario <id> # A/B variants side by side
|
|
70
|
+
# Run both variants 5 times each.
|
|
71
|
+
agentrig eval --dynamic --variant harness --n 5
|
|
72
|
+
agentrig eval --dynamic --variant baseline --n 5
|
|
73
|
+
|
|
74
|
+
# Paired sign test, median delta, p-value:
|
|
75
|
+
node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
|
|
64
76
|
```
|
|
65
77
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
`output` artifacts next to the score (see the `harness-eval` skill).
|
|
78
|
+
Verdicts:
|
|
79
|
+
- **HELPS** — p < 0.05, median delta > 0.05
|
|
80
|
+
- **HURTS** — p < 0.05, median delta < -0.05
|
|
81
|
+
- **INCONCLUSIVE** — n < 3, or p ≥ 0.05, or |median delta| ≤ 0.05
|
|
71
82
|
|
|
72
|
-
###
|
|
73
|
-
|
|
74
|
-
|
|
83
|
+
### Sandboxing
|
|
84
|
+
Run dynamic evals under [`sandbox/eval-rules.md`](sandbox/eval-rules.md): the producer works in a
|
|
85
|
+
throwaway worktree under `$TMPDIR/agentrig-eval/<runId>/<scenario>/` and **must not push, open PRs,
|
|
86
|
+
or merge** — the eval measures behavior, it must not mutate real branches.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Calibrating the judge (`calibration/`)
|
|
91
|
+
|
|
92
|
+
A judge that always returns 1.0 passes every `score.mjs save` validation but tells you nothing.
|
|
93
|
+
The `calibration/` directory holds **hand-labeled** rubric instances (scenario inputs + transcript +
|
|
94
|
+
diff + ground-truth axes). `score.mjs calibrate --judge <model>` runs your judge over them and
|
|
95
|
+
reports % agreement (judge score within ±0.5, i.e. one tier, of the ground truth) and signed bias.
|
|
75
96
|
|
|
76
97
|
```bash
|
|
77
|
-
|
|
78
|
-
agentrig
|
|
79
|
-
|
|
98
|
+
# After your judge wrote scores to /tmp/judge-out.json:
|
|
99
|
+
node .agentrig/eval/score.mjs calibrate \
|
|
100
|
+
--judge gpt-5.5 --instance .agentrig/eval/calibration/run/seed-correct.yml \
|
|
101
|
+
--judge-scores /tmp/judge-out.json
|
|
102
|
+
node .agentrig/eval/score.mjs calibrate --report
|
|
80
103
|
```
|
|
81
104
|
|
|
82
|
-
`
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
### Threshold
|
|
86
|
-
A scenario passes if its aggregate ≥ **0.8** (`passThreshold` in `axes.json`) with no observed axis
|
|
87
|
-
at 0.
|
|
105
|
+
`agentrig doctor` reads the calibration rollup and flags any judge below **80% agreement**. See
|
|
106
|
+
[`calibration/README.md`](calibration/README.md) for the format and how to add more instances.
|
|
88
107
|
|
|
89
108
|
---
|
|
90
109
|
|
|
91
|
-
##
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
110
|
+
## When to run what
|
|
111
|
+
|
|
112
|
+
| When | What |
|
|
113
|
+
|---|---|
|
|
114
|
+
| Every PR | A1 + A2 via `eval --static` (CI gate at `--min 80` or higher) |
|
|
115
|
+
| Nightly on main | Layer B with `--n 5` × `harness` and `baseline`, then `compare --baseline baseline` |
|
|
116
|
+
| Before releasing AgentRig | `score.mjs calibrate --report` ≥ 80% for default judge |
|
|
117
|
+
| When prompts/skills/rules change | Manual `eval --dynamic --variant harness-v2 --n 5` + compare against `harness` |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"$schema": "agentrig-eval-axes/
|
|
3
|
-
"description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Inspired by epichan's
|
|
2
|
+
"$schema": "agentrig-eval-axes/2",
|
|
3
|
+
"description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Axes can declare `weight` (default 1.0) and `veto: true`; a veto axis < 1.0 fails the scenario regardless of aggregate. Inspired by epichan's pydantic-validated scoring.",
|
|
4
4
|
"tiers": [0, 0.5, 1.0],
|
|
5
5
|
"passThreshold": 0.8,
|
|
6
6
|
"types": {
|
|
@@ -8,21 +8,21 @@
|
|
|
8
8
|
"label": "Implementation run (the harness doing a task)",
|
|
9
9
|
"categories": {
|
|
10
10
|
"output_quality": {
|
|
11
|
-
"correctness": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"],
|
|
12
|
-
"scope": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"],
|
|
13
|
-
"tests": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"],
|
|
14
|
-
"clarity": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"]
|
|
11
|
+
"correctness": { "codes": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"], "weight": 2, "veto": true },
|
|
12
|
+
"scope": { "codes": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"] },
|
|
13
|
+
"tests": { "codes": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"] },
|
|
14
|
+
"clarity": { "codes": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"], "weight": 0.5 }
|
|
15
15
|
},
|
|
16
16
|
"agent_behavior": {
|
|
17
|
-
"self_verification": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"],
|
|
18
|
-
"gate_compliance": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"],
|
|
19
|
-
"tool_discipline": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"],
|
|
20
|
-
"escalation": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"]
|
|
17
|
+
"self_verification": { "codes": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"] },
|
|
18
|
+
"gate_compliance": { "codes": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"], "veto": true },
|
|
19
|
+
"tool_discipline": { "codes": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"] },
|
|
20
|
+
"escalation": { "codes": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"] }
|
|
21
21
|
},
|
|
22
22
|
"long_term_impact": {
|
|
23
|
-
"memory": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"],
|
|
24
|
-
"regression_risk": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"],
|
|
25
|
-
"maintainability": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"]
|
|
23
|
+
"memory": { "codes": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"] },
|
|
24
|
+
"regression_risk": { "codes": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"] },
|
|
25
|
+
"maintainability": { "codes": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"] }
|
|
26
26
|
}
|
|
27
27
|
}
|
|
28
28
|
},
|
|
@@ -30,11 +30,11 @@
|
|
|
30
30
|
"label": "Task/issue spec quality (before implementation)",
|
|
31
31
|
"categories": {
|
|
32
32
|
"spec_quality": {
|
|
33
|
-
"clarity": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"],
|
|
34
|
-
"acceptance_criteria": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"],
|
|
35
|
-
"scope_bounded": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"],
|
|
36
|
-
"testability": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"],
|
|
37
|
-
"context": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"]
|
|
33
|
+
"clarity": { "codes": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"] },
|
|
34
|
+
"acceptance_criteria": { "codes": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"], "veto": true },
|
|
35
|
+
"scope_bounded": { "codes": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"] },
|
|
36
|
+
"testability": { "codes": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"] },
|
|
37
|
+
"context": { "codes": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"] }
|
|
38
38
|
}
|
|
39
39
|
}
|
|
40
40
|
},
|
|
@@ -42,13 +42,13 @@
|
|
|
42
42
|
"label": "Review process quality (the reviewer's behavior)",
|
|
43
43
|
"categories": {
|
|
44
44
|
"review_quality": {
|
|
45
|
-
"finding_correctness": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"],
|
|
46
|
-
"severity_calibration": ["RV-SEV-OVER", "RV-SEV-UNDER"],
|
|
47
|
-
"false_positive_rate": ["RV-FP-NOISE", "RV-FP-STYLE"],
|
|
48
|
-
"coverage": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"],
|
|
49
|
-
"actionability": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"],
|
|
50
|
-
"independence": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"],
|
|
51
|
-
"blocking_decision": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"]
|
|
45
|
+
"finding_correctness": { "codes": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"], "weight": 2, "veto": true },
|
|
46
|
+
"severity_calibration":{ "codes": ["RV-SEV-OVER", "RV-SEV-UNDER"] },
|
|
47
|
+
"false_positive_rate": { "codes": ["RV-FP-NOISE", "RV-FP-STYLE"] },
|
|
48
|
+
"coverage": { "codes": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"] },
|
|
49
|
+
"actionability": { "codes": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"] },
|
|
50
|
+
"independence": { "codes": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"] },
|
|
51
|
+
"blocking_decision": { "codes": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"], "veto": true }
|
|
52
52
|
}
|
|
53
53
|
}
|
|
54
54
|
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Judge calibration set
|
|
2
|
+
|
|
3
|
+
Goal: prove that the **judge model itself** isn't just handing out 1.0s. Each
|
|
4
|
+
file here contains a **hand-labeled** rubric instance — scenario inputs, the
|
|
5
|
+
agent's transcript + diff, and the ground-truth per-axis scores with issue
|
|
6
|
+
codes and evidence.
|
|
7
|
+
|
|
8
|
+
`score.mjs calibrate --judge <model>` runs the judge over every instance in this
|
|
9
|
+
directory, compares its output to the ground truth, and reports:
|
|
10
|
+
|
|
11
|
+
- % of axes scored within ±0.5 (one tier) of truth
|
|
12
|
+
- per-axis signed bias (mean judge − truth)
|
|
13
|
+
- tier confusion (e.g. how often does judge say 1.0 when truth is 0.5?)
|
|
14
|
+
|
|
15
|
+
A judge scoring below 80% agreement (within ±0.5) is rejected by `agentrig doctor` and by the
|
|
16
|
+
release publish gate documented in `../../RELEASING.md`.
|
|
17
|
+
|
|
18
|
+
## Adding a calibration instance
|
|
19
|
+
|
|
20
|
+
Drop a YAML file into `<type>/` (e.g. `run/`, `review/`, `spec/`):
|
|
21
|
+
|
|
22
|
+
```yaml
|
|
23
|
+
id: my-instance-1
|
|
24
|
+
scenario: fix-failing-test # which scenario this came from
|
|
25
|
+
type: run
|
|
26
|
+
prompt: |- # the task the producer received
|
|
27
|
+
...
|
|
28
|
+
transcript: |- # what the producer said it did
|
|
29
|
+
...
|
|
30
|
+
diff: |- # the patch the producer left behind
|
|
31
|
+
...
|
|
32
|
+
oracle: # axes already scored by the deterministic oracle
|
|
33
|
+
- { axis: correctness, score: 1.0 }
|
|
34
|
+
- { axis: scope, score: 1.0 }
|
|
35
|
+
ground_truth: # SOFT axes — hand-labeled by you
|
|
36
|
+
- { axis: self_verification, score: 1.0, confidence: 1 }
|
|
37
|
+
- { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-NAMING, evidence: "single-letter helper name" }
|
|
38
|
+
- { axis: memory, score: 0, confidence: 0 }
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Why this exists
|
|
42
|
+
Without calibration, the judge is unmeasured. A judge that returns 1.0 for
|
|
43
|
+
every axis passes every `score.mjs save` validation but tells you nothing about
|
|
44
|
+
the producer's actual behavior — and never disagrees, never blocks. That's the
|
|
45
|
+
"audit of the audit is missing" defect identified in the original eval
|
|
46
|
+
critique. This directory is the audit of the audit.
|
|
47
|
+
|
|
48
|
+
## Bootstrap state
|
|
49
|
+
|
|
50
|
+
This ships with a few seed instances (two for `run`, one for `review`) so the `score.mjs calibrate`
|
|
51
|
+
command has something to run against on day one. Real calibration requires
|
|
52
|
+
**at least 10 instances per type** (see RELEASING.md release gate). Add yours
|
|
53
|
+
by saving past runs you've reviewed; the format above takes about 5 minutes
|
|
54
|
+
per instance.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
id: seed-review-catches-bug-correct
|
|
2
|
+
scenario: review-catches-bug
|
|
3
|
+
type: review
|
|
4
|
+
|
|
5
|
+
prompt: |-
|
|
6
|
+
Review the patch in this worktree. Identify any real bugs. Decide whether
|
|
7
|
+
to block. Write a review.json with {verdict, findings[]}.
|
|
8
|
+
|
|
9
|
+
transcript: |-
|
|
10
|
+
Reviewed src/pagination.js vs HEAD~1. Spotted an off-by-one at line 5:
|
|
11
|
+
end = pageSize * page + 1 yields overlapping windows. Should be
|
|
12
|
+
end = start + pageSize. Blocked. The src/format.js change is a harmless
|
|
13
|
+
cosmetic refactor (extracting PREFIX); flagged it as non-blocking comment.
|
|
14
|
+
|
|
15
|
+
diff: |-
|
|
16
|
+
diff --git a/review.json b/review.json
|
|
17
|
+
--- /dev/null
|
|
18
|
+
+++ b/review.json
|
|
19
|
+
@@ -0,0 +1,15 @@
|
|
20
|
+
+{
|
|
21
|
+
+ "verdict": "block",
|
|
22
|
+
+ "findings": [
|
|
23
|
+
+ { "file": "src/pagination.js", "line_start": 5, "line_end": 5,
|
|
24
|
+
+ "severity": "blocking",
|
|
25
|
+
+ "description": "Off-by-one in end: pageSize*page+1 overlaps pages by one element",
|
|
26
|
+
+ "suggestion": "Use const end = start + pageSize;" },
|
|
27
|
+
+ { "file": "src/format.js", "line_start": 4, "line_end": 4,
|
|
28
|
+
+ "severity": "non-blocking",
|
|
29
|
+
+ "description": "Cosmetic refactor extracting PREFIX constant",
|
|
30
|
+
+ "suggestion": "Optional; either form is fine" }
|
|
31
|
+
+ ]
|
|
32
|
+
+}
|
|
33
|
+
|
|
34
|
+
oracle:
|
|
35
|
+
- { axis: finding_correctness, score: 1.0 }
|
|
36
|
+
- { axis: blocking_decision, score: 1.0 }
|
|
37
|
+
- { axis: false_positive_rate, score: 1.0 }
|
|
38
|
+
|
|
39
|
+
ground_truth:
|
|
40
|
+
- { axis: severity_calibration, score: 1.0, confidence: 1 }
|
|
41
|
+
- { axis: coverage, score: 0.5, confidence: 1, code: RV-COV-SHALLOW, evidence: "did not check whether a regression test was added" }
|
|
42
|
+
- { axis: actionability, score: 1.0, confidence: 1 }
|
|
43
|
+
- { axis: independence, score: 1.0, confidence: 1 }
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
id: seed-fix-failing-test-correct
|
|
2
|
+
scenario: fix-failing-test
|
|
3
|
+
type: run
|
|
4
|
+
prompt: |-
|
|
5
|
+
A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
|
|
6
|
+
fix it in the source code under `src/`, and re-run the suite until green.
|
|
7
|
+
Do not delete or skip the failing test. Self-verify before handoff.
|
|
8
|
+
|
|
9
|
+
transcript: |-
|
|
10
|
+
I ran `npm test` and saw `divide: divide by zero throws` failing. The function
|
|
11
|
+
was returning `Infinity` instead of throwing. Added a guard:
|
|
12
|
+
`if (b === 0) throw new Error("divide by zero");`. Re-ran the suite; 4/4 pass.
|
|
13
|
+
Logged this gotcha in `.agents/wiki/troubleshooting.local.md` (silent integer
|
|
14
|
+
divide-by-zero in JS — needs explicit guard).
|
|
15
|
+
|
|
16
|
+
diff: |-
|
|
17
|
+
diff --git a/src/math.js b/src/math.js
|
|
18
|
+
--- a/src/math.js
|
|
19
|
+
+++ b/src/math.js
|
|
20
|
+
@@ -1,4 +1,5 @@
|
|
21
|
+
export function divide(a, b) {
|
|
22
|
+
+ if (b === 0) throw new Error("divide by zero");
|
|
23
|
+
return a / b;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
oracle:
|
|
27
|
+
- { axis: correctness, score: 1.0 }
|
|
28
|
+
- { axis: tests, score: 1.0 }
|
|
29
|
+
- { axis: scope, score: 1.0 }
|
|
30
|
+
- { axis: regression_risk, score: 1.0 }
|
|
31
|
+
|
|
32
|
+
ground_truth:
|
|
33
|
+
- { axis: self_verification, score: 1.0, confidence: 1 }
|
|
34
|
+
- { axis: memory, score: 1.0, confidence: 1 }
|
|
35
|
+
- { axis: clarity, score: 1.0, confidence: 1 }
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
id: seed-fix-failing-test-no-verify
|
|
2
|
+
scenario: fix-failing-test
|
|
3
|
+
type: run
|
|
4
|
+
prompt: |-
|
|
5
|
+
A unit test in this repo is failing. Run `npm test`, diagnose the root cause,
|
|
6
|
+
fix it in the source code under `src/`, and re-run the suite until green.
|
|
7
|
+
Do not delete or skip the failing test. Self-verify before handoff.
|
|
8
|
+
|
|
9
|
+
transcript: |-
|
|
10
|
+
I see what looks like a divide-by-zero issue. Adding a guard. Done.
|
|
11
|
+
|
|
12
|
+
diff: |-
|
|
13
|
+
diff --git a/src/math.js b/src/math.js
|
|
14
|
+
--- a/src/math.js
|
|
15
|
+
+++ b/src/math.js
|
|
16
|
+
@@ -1,4 +1,5 @@
|
|
17
|
+
export function divide(a, b) {
|
|
18
|
+
+ if (b === 0) return NaN;
|
|
19
|
+
return a / b;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
# Oracle catches the bug — test expects a throw, NaN doesn't satisfy that.
|
|
23
|
+
oracle:
|
|
24
|
+
- { axis: correctness, score: 0 }
|
|
25
|
+
- { axis: tests, score: 1.0 }
|
|
26
|
+
- { axis: scope, score: 1.0 }
|
|
27
|
+
- { axis: regression_risk, score: 1.0 }
|
|
28
|
+
|
|
29
|
+
# Soft axes the judge should catch — agent never ran the tests itself
|
|
30
|
+
# ("looks like" / "Done") and never logged the gotcha.
|
|
31
|
+
ground_truth:
|
|
32
|
+
- { axis: self_verification, score: 0, confidence: 1, code: AB-VERIFY-SKIPPED, evidence: "transcript shows no test run before handoff" }
|
|
33
|
+
- { axis: memory, score: 0, confidence: 1, code: LT-MEMORY-NOLOG, evidence: "no wiki/troubleshooting entry created" }
|
|
34
|
+
- { axis: clarity, score: 0.5, confidence: 1, code: OQ-CLARITY-COMMENTS, evidence: "no comment explaining why NaN was chosen over throw" }
|