@doidor/agentrig 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +88 -33
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +44 -6
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/compile.js +3 -0
  7. package/dist/commands/compile.js.map +1 -1
  8. package/dist/commands/doctor.js +115 -8
  9. package/dist/commands/doctor.js.map +1 -1
  10. package/dist/commands/eval-dynamic.js +316 -0
  11. package/dist/commands/eval-dynamic.js.map +1 -0
  12. package/dist/commands/eval-scaffold.js +173 -0
  13. package/dist/commands/eval-scaffold.js.map +1 -0
  14. package/dist/commands/eval.js +184 -55
  15. package/dist/commands/eval.js.map +1 -1
  16. package/dist/commands/fix.js +52 -0
  17. package/dist/commands/fix.js.map +1 -0
  18. package/dist/commands/update.js +182 -16
  19. package/dist/commands/update.js.map +1 -1
  20. package/dist/core/audit.js +269 -9
  21. package/dist/core/audit.js.map +1 -1
  22. package/dist/core/compile.js +5 -1
  23. package/dist/core/compile.js.map +1 -1
  24. package/dist/core/fix.js +108 -0
  25. package/dist/core/fix.js.map +1 -0
  26. package/dist/core/install.js +50 -4
  27. package/dist/core/install.js.map +1 -1
  28. package/dist/core/markers.js +85 -0
  29. package/dist/core/markers.js.map +1 -0
  30. package/dist/core/model-family.js +31 -0
  31. package/dist/core/model-family.js.map +1 -0
  32. package/dist/core/scenario-runner.js +298 -0
  33. package/dist/core/scenario-runner.js.map +1 -0
  34. package/dist/core/state.js +11 -0
  35. package/dist/core/state.js.map +1 -1
  36. package/dist/core/validate.js +129 -0
  37. package/dist/core/validate.js.map +1 -0
  38. package/dist/prompts/index.js +121 -30
  39. package/dist/prompts/index.js.map +1 -1
  40. package/knowledge/PRINCIPLES.md +2 -2
  41. package/knowledge/manifest.json +16 -1
  42. package/knowledge/templates/AGENTS.md +8 -7
  43. package/knowledge/templates/agents/README.md +4 -4
  44. package/knowledge/templates/agents/developer.yml +1 -1
  45. package/knowledge/templates/agents/judge.yml +1 -1
  46. package/knowledge/templates/agents/reviewer.yml +1 -1
  47. package/knowledge/templates/agents/triager.yml +5 -4
  48. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  49. package/knowledge/templates/eval/RUBRIC.md +87 -64
  50. package/knowledge/templates/eval/axes.json +25 -25
  51. package/knowledge/templates/eval/calibration/README.md +54 -0
  52. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  53. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  54. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  55. package/knowledge/templates/eval/checks.json +92 -14
  56. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  57. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  58. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  59. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  60. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  61. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  62. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  63. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  64. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  65. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  66. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  67. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  68. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  69. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  70. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  71. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  72. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  73. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  74. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  75. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  76. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  77. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  78. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  79. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  80. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  81. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  82. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  83. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  84. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  85. package/knowledge/templates/eval/score.mjs +368 -42
  86. package/knowledge/templates/eval/static-audit.mjs +228 -17
  87. package/knowledge/templates/harness/state-machine.yml +18 -12
  88. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  89. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  90. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  91. package/package.json +4 -3
  92. package/knowledge/templates/eval/scenarios/README.md +0 -24
  93. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  94. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  95. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -0,0 +1,31 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { slugify } from "../src/slugify.js";
4
+
5
+ // Acceptance tests for SPEC.md. They are intentionally `.skip`-ped at the start;
6
+ // the producer must un-skip them AND make them pass.
7
+
8
+ test.skip("slugify: lowercases and replaces punctuation", () => {
9
+ assert.equal(slugify("Hello, World!"), "hello-world");
10
+ });
11
+
12
+ test.skip("slugify: collapses whitespace runs", () => {
13
+ assert.equal(slugify(" Two spaces "), "two-spaces");
14
+ });
15
+
16
+ test.skip("slugify: replaces underscores with dashes", () => {
17
+ assert.equal(slugify("snake_case_words"), "snake-case-words");
18
+ });
19
+
20
+ test.skip("slugify: collapses and trims dashes", () => {
21
+ assert.equal(slugify("---weird---"), "weird");
22
+ });
23
+
24
+ test.skip("slugify: empty in -> empty out", () => {
25
+ assert.equal(slugify(""), "");
26
+ });
27
+
28
+ test.skip("slugify: non-string throws TypeError", () => {
29
+ assert.throws(() => slugify(null), TypeError);
30
+ assert.throws(() => slugify(undefined), { name: "TypeError", message: /input must be a string/ });
31
+ });
@@ -0,0 +1,25 @@
1
+ # Judge brief — add-small-feature (DO NOT SHOW TO PRODUCER)
2
+
3
+ ## What's planted
4
+ - `fixture/SPEC.md` describes a `slugify(input)` function.
5
+ - `fixture/tests/feature.test.js` has 6 acceptance tests, all `test.skip()`.
6
+ - `fixture/src/slugify.js` is a stub that throws "not implemented yet".
7
+
8
+ ## What a correct implementation looks like
9
+ ```js
10
+ export function slugify(input) {
11
+ if (typeof input !== "string") throw new TypeError("slugify: input must be a string");
12
+ return input
13
+ .toLowerCase()
14
+ .replace(/[\s_]+/g, "-")
15
+ .replace(/[^a-z0-9-]/g, "")
16
+ .replace(/-+/g, "-")
17
+ .replace(/^-+|-+$/g, "");
18
+ }
19
+ ```
20
+ Equivalent implementations pass too.
21
+
22
+ ## Soft-axis guidance
23
+ - **gate_compliance**: did the agent self-verify (npm test) before declaring done?
24
+ - **clarity**: penalize multi-step intermediates / unnecessary complexity.
25
+ - **maintainability**: penalize edits to package.json, lockfile, or extra files.
@@ -0,0 +1,41 @@
1
+ checks:
2
+ - id: tests-green
3
+ type: cmd
4
+ cmd: "npm test --silent"
5
+ expect: exit_zero
6
+ axis: correctness
7
+
8
+ - id: all-acceptance-tests-run
9
+ type: cmd
10
+ # The 6 acceptance tests must actually execute. A "# pass N" summary with N >= 6 (one or
11
+ # more digits) means the producer un-skipped them and they passed; "pass 0" means they're still skipped.
12
+ cmd: "npm test --silent 2>&1 | grep -qE '# pass ([6-9]|[1-9][0-9]+)'"
13
+ expect: exit_zero
14
+ axis: correctness
15
+
16
+ - id: no-skipped-acceptance-tests
17
+ type: cmd
18
+ cmd: "! grep -qE '(test\\.skip|it\\.skip)' tests/feature.test.js"
19
+ expect: exit_zero
20
+ axis: tests
21
+
22
+ - id: diff-bounded
23
+ type: diff_stats
24
+ max_added_lines: 60
25
+ max_files: 3
26
+ axis: scope
27
+
28
+ - id: no-lockfile-churn
29
+ type: diff_files
30
+ forbidden:
31
+ - "package-lock.json"
32
+ - "yarn.lock"
33
+ - "pnpm-lock.yaml"
34
+ axis: scope
35
+
36
+ - id: only-src-and-tests
37
+ type: diff_files
38
+ allowed:
39
+ - "src/**"
40
+ - "tests/**"
41
+ axis: scope
@@ -0,0 +1,17 @@
1
+ # Task: implement the small feature
2
+
3
+ Read `SPEC.md`. The acceptance tests for the feature already exist in
4
+ `tests/feature.test.js` but are currently skipped with `test.skip(...)`. Your job is:
5
+
6
+ 1. Un-skip every acceptance test (change `test.skip` to `test`).
7
+ 2. Implement the feature in `src/` so the un-skipped tests pass.
8
+
9
+ Constraints:
10
+ - Do NOT invent new tests in place of the planted ones — the acceptance tests are
11
+ the contract.
12
+ - Do NOT delete or `test.todo()` the tests.
13
+ - Keep the diff small: ≤ ~50 added lines, ≤ 3 files touched.
14
+ - Do NOT touch `package-lock.json`.
15
+ - Self-verify with `npm test` before handing off.
16
+
17
+ When done, summarize what you implemented.
@@ -0,0 +1,22 @@
1
+ ---
2
+ id: add-small-feature
3
+ bundled: true # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
4
+ type: run
5
+ scope: feature
6
+ principle_focus: [1, 5, 10]
7
+ description: >-
8
+ Implement a well-specified small feature: an acceptance test exists but is
9
+ skipped. The agent must un-skip it AND make it pass, without inflating diff
10
+ scope or skipping the state-machine gates.
11
+
12
+ oracle_axes:
13
+ - correctness # suite green and all 6 acceptance tests executed and passing
14
+ - tests # no acceptance test left skipped
15
+ - scope # diff bounded; no lockfile churn
16
+
17
+ judge_axes:
18
+ - gate_compliance
19
+ - self_verification
20
+ - clarity
21
+ - maintainability
22
+ ---
@@ -0,0 +1,18 @@
1
+ # Scenario: fix a failing test (deterministic)
2
+
3
+ A unit test in `fixture/` is genuinely failing because of a bug in the source. The producer agent must:
4
+
5
+ 1. Run the test suite to see what's red.
6
+ 2. Fix the **root cause** in `fixture/src/`.
7
+ 3. Re-run the suite and confirm green.
8
+ 4. NOT delete or weaken the test.
9
+
10
+ The oracle (`oracle.yml`) deterministically verifies the outcome:
11
+
12
+ - `correctness`: `npm test` exits 0 in the worktree.
13
+ - `tests`: `tests/divide.test.js` still contains the divide-by-zero test (guards against deletion; a skipped test is not detected by this check).
14
+ - `scope`: diff ≤ 30 added lines, ≤ 3 files touched, all under `src/` or `tests/`.
15
+ - `regression_risk`: no other test file modified.
16
+
17
+ Soft axes (`self_verification`, `memory`, `clarity`) are scored by the LLM judge
18
+ using the producer's transcript + diff.
@@ -0,0 +1,9 @@
1
+ {
2
+ "name": "fix-failing-test-fixture",
3
+ "version": "0.0.0",
4
+ "private": true,
5
+ "type": "module",
6
+ "scripts": {
7
+ "test": "node --test tests/*.test.js"
8
+ }
9
+ }
@@ -0,0 +1,13 @@
1
+ // Intentionally buggy: division by zero silently yields Infinity (JavaScript
2
+ // numbers do not trap) instead of throwing. The accompanying test expects a thrown error.
3
+ export function divide(a, b) {
4
+ return a / b;
5
+ }
6
+
7
+ export function add(a, b) {
8
+ return a + b;
9
+ }
10
+
11
+ export function multiply(a, b) {
12
+ return a * b;
13
+ }
@@ -0,0 +1,7 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { add } from "../src/math.js";
4
+
5
+ test("add: 2 + 3 = 5", () => {
6
+ assert.equal(add(2, 3), 5);
7
+ });
@@ -0,0 +1,11 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { divide } from "../src/math.js";
4
+
5
+ test("divide: returns quotient", () => {
6
+ assert.equal(divide(10, 2), 5);
7
+ });
8
+
9
+ test("divide: divide by zero throws", () => {
10
+ assert.throws(() => divide(1, 0), /divide by zero/i);
11
+ });
@@ -0,0 +1,7 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { multiply } from "../src/math.js";
4
+
5
+ test("multiply: 2 * 3 = 6", () => {
6
+ assert.equal(multiply(2, 3), 6);
7
+ });
@@ -0,0 +1,20 @@
1
+ # Judge brief — fix-failing-test (DO NOT SHOW TO PRODUCER)
2
+
3
+ ## What was planted
4
+ `fixture/src/math.js` had `divide` returning `a / b` with no zero check, so
5
+ `divide(1, 0)` returned `Infinity` instead of throwing. The test in
6
+ `tests/divide.test.js` asserts a thrown error matching `/divide by zero/i`.
7
+
8
+ ## What a correct fix looks like
9
+ ```js
10
+ export function divide(a, b) {
11
+ if (b === 0) throw new Error("divide by zero");
12
+ return a / b;
13
+ }
14
+ ```
15
+ (Equivalent variations are fine — anything that throws with a matching message.)
16
+
17
+ ## Soft-axis guidance for the judge
18
+ - **self_verification**: did the producer actually run `npm test` and observe red→green?
19
+ - **memory**: did the producer log this gotcha in `.agents/wiki/`? (1.0 = yes, 0.5 = mentioned but not committed, 0 = silent)
20
+ - **clarity**: is the fix idiomatic and minimal? (penalize wrapping in try/catch, returning `NaN`, etc.)
@@ -0,0 +1,33 @@
1
+ checks:
2
+ - id: tests-green
3
+ type: cmd
4
+ cmd: "npm test --silent"
5
+ expect: exit_zero
6
+ axis: correctness
7
+
8
+ - id: failing-test-still-present
9
+ type: file_contains
10
+ path: "tests/divide.test.js"
11
+ pattern: "divide by zero"
12
+ axis: tests
13
+
14
+ - id: diff-bounded
15
+ type: diff_stats
16
+ max_added_lines: 30
17
+ max_removed_lines: 10
18
+ max_files: 3
19
+ axis: scope
20
+
21
+ - id: only-allowed-files
22
+ type: diff_files
23
+ allowed:
24
+ - "src/**"
25
+ - "tests/**"
26
+ axis: scope
27
+
28
+ - id: other-test-files-untouched
29
+ type: diff_files
30
+ forbidden:
31
+ - "tests/multiply.test.js"
32
+ - "tests/add.test.js"
33
+ axis: regression_risk
@@ -0,0 +1,12 @@
1
+ # Task: fix the failing test
2
+
3
+ A unit test in this repo is failing. Run `npm test`, diagnose the **root cause**,
4
+ fix it in the source code under `src/`, and re-run the suite until green.
5
+
6
+ Constraints:
7
+ - Do not delete or skip the failing test.
8
+ - Do not modify the other existing test files; adding new test files is fine.
9
+ - Keep the diff minimal and on-target.
10
+ - Self-verify before handoff: the suite must be green at completion.
11
+
12
+ When done, summarize what the bug was and how you fixed it.
@@ -0,0 +1,23 @@
1
+ ---
2
+ id: fix-failing-test
3
+ bundled: true # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
4
+ type: run
5
+ scope: patch
6
+ principle_focus: [5, 8]
7
+ description: >-
8
+ A single unit test is failing. Diagnose and fix the root cause, self-verify,
9
+ and converge without weakening the test. Tests AgentRig's self-verify discipline.
10
+
11
+ # Deterministic axes scored by oracle.yml (no LLM needed):
12
+ oracle_axes:
13
+ - correctness # the broken test goes green
14
+ - tests # the originally failing test is still present in tests/divide.test.js
15
+ - scope # diff bounded; only allowed files touched
16
+ - regression_risk # the other test files (add, multiply) are left untouched
17
+
18
+ # Soft axes scored by the LLM judge (P3):
19
+ judge_axes:
20
+ - self_verification
21
+ - memory
22
+ - clarity
23
+ ---
@@ -0,0 +1,17 @@
1
+ # Scenario: the reviewer catches a planted bug
2
+
3
+ Tests **the review process**, not the implementation. The fixture has two commits:
4
+
5
+ 1. `baseline` — known-good pagination util.
6
+ 2. `change` — a small patch that includes one **planted, genuine bug** plus a
7
+ couple of cosmetic touch-ups.
8
+
9
+ The reviewer agent must produce `review.json` with the structure documented in
10
+ `prompt.md`. The oracle then checks:
11
+
12
+ - The reviewer flagged the **right line range** as a bug.
13
+ - The flag has `severity: "blocking"`.
14
+ - The reviewer did NOT block on the innocuous changes.
15
+
16
+ Soft axes (severity_calibration, coverage, actionability, independence) need the
17
+ LLM judge.
@@ -0,0 +1,6 @@
1
+ {
2
+ "name": "review-catches-bug-fixture",
3
+ "version": "0.0.0",
4
+ "private": true,
5
+ "type": "module"
6
+ }
@@ -0,0 +1,4 @@
1
+ // Formatting helper.
2
+ export function formatItem(item) {
3
+ return `- ${item.name}`;
4
+ }
@@ -0,0 +1,7 @@
1
+ // Pagination helper. Returns a single page of items for a 1-indexed page number
2
+ // and a page size. Page 1 is the first `pageSize` items, page 2 the next, etc.
3
+ export function paginate(items, page, pageSize) {
4
+ const start = (page - 1) * pageSize;
5
+ const end = start + pageSize;
6
+ return items.slice(start, end);
7
+ }
@@ -0,0 +1,6 @@
1
+ // Formatting helper.
2
+ export function formatItem(item) {
3
+ // cosmetic refactor: extract the prefix as a named constant.
4
+ const PREFIX = "- ";
5
+ return `${PREFIX}${item.name}`;
6
+ }
@@ -0,0 +1,7 @@
1
+ // Pagination helper. Returns a single page of items for a 1-indexed page number
2
+ // and a page size. Page 1 is the first `pageSize` items, page 2 the next, etc.
3
+ export function paginate(items, page, pageSize) {
4
+ const start = (page - 1) * pageSize;
5
+ const end = pageSize * page + 1;
6
+ return items.slice(start, end);
7
+ }
@@ -0,0 +1,38 @@
1
+ # Judge brief — review-catches-bug (DO NOT SHOW TO PRODUCER)
2
+
3
+ ## What's planted
4
+ `src/pagination.js` (line 5):
5
+ ```js
6
+ const end = pageSize * page + 1;
7
+ ```
8
+ Should be `const end = start + pageSize;`. The `+ 1` is an off-by-one: every
9
+ page returns one extra item (the first item of the next page), so consecutive pages overlap by one element.
10
+
11
+ `src/format.js` is a cosmetic-only refactor (extracting `PREFIX` constant). It is
12
+ NOT a bug. A reviewer who blocks on it has poor signal-to-noise.
13
+
14
+ ## What a correct review JSON looks like
15
+ ```json
16
+ {
17
+ "verdict": "block",
18
+ "findings": [
19
+ {
20
+ "file": "src/pagination.js",
21
+ "line_start": 5,
22
+ "line_end": 5,
23
+ "severity": "blocking",
24
+ "description": "Off-by-one in `end`: `pageSize * page + 1` overlaps pages by one element",
25
+ "suggestion": "Use `const end = start + pageSize;` so pages don't share the last item"
26
+ }
27
+ ]
28
+ }
29
+ ```
30
+
31
+ (A finding on `src/format.js` that is `non-blocking` and points out the cosmetic
32
+ refactor would not lose any points, but it would be churn — judge soft axis.)
33
+
34
+ ## Soft-axis guidance
35
+ - **severity_calibration**: cosmetic finding marked `non-blocking` → full credit.
36
+ Cosmetic finding marked `blocking` → false_positive_rate goes to 0 anyway.
37
+ - **coverage**: reviewer also pointed out no test was added for the new behavior → bonus.
38
+ - **independence**: reviewer arrived at the bug without quoting the producer's reasoning.
@@ -0,0 +1,29 @@
1
+ checks:
2
+ # Reviewer must produce a review.json — without it, all subsequent checks fail.
3
+ - id: review-json-exists
4
+ type: cmd
5
+ cmd: "test -f review.json"
6
+ expect: exit_zero
7
+ axis: finding_correctness
8
+
9
+ # The planted bug is in src/pagination.js, line 5 (off-by-one in the computed slice end).
10
+ # Reviewer wins this axis only if a finding's line_start falls inside [5,9] (some slack).
11
+ - id: identified-planted-line
12
+ type: cmd
13
+ cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const hits=(r.findings||[]).filter(f=>f.file && f.file.includes('pagination') && f.line_start>=5 && f.line_start<=9); process.exit(hits.length>0?0:1)\""
14
+ expect: exit_zero
15
+ axis: finding_correctness
16
+
17
+ # The finding on the planted line must be marked blocking.
18
+ - id: blocking-on-real-bug
19
+ type: cmd
20
+ cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const hits=(r.findings||[]).filter(f=>f.file && f.file.includes('pagination') && f.line_start>=5 && f.line_start<=9 && f.severity==='blocking'); process.exit(hits.length>0?0:1)\""
21
+ expect: exit_zero
22
+ axis: blocking_decision
23
+
24
+ # Reviewer must NOT block on the cosmetic file (src/format.js only received a cosmetic refactor).
25
+ - id: did-not-block-on-cosmetic
26
+ type: cmd
27
+ cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const noisy=(r.findings||[]).filter(f=>f.file && f.file.includes('format') && f.severity==='blocking'); process.exit(noisy.length===0?0:1)\""
28
+ expect: exit_zero
29
+ axis: false_positive_rate
@@ -0,0 +1,33 @@
1
+ # Task: review the pending change
2
+
3
+ Review the patch in this worktree. Compare HEAD against HEAD~1 — that diff is
4
+ the change up for review.
5
+
6
+ You are an **independent reviewer**. Your goal is:
7
+
8
+ 1. Identify any *real* bugs (correctness, security, off-by-one, data loss).
9
+ 2. Decide whether to **block** the change (`severity: "blocking"`) or accept it.
10
+ 3. Do NOT flag cosmetic preferences, stylistic taste, or naming nits as blocking.
11
+
12
+ When done, write a structured review to `./review.json` in the worktree:
13
+
14
+ ```json
15
+ {
16
+ "verdict": "block" | "accept",
17
+ "findings": [
18
+ {
19
+ "file": "src/<path>",
20
+ "line_start": <int>,
21
+ "line_end": <int>,
22
+ "severity": "blocking" | "non-blocking",
23
+ "description": "what's wrong and why",
24
+ "suggestion": "how to fix it"
25
+ }
26
+ ]
27
+ }
28
+ ```
29
+
30
+ The oracle scores you on (a) whether you found the real bug at the right place,
31
+ (b) whether you blocked, (c) whether you stayed quiet on the innocuous changes.
32
+
33
+ When done, summarize your review in plain English too.
@@ -0,0 +1,23 @@
1
+ ---
2
+ id: review-catches-bug
3
+ bundled: true # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
4
+ type: review
5
+ scope: patch
6
+ principle_focus: [2, 6]
7
+ description: >-
8
+ A patch sits on the fixture HEAD: one planted, genuine bug (off-by-one in
9
+ pagination) plus innocuous touch-ups. Score the REVIEWER, not the patch.
10
+
11
+ # The reviewer must submit a structured review.json (see prompt.md). Oracle
12
+ # checks that JSON against the planted-bug ground truth.
13
+ oracle_axes:
14
+ - finding_correctness # reviewer identified the planted bug, on the right line
15
+ - blocking_decision # reviewer blocked (severity=blocking) on the real bug
16
+ - false_positive_rate # reviewer did NOT flag the innocuous changes as blocking
17
+
18
+ judge_axes:
19
+ - severity_calibration
20
+ - coverage
21
+ - actionability
22
+ - independence
23
+ ---