@doidor/agentrig 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +64 -29
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +36 -6
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/doctor.js +53 -8
  7. package/dist/commands/doctor.js.map +1 -1
  8. package/dist/commands/eval-dynamic.js +316 -0
  9. package/dist/commands/eval-dynamic.js.map +1 -0
  10. package/dist/commands/eval-scaffold.js +173 -0
  11. package/dist/commands/eval-scaffold.js.map +1 -0
  12. package/dist/commands/eval.js +184 -55
  13. package/dist/commands/eval.js.map +1 -1
  14. package/dist/commands/init.js +26 -5
  15. package/dist/commands/init.js.map +1 -1
  16. package/dist/commands/update.js +1 -1
  17. package/dist/commands/update.js.map +1 -1
  18. package/dist/core/audit.js +237 -9
  19. package/dist/core/audit.js.map +1 -1
  20. package/dist/core/install.js +28 -1
  21. package/dist/core/install.js.map +1 -1
  22. package/dist/core/model-family.js +31 -0
  23. package/dist/core/model-family.js.map +1 -0
  24. package/dist/core/scenario-runner.js +298 -0
  25. package/dist/core/scenario-runner.js.map +1 -0
  26. package/dist/prompts/index.js +121 -30
  27. package/dist/prompts/index.js.map +1 -1
  28. package/knowledge/PRINCIPLES.md +2 -2
  29. package/knowledge/manifest.json +16 -1
  30. package/knowledge/templates/AGENTS.md +7 -6
  31. package/knowledge/templates/agents/README.md +4 -4
  32. package/knowledge/templates/agents/developer.yml +1 -1
  33. package/knowledge/templates/agents/judge.yml +1 -1
  34. package/knowledge/templates/agents/reviewer.yml +1 -1
  35. package/knowledge/templates/agents/triager.yml +5 -4
  36. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  37. package/knowledge/templates/eval/RUBRIC.md +87 -64
  38. package/knowledge/templates/eval/axes.json +25 -25
  39. package/knowledge/templates/eval/calibration/README.md +54 -0
  40. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  41. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  42. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  43. package/knowledge/templates/eval/checks.json +88 -11
  44. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  45. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  46. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  47. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  48. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  49. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  50. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  51. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  52. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  53. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  54. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  55. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  56. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  57. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  58. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  59. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  60. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  61. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  62. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  63. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  64. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  65. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  66. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  67. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  68. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  69. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  70. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  71. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  72. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  73. package/knowledge/templates/eval/score.mjs +368 -42
  74. package/knowledge/templates/eval/static-audit.mjs +204 -17
  75. package/knowledge/templates/harness/state-machine.yml +18 -12
  76. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  77. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  78. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  79. package/package.json +4 -3
  80. package/knowledge/templates/eval/scenarios/README.md +0 -24
  81. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  82. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  83. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -0,0 +1,12 @@
1
+ # Task: fix the failing test
2
+
3
+ A unit test in this repo is failing. Run `npm test`, diagnose the **root cause**,
4
+ fix it in the source code under `src/`, and re-run the suite until green.
5
+
6
+ Constraints:
7
+ - Do not delete or skip the failing test.
8
+ - Do not modify the other test files; adding new test files is fine.
9
+ - Keep the diff minimal and on-target.
10
+ - Self-verify before handoff: the suite must be green at completion.
11
+
12
+ When done, summarize what the bug was and how you fixed it.
@@ -0,0 +1,23 @@
1
+ ---
2
+ id: fix-failing-test
3
+ bundled: true # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
4
+ type: run
5
+ scope: patch
6
+ principle_focus: [5, 8]
7
+ description: >-
8
+ A single unit test is failing. Diagnose and fix the root cause, self-verify,
9
+ and converge without weakening the test. Tests AgentRig's self-verify discipline.
10
+
11
+ # Deterministic axes scored by oracle.yml (no LLM needed):
12
+ oracle_axes:
13
+ - correctness # the broken test goes green
14
+ - tests # other tests still green
15
+ - scope # diff bounded; only allowed files touched
16
+ - regression_risk # no test files deleted
17
+
18
+ # Soft axes scored by the LLM judge (P3):
19
+ judge_axes:
20
+ - self_verification
21
+ - memory
22
+ - clarity
23
+ ---
@@ -0,0 +1,17 @@
1
+ # Scenario: the reviewer catches a planted bug
2
+
3
+ Tests **the review process**, not the implementation. The fixture has two commits:
4
+
5
+ 1. `baseline` — known-good pagination util.
6
+ 2. `change` — a small patch that includes one **planted, genuine bug** plus a
7
+ couple of cosmetic touch-ups.
8
+
9
+ The reviewer agent must produce `review.json` with the structure documented in
10
+ `prompt.md`. The oracle then checks:
11
+
12
+ - The reviewer flagged the **right line range** as a bug.
13
+ - The flag has `severity: "blocking"`.
14
+ - The reviewer did NOT block on the innocuous changes.
15
+
16
+ Soft axes (severity_calibration, coverage, actionability, independence) need the
17
+ LLM judge.
@@ -0,0 +1,6 @@
1
+ {
2
+ "name": "review-catches-bug-fixture",
3
+ "version": "0.0.0",
4
+ "private": true,
5
+ "type": "module"
6
+ }
@@ -0,0 +1,4 @@
1
+ // Formatting helper.
2
+ export function formatItem(item) {
3
+ return `- ${item.name}`;
4
+ }
@@ -0,0 +1,7 @@
1
+ // Pagination helper. Returns a single page of items for a 1-indexed page number
2
+ // and a page size. Page 1 is the first `pageSize` items, page 2 the next, etc.
3
+ export function paginate(items, page, pageSize) {
4
+ const start = (page - 1) * pageSize;
5
+ const end = start + pageSize;
6
+ return items.slice(start, end);
7
+ }
@@ -0,0 +1,6 @@
1
+ // Formatting helper.
2
+ export function formatItem(item) {
3
+ // cosmetic refactor: extract the prefix as a named constant.
4
+ const PREFIX = "- ";
5
+ return `${PREFIX}${item.name}`;
6
+ }
@@ -0,0 +1,7 @@
1
+ // Pagination helper. Returns a single page of items for a 1-indexed page number
2
+ // and a page size. Page 1 is the first `pageSize` items, page 2 the next, etc.
3
+ export function paginate(items, page, pageSize) {
4
+ const start = (page - 1) * pageSize;
5
+ const end = pageSize * page + 1;
6
+ return items.slice(start, end);
7
+ }
@@ -0,0 +1,38 @@
1
+ # Judge brief — review-catches-bug (DO NOT SHOW TO PRODUCER)
2
+
3
+ ## What's planted
4
+ `src/pagination.js` (line 5):
5
+ ```js
6
+ const end = pageSize * page + 1;
7
+ ```
8
+ Should be `const end = start + pageSize;`. The `+ 1` produces an off-by-one:
9
+ each page returns up to `pageSize + 1` items, so its extra last element reappears as the first element of the next page.
10
+
11
+ `src/format.js` is a cosmetic-only refactor (extracting `PREFIX` constant). It is
12
+ NOT a bug. A reviewer who blocks on it has poor signal-to-noise.
13
+
14
+ ## What a correct review JSON looks like
15
+ ```json
16
+ {
17
+ "verdict": "block",
18
+ "findings": [
19
+ {
20
+ "file": "src/pagination.js",
21
+ "line_start": 5,
22
+ "line_end": 5,
23
+ "severity": "blocking",
24
+ "description": "Off-by-one in `end`: `pageSize * page + 1` overlaps pages by one element",
25
+ "suggestion": "Use `const end = start + pageSize;` so pages don't share the last item"
26
+ }
27
+ ]
28
+ }
29
+ ```
30
+
31
+ (A finding on `src/format.js` that is `non-blocking` and points out the cosmetic
32
+ refactor would not lose any points, but it would be churn — judge soft axis.)
33
+
34
+ ## Soft-axis guidance
35
+ - **severity_calibration**: cosmetic finding marked `non-blocking` → full credit.
36
+ Cosmetic finding marked `blocking` → false_positive_rate goes to 0 anyway.
37
+ - **coverage**: reviewer also pointed out no test was added for the new behavior → bonus.
38
+ - **independence**: reviewer arrived at the bug without quoting the producer's reasoning.
@@ -0,0 +1,29 @@
1
+ checks:
2
+ # Reviewer must produce a review.json — without it, all subsequent checks fail.
3
+ - id: review-json-exists
4
+ type: cmd
5
+ cmd: "test -f review.json"
6
+ expect: exit_zero
7
+ axis: finding_correctness
8
+
9
+ # The planted bug is in src/pagination.js, line 5 (off-by-one in the slice end).
10
+ # Reviewer wins this axis only if a finding's line_start falls inside [5,9] (some slack after the bug line).
11
+ - id: identified-planted-line
12
+ type: cmd
13
+ cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const hits=(r.findings||[]).filter(f=>f.file && f.file.includes('pagination') && f.line_start>=5 && f.line_start<=9); process.exit(hits.length>0?0:1)\""
14
+ expect: exit_zero
15
+ axis: finding_correctness
16
+
17
+ # The finding on the planted line must be marked blocking.
18
+ - id: blocking-on-real-bug
19
+ type: cmd
20
+ cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const hits=(r.findings||[]).filter(f=>f.file && f.file.includes('pagination') && f.line_start>=5 && f.line_start<=9 && f.severity==='blocking'); process.exit(hits.length>0?0:1)\""
21
+ expect: exit_zero
22
+ axis: blocking_decision
23
+
24
+ # Reviewer must NOT block on the cosmetic file (src/format.js received only a cosmetic refactor).
25
+ - id: did-not-block-on-cosmetic
26
+ type: cmd
27
+ cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const noisy=(r.findings||[]).filter(f=>f.file && f.file.includes('format') && f.severity==='blocking'); process.exit(noisy.length===0?0:1)\""
28
+ expect: exit_zero
29
+ axis: false_positive_rate
@@ -0,0 +1,33 @@
1
+ # Task: review the pending change
2
+
3
+ Review the patch in this worktree. Compare HEAD against HEAD~1 — that diff is
4
+ the change up for review.
5
+
6
+ You are an **independent reviewer**. Your goal is:
7
+
8
+ 1. Identify any *real* bugs (correctness, security, off-by-one, data loss).
9
+ 2. Decide whether to **block** the change (`verdict: "block"`, with the real bug's finding marked `severity: "blocking"`) or accept it (`verdict: "accept"`).
10
+ 3. Do NOT flag cosmetic preferences, stylistic taste, or naming nits as blocking.
11
+
12
+ When done, write a structured review to `./review.json` in the worktree:
13
+
14
+ ```json
15
+ {
16
+ "verdict": "block" | "accept",
17
+ "findings": [
18
+ {
19
+ "file": "src/<path>",
20
+ "line_start": <int>,
21
+ "line_end": <int>,
22
+ "severity": "blocking" | "non-blocking",
23
+ "description": "what's wrong and why",
24
+ "suggestion": "how to fix it"
25
+ }
26
+ ]
27
+ }
28
+ ```
29
+
30
+ The oracle scores you on (a) whether you found the real bug at the right place,
31
+ (b) whether you blocked, (c) whether you stayed quiet on the innocuous changes.
32
+
33
+ When done, summarize your review in plain English too.
@@ -0,0 +1,23 @@
1
+ ---
2
+ id: review-catches-bug
3
+ bundled: true # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
4
+ type: review
5
+ scope: patch
6
+ principle_focus: [2, 6]
7
+ description: >-
8
+ A patch sits on the fixture HEAD: one planted, genuine bug (off-by-one in
9
+ pagination) plus innocuous touch-ups. Score the REVIEWER, not the patch.
10
+
11
+ # The reviewer must submit a structured review.json (see prompt.md). Oracle
12
+ # checks that JSON against the planted-bug ground truth.
13
+ oracle_axes:
14
+ - finding_correctness # reviewer identified the planted bug, on the right line
15
+ - blocking_decision # reviewer blocked (severity=blocking) on the real bug
16
+ - false_positive_rate # reviewer did NOT flag the innocuous changes as blocking
17
+
18
+ judge_axes:
19
+ - severity_calibration
20
+ - coverage
21
+ - actionability
22
+ - independence
23
+ ---