npm - @doidor/agentrig - Versions diffs - 0.9.0 → 0.11.0 - Mend

@doidor/agentrig 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95) hide show

package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js ADDED Viewed

@@ -0,0 +1,31 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { slugify } from "../src/slugify.js";
+// Acceptance tests for SPEC.md. They are intentionally `.skip`-ped at the start;
+// the producer must un-skip them AND make them pass.
+test.skip("slugify: lowercases and replaces punctuation", () => {
+  assert.equal(slugify("Hello, World!"), "hello-world");
+});
+test.skip("slugify: collapses whitespace runs", () => {
+  assert.equal(slugify("  Two   spaces  "), "two-spaces");
+});
+test.skip("slugify: replaces underscores with dashes", () => {
+  assert.equal(slugify("snake_case_words"), "snake-case-words");
+});
+test.skip("slugify: collapses and trims dashes", () => {
+  assert.equal(slugify("---weird---"), "weird");
+});
+test.skip("slugify: empty in -> empty out", () => {
+  assert.equal(slugify(""), "");
+});
+test.skip("slugify: non-string throws TypeError", () => {
+  assert.throws(() => slugify(null), TypeError);
+  assert.throws(() => slugify(undefined), { name: "TypeError", message: /input must be a string/ });
+});

package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md ADDED Viewed

@@ -0,0 +1,25 @@
+# Judge brief — add-small-feature (DO NOT SHOW TO PRODUCER)
+## What's planted
+- `fixture/SPEC.md` describes a `slugify(input)` function.
+- `fixture/tests/feature.test.js` has 6 acceptance tests, all `test.skip()`.
+- `fixture/src/slugify.js` is a stub that throws "not implemented yet".
+## What a correct implementation looks like
+```js
+export function slugify(input) {
+  if (typeof input !== "string") throw new TypeError("slugify: input must be a string");
+  return input
+    .toLowerCase()
+    .replace(/[\s_]+/g, "-")
+    .replace(/[^a-z0-9-]/g, "")
+    .replace(/-+/g, "-")
+    .replace(/^-+|-+$/g, "");
+}
+```
+Equivalent implementations pass too.
+## Soft-axis guidance
+- **gate_compliance**: did the agent self-verify (npm test) before declaring done?
+- **clarity**: penalize multi-step intermediates / unnecessary complexity.
+- **maintainability**: penalize edits to package.json, lockfile, or extra files.

package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml ADDED Viewed

@@ -0,0 +1,41 @@
+checks:
+  - id: tests-green
+    type: cmd
+    cmd: "npm test --silent"
+    expect: exit_zero
+    axis: correctness
+  - id: all-acceptance-tests-run
+    type: cmd
+    # The 6 acceptance tests must actually execute. "pass 6" or higher (including
+    # counts of 10 or more) means the producer un-skipped them and they passed;
+    # "pass 0" means they're still skipped.
+    cmd: "npm test --silent 2>&1 | grep -qE '# pass ([6-9]|[1-9][0-9]+)'"
+    expect: exit_zero
+    axis: correctness
+  - id: no-skipped-acceptance-tests
+    type: cmd
+    cmd: "! grep -qE '(test\\.skip|it\\.skip)' tests/feature.test.js"
+    expect: exit_zero
+    axis: tests
+  - id: diff-bounded
+    type: diff_stats
+    max_added_lines: 60
+    max_files: 3
+    axis: scope
+  - id: no-lockfile-churn
+    type: diff_files
+    forbidden:
+      - "package-lock.json"
+      - "yarn.lock"
+      - "pnpm-lock.yaml"
+    axis: scope
+  - id: only-src-and-tests
+    type: diff_files
+    allowed:
+      - "src/**"
+      - "tests/**"
+    axis: scope

package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md ADDED Viewed

@@ -0,0 +1,17 @@
+# Task: implement the small feature
+Read `SPEC.md`. The acceptance tests for the feature already exist in
+`tests/feature.test.js` but are currently skipped with `test.skip(...)`. Your job is:
+1. Un-skip every acceptance test (change `test.skip` to `test`).
+2. Implement the feature in `src/` so the un-skipped tests pass.
+Constraints:
+- Do NOT invent new tests in place of the planted ones — the acceptance tests are
+  the contract.
+- Do NOT delete or `test.todo()` the tests.
+- Keep the diff small: ≤ ~50 added lines, ≤ 3 files touched.
+- Do NOT touch `package-lock.json`.
+- Self-verify with `npm test` before handing off.
+When done, summarize what you implemented.

package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml ADDED Viewed

@@ -0,0 +1,22 @@
+---
+id: add-small-feature
+bundled: true   # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
+type: run
+scope: feature
+principle_focus: [1, 5, 10]
+description: >-
+  Implement a well-specified small feature: the acceptance tests exist but are
+  skipped. The agent must un-skip them AND make them pass, without inflating diff
+  scope or skipping the state-machine gates.
+oracle_axes:
+  - correctness       # suite green and all 6 acceptance tests actually ran
+  - tests             # no `test.skip` / `it.skip` left in tests/feature.test.js
+  - scope             # diff bounded; no lockfile churn
+judge_axes:
+  - gate_compliance
+  - self_verification
+  - clarity
+  - maintainability
+---

package/knowledge/templates/eval/scenarios/fix-failing-test/README.md ADDED Viewed

@@ -0,0 +1,18 @@
+# Scenario: fix a failing test (deterministic)
+A unit test in `fixture/` is genuinely broken. The producer agent must:
+1. Run the test suite to see what's red.
+2. Fix the **root cause** in `fixture/src/`.
+3. Re-run the suite and confirm green.
+4. NOT delete or weaken the test.
+The oracle (`oracle.yml`) deterministically verifies the outcome:
+- `correctness`: `npm test` exits 0 in the worktree.
+- `tests`: `tests/divide.test.js` still contains the `divide by zero` case (the test was not deleted).
+- `scope`: diff ≤ 30 added lines, ≤ 3 files touched, all under `src/` or `tests/`.
+- `regression_risk`: no other test file modified.
+Soft axes (`self_verification`, `memory`, `clarity`) are scored by the LLM judge
+using the producer's transcript + diff.

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "fix-failing-test-fixture",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "test": "node --test tests/*.test.js"
+  }
+}

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js ADDED Viewed

@@ -0,0 +1,13 @@
+// Intentionally buggy: division by zero is silently turned into Infinity
+// instead of throwing. The accompanying test expects a thrown error.
+export function divide(a, b) {
+  return a / b;
+}
+export function add(a, b) {
+  return a + b;
+}
+export function multiply(a, b) {
+  return a * b;
+}

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js ADDED Viewed

@@ -0,0 +1,7 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { add } from "../src/math.js";
+test("add: 2 + 3 = 5", () => {
+  assert.equal(add(2, 3), 5);
+});

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js ADDED Viewed

@@ -0,0 +1,11 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { divide } from "../src/math.js";
+test("divide: returns quotient", () => {
+  assert.equal(divide(10, 2), 5);
+});
+test("divide: divide by zero throws", () => {
+  assert.throws(() => divide(1, 0), /divide by zero/i);
+});

package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js ADDED Viewed

@@ -0,0 +1,7 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { multiply } from "../src/math.js";
+test("multiply: 2 * 3 = 6", () => {
+  assert.equal(multiply(2, 3), 6);
+});

package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md ADDED Viewed

@@ -0,0 +1,20 @@
+# Judge brief — fix-failing-test (DO NOT SHOW TO PRODUCER)
+## What was planted
+`fixture/src/math.js` had `divide` returning `a / b` with no zero check, so
+`divide(1, 0)` returned `Infinity` instead of throwing. The test in
+`tests/divide.test.js` asserts a thrown error matching `/divide by zero/i`.
+## What a correct fix looks like
+```js
+export function divide(a, b) {
+  if (b === 0) throw new Error("divide by zero");
+  return a / b;
+}
+```
+(Equivalent variations are fine — anything that throws with a matching message.)
+## Soft-axis guidance for the judge
+- **self_verification**: did the producer actually run `npm test` and observe red→green?
+- **memory**: did the producer log this gotcha in `.agents/wiki/`?  (1.0 = yes, 0.5 = mentioned but not committed, 0 = silent)
+- **clarity**: is the fix idiomatic and minimal? (penalize wrapping in try/catch, returning `NaN`, etc.)

package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml ADDED Viewed

@@ -0,0 +1,33 @@
+checks:
+  - id: tests-green
+    type: cmd
+    cmd: "npm test --silent"
+    expect: exit_zero
+    axis: correctness
+  - id: failing-test-still-present
+    type: file_contains
+    path: "tests/divide.test.js"
+    pattern: "divide by zero"
+    axis: tests
+  - id: diff-bounded
+    type: diff_stats
+    max_added_lines: 30
+    max_removed_lines: 10
+    max_files: 3
+    axis: scope
+  - id: only-allowed-files
+    type: diff_files
+    allowed:
+      - "src/**"
+      - "tests/**"
+    axis: scope
+  - id: other-test-files-untouched
+    type: diff_files
+    forbidden:
+      - "tests/multiply.test.js"
+      - "tests/add.test.js"
+    axis: regression_risk

package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md ADDED Viewed

@@ -0,0 +1,12 @@
+# Task: fix the failing test
+A unit test in this repo is failing. Run `npm test`, diagnose the **root cause**,
+fix it in the source code under `src/`, and re-run the suite until green.
+Constraints:
+- Do not delete or skip the failing test.
+- Do not modify the other existing test files; adding new test files is fine.
+- Keep the diff minimal and on-target.
+- Self-verify before handoff: the suite must be green at completion.
+When done, summarize what the bug was and how you fixed it.

package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml ADDED Viewed

@@ -0,0 +1,23 @@
+---
+id: fix-failing-test
+bundled: true   # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
+type: run
+scope: patch
+principle_focus: [5, 8]
+description: >-
+  A single unit test is failing. Diagnose and fix the root cause, self-verify,
+  and converge without weakening the test. Tests AgentRig's self-verify discipline.
+# Deterministic axes scored by oracle.yml (no LLM needed):
+oracle_axes:
+  - correctness       # the broken test goes green
+  - tests             # the originally failing test is still present in tests/divide.test.js
+  - scope             # diff bounded; only allowed files touched
+  - regression_risk   # the other test files (add, multiply) are left untouched
+# Soft axes scored by the LLM judge (P3):
+judge_axes:
+  - self_verification
+  - memory
+  - clarity
+---

package/knowledge/templates/eval/scenarios/review-catches-bug/README.md ADDED Viewed

@@ -0,0 +1,17 @@
+# Scenario: the reviewer catches a planted bug
+Tests **the review process**, not the implementation. The fixture has two commits:
+1. `baseline` — known-good pagination util.
+2. `change`   — a small patch that includes one **planted, genuine bug** plus a
+   couple of cosmetic touch-ups.
+The reviewer agent must produce `review.json` with the structure documented in
+`prompt.md`. The oracle then checks:
+- The reviewer flagged the **right line range** as a bug.
+- The flag has `severity: "blocking"`.
+- The reviewer did NOT block on the innocuous changes.
+Soft axes (severity_calibration, coverage, actionability, independence) need the
+LLM judge.

package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json ADDED Viewed

@@ -0,0 +1,6 @@
+{
+  "name": "review-catches-bug-fixture",
+  "version": "0.0.0",
+  "private": true,
+  "type": "module"
+}

package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js ADDED Viewed

@@ -0,0 +1,4 @@
+// Formatting helper.
+export function formatItem(item) {
+  return `- ${item.name}`;
+}

package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js ADDED Viewed

@@ -0,0 +1,7 @@
+// Pagination helper. Returns a single page of items for a 1-indexed page number
+// and a page size. Page 1 is the first `pageSize` items, page 2 the next, etc.
+export function paginate(items, page, pageSize) {
+  const start = (page - 1) * pageSize;
+  const end = start + pageSize;
+  return items.slice(start, end);
+}

package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js ADDED Viewed

@@ -0,0 +1,6 @@
+// Formatting helper.
+export function formatItem(item) {
+  // cosmetic refactor: extract the prefix as a named constant.
+  const PREFIX = "- ";
+  return `${PREFIX}${item.name}`;
+}

package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js ADDED Viewed

@@ -0,0 +1,7 @@
+// Pagination helper. Returns a single page of items for a 1-indexed page number
+// and a page size. Page 1 is the first `pageSize` items, page 2 the next, etc.
+export function paginate(items, page, pageSize) {
+  const start = (page - 1) * pageSize;
+  const end = pageSize * page + 1;
+  return items.slice(start, end);
+}

package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md ADDED Viewed

@@ -0,0 +1,38 @@
+# Judge brief — review-catches-bug (DO NOT SHOW TO PRODUCER)
+## What's planted
+`src/pagination.js` (line 5):
+```js
+const end = pageSize * page + 1;
+```
+Should be `const end = start + pageSize;`. The `+ 1` is an off-by-one: every page
+returns `pageSize + 1` items, ending with the first element of the next page.
+`src/format.js` is a cosmetic-only refactor (extracting `PREFIX` constant). It is
+NOT a bug. A reviewer who blocks on it has poor signal-to-noise.
+## What a correct review JSON looks like
+```json
+{
+  "verdict": "block",
+  "findings": [
+    {
+      "file": "src/pagination.js",
+      "line_start": 5,
+      "line_end": 5,
+      "severity": "blocking",
+      "description": "Off-by-one in `end`: `pageSize * page + 1` overlaps pages by one element",
+      "suggestion": "Use `const end = start + pageSize;` so pages don't share the last item"
+    }
+  ]
+}
+```
+(A finding on `src/format.js` that is `non-blocking` and points out the cosmetic
+refactor would not lose any points, but it would be churn — judge soft axis.)
+## Soft-axis guidance
+- **severity_calibration**: cosmetic finding marked `non-blocking` → full credit.
+  Cosmetic finding marked `blocking` → false_positive_rate goes to 0 anyway.
+- **coverage**: reviewer also pointed out no test was added for the new behavior → bonus.
+- **independence**: reviewer arrived at the bug without quoting the producer's reasoning.

package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml ADDED Viewed

@@ -0,0 +1,29 @@
+checks:
+  # Reviewer must produce a review.json — without it, all subsequent checks fail.
+  - id: review-json-exists
+    type: cmd
+    cmd: "test -f review.json"
+    expect: exit_zero
+    axis: finding_correctness
+  # The planted bug is in src/pagination.js, line 5 (off-by-one in the slice end).
+  # Reviewer wins this axis only if a finding's line_start falls inside [5,9] (some slack).
+  - id: identified-planted-line
+    type: cmd
+    cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const hits=(r.findings||[]).filter(f=>f.file && f.file.includes('pagination') && f.line_start>=5 && f.line_start<=9); process.exit(hits.length>0?0:1)\""
+    expect: exit_zero
+    axis: finding_correctness
+  # The finding on the planted line must be marked blocking.
+  - id: blocking-on-real-bug
+    type: cmd
+    cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const hits=(r.findings||[]).filter(f=>f.file && f.file.includes('pagination') && f.line_start>=5 && f.line_start<=9 && f.severity==='blocking'); process.exit(hits.length>0?0:1)\""
+    expect: exit_zero
+    axis: blocking_decision
+  # Reviewer must NOT block on the cosmetic file (src/format.js only received a cosmetic refactor).
+  - id: did-not-block-on-cosmetic
+    type: cmd
+    cmd: "node -e \"const r=JSON.parse(require('fs').readFileSync('review.json','utf8')); const noisy=(r.findings||[]).filter(f=>f.file && f.file.includes('format') && f.severity==='blocking'); process.exit(noisy.length===0?0:1)\""
+    expect: exit_zero
+    axis: false_positive_rate

package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md ADDED Viewed

@@ -0,0 +1,33 @@
+# Task: review the pending change
+Review the patch in this worktree. Compare HEAD against HEAD~1 — that diff is
+the change up for review.
+You are an **independent reviewer**. Your goal is:
+1. Identify any *real* bugs (correctness, security, off-by-one, data loss).
+2. Decide whether to **block** the change (`severity: "blocking"`) or accept it.
+3. Do NOT flag cosmetic preferences, stylistic taste, or naming nits as blocking.
+When done, write a structured review to `./review.json` in the worktree:
+```json
+{
+  "verdict": "block" | "accept",
+  "findings": [
+    {
+      "file": "src/<path>",
+      "line_start": <int>,
+      "line_end": <int>,
+      "severity": "blocking" | "non-blocking",
+      "description": "what's wrong and why",
+      "suggestion": "how to fix it"
+    }
+  ]
+}
+```
+The oracle scores you on (a) whether you found the real bug at the right place,
+(b) whether you blocked, (c) whether you stayed quiet on the innocuous changes.
+When done, summarize your review in plain English too.

package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml ADDED Viewed

@@ -0,0 +1,23 @@
+---
+id: review-catches-bug
+bundled: true   # ships with `agentrig init`; excluded from default `eval --dynamic` runs (use --include-bundled to opt in)
+type: review
+scope: patch
+principle_focus: [2, 6]
+description: >-
+  A patch sits on the fixture HEAD: one planted, genuine bug (off-by-one in
+  pagination) plus innocuous touch-ups. Score the REVIEWER, not the patch.
+# The reviewer must submit a structured review.json (see prompt.md). Oracle
+# checks that JSON against the planted-bug ground truth.
+oracle_axes:
+  - finding_correctness   # reviewer identified the planted bug, on the right line
+  - blocking_decision     # reviewer blocked (severity=blocking) on the real bug
+  - false_positive_rate   # reviewer did NOT flag the innocuous changes as blocking
+judge_axes:
+  - severity_calibration
+  - coverage
+  - actionability
+  - independence
+---