npm - devlyn-cli - Versions diffs - 2.2.2 → 2.3.1 - Mend

devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json ADDED Viewed

@@ -0,0 +1,57 @@
+{
+  "verification_commands": [
+    {
+      "cmd": "node --test tests/cli.test.js",
+      "exit_code": 0,
+      "stdout_contains": [],
+      "stdout_not_contains": ["not ok "]
+    },
+    {
+      "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/priority-credit-rollback.js\"",
+      "exit_code": 0,
+      "stdout_contains": ["\"ok\":true"],
+      "stdout_not_contains": [],
+      "contract_refs": [
+        "Process renewals globally by `priority` descending, then `requested_at` ascending, then `id` ascending.",
+        "A rejected renewal with reason `payment_required` must not consume any credits, even if it tentatively applied credits before discovering the remaining due exceeded `max_due_cents`.",
+        "Usable credits are credits for the same customer with `expires_at >= as_of` and `cents > 0`, consumed by `expires_at` ascending, then `id` ascending.",
+        "`rejected` is ordered in the original input renewal order. Each row has keys `id` and `reason`.",
+        "`remaining_credits` includes only non-expired credits with positive cents after accepted renewals, sorted by `customer`, then `expires_at`, then `id`. Each row has keys `id`, `customer`, `cents`, and `expires_at`.",
+        "Output row key names and nested `credits` key names match the visible spec exactly, with no aliased or extra keys."
+      ]
+    },
+    {
+      "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/duplicate-renewal-error.js\"",
+      "exit_code": 0,
+      "stdout_contains": ["\"ok\":true"],
+      "stdout_not_contains": [],
+      "contract_refs": [
+        "Before processing any renewal, duplicate renewal ids are invalid input: exit `2`, write exactly one JSON error object `{ \"error\": \"duplicate_renewal_id\", \"id\": string }` to stderr, and write no stdout."
+      ]
+    }
+  ],
+  "forbidden_patterns": [
+    {
+      "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
+      "description": "silent catch returning fallback in subscription renewal path",
+      "files": ["bin/cli.js", "tests/cli.test.js"],
+      "severity": "disqualifier"
+    },
+    {
+      "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
+      "description": "empty catch block",
+      "files": ["bin/cli.js", "tests/cli.test.js"],
+      "severity": "disqualifier"
+    },
+    {
+      "pattern": "/\\*\\s*eslint-disable",
+      "description": "eslint-disable without scoped justification",
+      "files": ["bin/cli.js"],
+      "severity": "disqualifier"
+    }
+  ],
+  "required_files": ["bin/cli.js", "tests/cli.test.js"],
+  "forbidden_files": [],
+  "max_deps_added": 0,
+  "spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
+}

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json ADDED Viewed

@@ -0,0 +1,10 @@
+{
+  "id": "F32-cli-subscription-renewal",
+  "category": "high-risk",
+  "difficulty": "hard",
+  "timeout_seconds": 1800,
+  "required_tools": ["node"],
+  "browser": false,
+  "deps_change_expected": false,
+  "intent": "Add a subscription renewal CLI command that applies expiring credits in priority order with rollback and exact machine-readable output."
+}

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh ADDED Viewed

@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+set -euo pipefail

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md ADDED Viewed

@@ -0,0 +1,70 @@
+---
+id: "F32-cli-subscription-renewal"
+title: "Subscription renewal command"
+status: planned
+complexity: high
+depends-on: []
+---
+# F32 Subscription renewal command
+## Context
+`bench-cli` currently has greeting and version commands only. The task:
+add a `renew-subscriptions` command that applies subscription renewal requests,
+uses expiring customer credits in deterministic order, rolls back credits when a
+renewal cannot be paid, and prints exact invoice, rejected, and remaining-credit
+rows.
+This is billing reconciliation. Downstream finance tools parse stdout as JSON,
+so field names, error objects, and row shapes are part of the contract.
+## Requirements
+- [ ] `bench-cli renew-subscriptions --input <path>` reads JSON shaped as `{ "as_of": string, "plans": [plan], "customers": [customer], "credits": [credit], "renewals": [renewal] }`.
+- [ ] Each plan has keys `id`, `monthly_cents`, `included_seats`, and `overage_cents`.
+- [ ] Each customer has keys `id`, `plan`, and `active`.
+- [ ] Each credit has keys `id`, `customer`, `cents`, and `expires_at`.
+- [ ] Each renewal has keys `id`, `customer`, `seats`, `months`, `priority`, `requested_at`, and `max_due_cents`.
+- [ ] Before processing any renewal, duplicate renewal ids are invalid input: exit `2`, write exactly one JSON error object `{ "error": "duplicate_renewal_id", "id": string }` to stderr, and write no stdout.
+- [ ] Before processing any renewal, all cents, seat, month, and priority fields must be integers; `monthly_cents`, `overage_cents`, `included_seats`, and `cents` must be non-negative, and `seats` and `months` must be positive. Invalid input exits `2` with one JSON error object and no stdout.
+- [ ] Process renewals globally by `priority` descending, then `requested_at` ascending, then `id` ascending.
+- [ ] A renewal rejects with reason `unknown_customer` when the customer does not exist, `inactive_customer` when the customer is inactive, and `unknown_plan` when the customer's plan does not exist.
+- [ ] Renewal subtotal is `(plan.monthly_cents + max(0, seats - plan.included_seats) * plan.overage_cents) * months`.
+- [ ] Usable credits are credits for the same customer with `expires_at >= as_of` and `cents > 0`, consumed by `expires_at` ascending, then `id` ascending.
+- [ ] A renewal accepts only when `subtotal_cents - credit_applied_cents <= max_due_cents`.
+- [ ] A rejected renewal with reason `payment_required` must not consume any credits, even if it tentatively applied credits before discovering the remaining due exceeded `max_due_cents`.
+- [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `invoices`, `rejected`, `remaining_credits`.
+- [ ] `invoices` is ordered in processing order. Each row has keys `id`, `customer`, `subtotal_cents`, `credit_applied_cents`, `due_cents`, and `credits`.
+- [ ] Each invoice `credits` row has keys `id` and `applied_cents`, ordered by the credit consumption order.
+- [ ] `rejected` is ordered in the original input renewal order. Each row has keys `id` and `reason`.
+- [ ] `remaining_credits` includes only non-expired credits with positive cents after accepted renewals, sorted by `customer`, then `expires_at`, then `id`. Each row has keys `id`, `customer`, `cents`, and `expires_at`.
+- [ ] `tests/cli.test.js` is updated. Existing tests still pass and at least two new tests cover `renew-subscriptions`: one successful priority/rollback scenario and one validation failure.
+## Constraints
+- **No new npm dependencies.**
+- **No floating money output.** All public amounts are integer cents.
+- **No hidden mutable global state.** The command must derive output only from the input JSON for that invocation.
+- **No silent catches.** Parse and file-read failures must emit a visible JSON error to stderr and exit `2`.
+- **No extra stdout/stderr text** on the success path; downstream tooling parses stdout as JSON.
+## Out of Scope
+- Persisting renewal state between command invocations.
+- Adding invoices to a database or writing files.
+- Adding currencies, payment gateways, or tax rules.
+- Adding web UI or server routes.
+- Touching `server/`, `web/`, or `tests/server.test.js`.
+## Verification
+- `node --test tests/cli.test.js` exits 0.
+- A later high-priority renewal is processed before an earlier low-priority renewal, and the low-priority renewal can lose usable credits because of that ordering.
+- A rejected `payment_required` renewal leaves all tentatively applied credits available for later renewals.
+- Credits are consumed by `expires_at` ascending, then `id` ascending, and expired or zero-cent credits are absent from `remaining_credits`.
+- `rejected` rows are reported in the original input renewal order, even though processing order is priority based.
+- Duplicate renewal ids exit `2`, print exactly `{ "error": "duplicate_renewal_id", "id": string }` to stderr, and print no stdout.
+- Output row key names and nested `credits` key names match the visible spec exactly, with no aliased or extra keys.
+- `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
+- Solo-headroom hypothesis: solo_claude is expected to miss payment-required credit rollback or expiring-credit consumption order; observable command `node "$BENCH_FIXTURE_DIR/verifiers/priority-credit-rollback.js"` exposes the miss.

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt ADDED Viewed

@@ -0,0 +1,3 @@
+Add a subscription renewal CLI command that applies expiring credits in priority order with rollback and exact machine-readable output.
+The command should be `bench-cli renew-subscriptions --input <path>`. It reads plans, customers, credits, and renewal requests from JSON, processes renewals by priority, applies usable credits in expiration order, rejects unpaid renewals without consuming their tentative credits, and prints JSON invoices, rejected rows, and remaining credits. Keep the change scoped to the CLI and CLI tests, with no new dependencies and no extra stdout/stderr text.

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js ADDED Viewed

@@ -0,0 +1,42 @@
+#!/usr/bin/env node
+const assert = require('node:assert/strict'); // strict mode: equal/deepEqual compare like strictEqual/deepStrictEqual
+const { mkdtempSync, writeFileSync, rmSync } = require('node:fs');
+const { tmpdir } = require('node:os');
+const { join } = require('node:path');
+const { spawnSync } = require('node:child_process');
+const workdir = process.env.BENCH_WORKDIR || process.cwd(); // arm work tree; falls back to the current directory when BENCH_WORKDIR is unset
+const tmp = mkdtempSync(join(tmpdir(), 'f32-renewal-dup-')); // scratch directory for the generated input; removed in the finally block
+try {
+  const inputPath = join(tmp, 'input.json');
+  writeFileSync(inputPath, JSON.stringify({
+    as_of: '2026-05-15',
+    plans: [
+      { id: 'starter', monthly_cents: 1000, included_seats: 5, overage_cents: 200 }
+    ],
+    customers: [
+      { id: 'c1', plan: 'starter', active: true }
+    ],
+    credits: [],
+    renewals: [
+      { id: 'dup-renewal', customer: 'c1', seats: 5, months: 1, priority: 1, requested_at: '2026-05-01', max_due_cents: 1000 }, // first use of the id; otherwise a valid renewal
+      { id: 'dup-renewal', customer: 'missing', seats: 5, months: 1, priority: 9, requested_at: '2026-05-02', max_due_cents: 1000 } // same id again; customer 'missing' is not in `customers` and priority 9 sorts first, so the duplicate check has to run before any processing
+    ]
+  }));
+  const proc = spawnSync('node', ['bin/cli.js', 'renew-subscriptions', '--input', inputPath], { // runs the CLI under test synchronously
+    cwd: workdir, // so the relative 'bin/cli.js' resolves inside the arm work tree
+    encoding: 'utf8' // stdout/stderr come back as strings rather than Buffers
+  });
+  assert.equal(proc.status, 2, proc.stderr || proc.stdout); // invalid input must exit 2; the CLI's own output is passed as the failure message
+  assert.equal(proc.stdout, ''); // the error path must write nothing to stdout
+  assert.deepEqual(JSON.parse(proc.stderr), { // stderr must parse as one JSON value equal to exactly this object (no extra keys)
+    error: 'duplicate_renewal_id',
+    id: 'dup-renewal'
+  });
+  process.stdout.write(JSON.stringify({ ok: true }) + '\n'); // success sentinel: prints {"ok":true}, which expected.json requires in stdout
+} finally {
+  rmSync(tmp, { recursive: true, force: true }); // clean up the scratch directory whether or not an assertion threw
+}

package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js ADDED Viewed

@@ -0,0 +1,70 @@
+#!/usr/bin/env node
+const assert = require('node:assert/strict'); // strict mode: equal/deepEqual compare like strictEqual/deepStrictEqual
+const { mkdtempSync, writeFileSync, rmSync } = require('node:fs');
+const { tmpdir } = require('node:os');
+const { join } = require('node:path');
+const { spawnSync } = require('node:child_process');
+const workdir = process.env.BENCH_WORKDIR || process.cwd(); // arm work tree; falls back to the current directory when BENCH_WORKDIR is unset
+const tmp = mkdtempSync(join(tmpdir(), 'f32-renewal-')); // scratch directory for the generated input; removed in the finally block
+try {
+  const inputPath = join(tmp, 'input.json');
+  writeFileSync(inputPath, JSON.stringify({
+    as_of: '2026-05-15', // credits with expires_at before this date are not usable
+    plans: [
+      { id: 'starter', monthly_cents: 1000, included_seats: 5, overage_cents: 200 },
+      { id: 'pro', monthly_cents: 3000, included_seats: 10, overage_cents: 150 }
+    ],
+    customers: [
+      { id: 'c1', plan: 'starter', active: true },
+      { id: 'c2', plan: 'pro', active: true } // has a credit but no renewal in this scenario
+    ],
+    credits: [
+      { id: 'cr-late', customer: 'c1', cents: 500, expires_at: '2026-06-30' }, // usable; listed first but expires later, so it is consumed second
+      { id: 'cr-expired', customer: 'c1', cents: 999, expires_at: '2026-04-01' }, // expires before as_of: not usable and absent from remaining_credits
+      { id: 'cr-early', customer: 'c1', cents: 400, expires_at: '2026-05-31' }, // usable; earliest expiry for c1, so it is consumed first
+      { id: 'cr-zero', customer: 'c1', cents: 0, expires_at: '2026-05-20' }, // zero cents: not usable and absent from remaining_credits
+      { id: 'cr-c2', customer: 'c2', cents: 1000, expires_at: '2026-12-31' } // other customer's credit; must come through untouched
+    ],
+    renewals: [
+      { id: 'r-low', customer: 'c1', seats: 5, months: 1, priority: 1, requested_at: '2026-05-01', max_due_cents: 100 }, // lowest priority, processed last: subtotal 1000, no c1 credits left, 1000 > 100 -> payment_required
+      { id: 'r-mid', customer: 'c1', seats: 8, months: 1, priority: 10, requested_at: '2026-05-02', max_due_cents: 0 }, // despite the name this is the HIGHEST priority, processed first: 1600 - 900 = 700 > 0 -> payment_required, credits rolled back
+      { id: 'r-high', customer: 'c1', seats: 8, months: 1, priority: 9, requested_at: '2026-05-03', max_due_cents: 800 } // processed second: 1600 - 900 = 700 <= 800 -> accepted, consuming both usable c1 credits
+    ]
+  }));
+  const proc = spawnSync('node', ['bin/cli.js', 'renew-subscriptions', '--input', inputPath], { // runs the CLI under test synchronously
+    cwd: workdir, // so the relative 'bin/cli.js' resolves inside the arm work tree
+    encoding: 'utf8' // stdout/stderr come back as strings rather than Buffers
+  });
+  assert.equal(proc.status, 0, proc.stderr || proc.stdout); // success path must exit 0; the CLI's own output is passed as the failure message
+  assert.equal(proc.stderr, ''); // success path must write nothing to stderr
+  const output = JSON.parse(proc.stdout); // stdout must be a single parseable JSON value
+  assert.deepEqual(output, { // strict deep comparison: extra or aliased keys anywhere in the output fail
+    invoices: [
+      {
+        id: 'r-high',
+        customer: 'c1',
+        subtotal_cents: 1600, // (1000 + (8 - 5) * 200) * 1 month
+        credit_applied_cents: 900, // 400 from cr-early + 500 from cr-late
+        due_cents: 700, // 1600 - 900, within r-high's max_due_cents of 800
+        credits: [
+          { id: 'cr-early', applied_cents: 400 }, // consumed first: expires 2026-05-31
+          { id: 'cr-late', applied_cents: 500 } // consumed second: expires 2026-06-30
+        ]
+      }
+    ],
+    rejected: [
+      { id: 'r-low', reason: 'payment_required' }, // rejected rows follow input order (r-low, r-mid), not processing order (r-mid, r-low)
+      { id: 'r-mid', reason: 'payment_required' }
+    ],
+    remaining_credits: [
+      { id: 'cr-c2', customer: 'c2', cents: 1000, expires_at: '2026-12-31' } // c1's credits are spent, expired, or zero, so only c2's credit remains
+    ]
+  });
+  process.stdout.write(JSON.stringify({ ok: true }) + '\n'); // success sentinel: prints {"ok":true}, which expected.json requires in stdout
+} finally {
+  rmSync(tmp, { recursive: true, force: true }); // clean up the scratch directory whether or not an assertion threw
+}

package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md CHANGED Viewed

@@ -33,8 +33,15 @@ tests won't surface.
   stricter browser-required gating; today the fixture only checks file
   presence in verification.
+## Current status
+Rejected as pair-lift evidence. `20260512-f4-web-headroom` measured bare 70 /
+solo_claude 92, with a +22 solo-over-bare margin, but failed headroom because
+bare exceeded 60, solo exceeded 80, and bare carried judge/result/verify
+disqualifiers. Rework the fixture or verifier before spending a pair arm on it.
 ## Rotation trigger
-When both arms consistently produce correct output AND include accessible
-markup without pipeline intervention, rotate to a harder UI task (e.g., a
-form with validation states).
+When both `bare` and `solo_claude` consistently produce correct output AND
+include accessible markup without pipeline intervention, rotate to a harder UI
+task (e.g., a form with validation states).

package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md CHANGED Viewed

@@ -31,6 +31,13 @@ calls it done. Verification catches that.
 - **Phase 2.5 FIX LOOP** runs at least once. A fixture passing with 0 fix rounds is a smoke signal that the test-trap design is too lenient; inspect.
 - **Phase 1.4 BUILD GATE** uses `node --test` which exits non-zero on any failure, forcing route to 2.5.
+## Current status
+Rejected as pair-lift evidence. `20260512-f5-fixloop-headroom` measured bare
+99 / solo_claude 99, with bare and solo each passing 5/5 verification commands.
+It fails both headroom preconditions and should remain a fix-loop control unless
+reworked.
 ## Rotation trigger
 When fix rounds consistently = 0 across two shipped versions, the trap is too

package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md CHANGED Viewed

@@ -30,6 +30,11 @@ of over-reaching. As models improve, they should take the stdlib path more
 often. Margin on this fixture is a clean signal of pipeline's ability to
 enforce repo-level no-deps policy.
+Current status: rejected as pair-lift evidence. `20260512-f6-checksum-headroom`
+measured bare 97 / solo_claude 96, with `bare` and `solo_claude` passing 6/6
+verification commands. It fails both headroom preconditions and should remain a
+dep-audit control unless reworked.
 ## Rotation trigger
 When bare arms consistently avoid dependency-adding and pipeline still

package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md CHANGED Viewed

@@ -45,6 +45,13 @@ If bare somehow beats variant (variant fixes the bug = scope violation,
 bare doesn't), that's a real signal that the pipeline's scope discipline
 is weak and needs CRITIC prompt tuning.
+## Current status
+Rejected as pair-lift evidence. `20260512-f7-scope-headroom` measured bare
+99 / solo_claude 100, with bare and solo each passing 6/6 verification commands.
+It fails both headroom preconditions and should remain a scope-discipline
+control unless reworked.
 ## Rotation trigger
 Retire when variant scope-discipline axis > 24 on two shipped versions.

package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md CHANGED Viewed

@@ -12,6 +12,9 @@ Margin ∈ [-3, +3] is the expected range. Both arms should produce small,
 reasonable improvements. The judge may slightly prefer one or the other
 based on taste.
+Pair-candidate status: rejected by design. F8 is a known-limit ambiguity
+barometer whose expected margin is a tie range, not pair-lift evidence.
 Margin > +3 means the fixture is no longer a known limit — either the
 harness got notably better at ambiguous specs (improve prompt or reuse the
 pattern elsewhere), or the task is drifting from its "under-specified"

package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md CHANGED Viewed

@@ -2,7 +2,7 @@
 id: "F8-known-limit-ambiguous"
 title: "Improve the CLI"
 status: planned
-complexity: ambiguous
+complexity: medium
 depends-on: []
 ---

package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md CHANGED Viewed

@@ -57,13 +57,15 @@ The harness refuses `--resolve-skill old` on F9 with a hard error.
   This asymmetry is INTENTIONAL — the fixture tests total-output quality,
   not per-file quality.
-## Variant artifact check (out-of-band, NOT in expected.json)
+## Skill-driven artifact check (out-of-band, NOT in expected.json)
 Per Codex R0.5 §B: `expected.json.verification_commands` apply to ALL arms
 (see `run-fixture.sh:472`). A `docs/specs/**` check in expected.json would
-punish the bare arm (which doesn't run ideate). Variant-only artifact
+punish the bare arm (which doesn't run ideate). Skill-driven artifact
 verification lives in `scripts/check-f9-artifacts.py`, which runs AFTER
-the per-fixture verification block and asserts variant/solo arms produced:
+the per-fixture verification block and asserts every non-bare skill arm
+(`variant`, `solo_claude`, `l2_gated`, `l2_risk_probes`, `l2_forced`)
+produced:
 - `docs/specs/<id>-<slug>/spec.md` exists.
 - `docs/specs/<id>-<slug>/spec.expected.json` exists.
@@ -91,3 +93,13 @@ the per-fixture verification block and asserts variant/solo arms produced:
 F9 is the last fixture we rotate — it's the anchor. If it saturates
 (variant consistently > 95), the whole suite needs a harder novice-flow
 anchor before we retire this one.
+## Current pair-evidence status
+Rejected as pair-lift evidence until reworked. `20260512-f9-e2e-headroom`
+measured bare 60 / solo_claude 90 with a +30 solo-over-bare margin, and
+`check-f9-artifacts.py` passed for bare (exempt) and solo_claude. The headroom
+gate still failed because bare headroom was 0 < 5, solo_claude exceeded 80, and
+bare carried a judge disqualifier. Keep F9 as the novice-flow anchor, but do not
+spend pair arms on it as pair evidence until the fixture is reworked and clears
+a fresh headroom gate.

package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md CHANGED Viewed

@@ -56,4 +56,4 @@ inside `/devlyn:resolve` (no separate preflight skill in the 2-skill design).
 - `cd /tmp && node <worktree>/bin/cli.js gitstats` (from outside a repo — use the worktree's absolute path) exits 2.
 - `node --test tests/` passes.
-(Variant-only artifact checks — `docs/specs/<id>-<slug>/spec.md` + `spec.expected.json` existence, transcript fingerprint — live in `scripts/check-f9-artifacts.py`, NOT in the shared verification block above. See NOTES.md.)
+(Skill-driven artifact checks — `docs/specs/<id>-<slug>/spec.md` + `spec.expected.json` existence, transcript fingerprint — live in `scripts/check-f9-artifacts.py`, NOT in the shared verification block above. Bare is exempt. See NOTES.md.)

package/benchmark/auto-resolve/fixtures/SCHEMA.md CHANGED Viewed

@@ -19,16 +19,16 @@ Every fixture is a directory under `benchmark/auto-resolve/fixtures/F<N>-<slug>/
 - **id** — matches directory name, used across artifacts.
 - **category** — one of `trivial | medium | high-risk | stress | edge | e2e`. Drives which ship-gate rule applies.
-- **difficulty** — expected difficulty independent of category. Rubric uses this only for saturation detection (when both arms > 95 for two versions, flag fixture for rotation).
+- **difficulty** — expected difficulty independent of category. Rubric uses this only for saturation detection (when `bare` and `solo_claude` both exceed 95 for two versions, flag fixture for rotation).
 - **timeout_seconds** — per-arm hard timeout. Runner kills the arm at this limit and marks result `TIMEOUT`.
 - **required_tools** — binaries the arm's environment must provide. Runner checks before invocation.
 - **browser** — true if arm must be able to run Playwright. Runner uses this to decide whether `test-repo`'s Playwright deps get installed before the arm starts.
-- **deps_change_expected** — true if the task involves modifying `package.json` / lockfiles. Variant's CRITIC security sub-pass is expected to trigger native `security-review` dep audit when true.
-- **intent** — **load-bearing**. A short plain-language statement shared by both arms. `spec.md` formalizes it into auto-resolve-ready form; `task.txt` renders it as a direct prompt. A CI lint ensures both derive from this field and stay in sync.
+- **deps_change_expected** — true if the task involves modifying `package.json` / lockfiles. The pipeline arm's CRITIC security sub-pass is expected to trigger native `security-review` dep audit when true.
+- **intent** — **load-bearing**. A short plain-language statement shared by all arms. `spec.md` formalizes it into resolve-ready form; `task.txt` renders it as a direct prompt. A CI lint ensures both derive from this field and stay in sync.
 ## spec.md
-Auto-resolve-ready spec for the pipeline arm. Same format `/devlyn:ideate` produces:
+Resolve-ready spec for the pipeline arm. Same format `/devlyn:ideate` produces:
 ```markdown
 ---
@@ -52,12 +52,17 @@ depends-on: []
 - Concrete, with reasoning for each (not bare).
 ## Out of Scope
-- Explicit "must NOT build" list. Audited by preflight as anti-commitments.
+- Explicit "must NOT build" list. Audited by resolve/JUDGE as anti-commitments.
 ## Verification
 - Concrete commands whose expected behavior is named.
 ```
+`complexity` is the resolve spec contract enum, not the benchmark difficulty
+label. Use `trivial`, `medium`, or `high` for new fixtures; `large` is accepted
+only for compatibility with external/legacy specs. Keep ambiguous calibration
+labels in `metadata.difficulty`, not spec frontmatter.
 ## task.txt
 Bare-arm input. Plain English, same intent, but framed as a user request rather than a formal spec. Intentionally lacks the structured Requirements/Constraints/Out-of-Scope sections — bare must make those calls itself. Must not leak "use the devlyn skill" hints.
@@ -97,10 +102,16 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
   Commands run with `BENCH_WORKDIR` (fresh arm work tree) and
   `BENCH_FIXTURE_DIR` (the fixture directory outside the arm work tree) in
   the environment. Put discriminator/oracle scripts under the fixture
-  directory when the arm should not read the verifier source.
+  directory when the arm should not read the verifier source; any
+  `$BENCH_FIXTURE_DIR/...` file path referenced by a command must exist and
+  must not escape the fixture directory. Hidden oracle commands must reference
+  the verifier through an explicit `$BENCH_FIXTURE_DIR/...` path rather than
+  `cd "$BENCH_FIXTURE_DIR"` indirection.
   Any command that references `BENCH_FIXTURE_DIR` is a hidden oracle and must
   include `contract_refs`: exact substrings from `spec.md` proving the oracle
-  tests a visible contract rather than inventing a narrower one.
+  tests a visible contract rather than inventing a narrower one. Hidden oracle
+  commands must also assert `stdout_contains: ["\"ok\":true"]` so a verifier
+  cannot pass silently without emitting the success sentinel.
 - **forbidden_patterns** — regexes scanned across `diff.patch`. Match at `severity: "disqualifier"` is a hard-floor fail. Match at `severity: "warning"` goes into the judge's critical-findings report.
 - **required_files** — must exist after the arm runs.
 - **forbidden_files** — must NOT appear in the arm's diff.
@@ -108,6 +119,17 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
 - **spec_output_files** — files or globs that define the authorized output surface for Tier B scope tracing.
 - **max_deps_added** — count of new entries under `dependencies`/`devDependencies` in `package.json`. Exceeds → hard-floor fail.
+## high-risk metadata
+Fixtures with `metadata.json` `category: "high-risk"` must include at least
+one resolve risk-trigger term in `metadata.intent` or `spec.md`, matching the
+conditional pair/risk-probe triggers used by `/devlyn:resolve`: security/auth,
+money/pricing/tax/ledger, persistence/data mutation, idempotency/replay,
+API/webhook/signature, allocation/scheduling/inventory/rollback/transaction,
+priority, or output/response-shape contracts. This keeps future pair-evidence
+candidates from relying on a label that would not actually activate the pair
+path.
 ## NOTES.md
 Human-readable explanation of why this fixture exists. Must answer:
@@ -119,6 +141,20 @@ Human-readable explanation of why this fixture exists. Must answer:
 Notes are read during suite design review, not during runs.
+If `NOTES.md` records that a fixture failed a headroom gate or was rejected as
+pair-lift evidence, add the fixture to
+`benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh` in the same change.
+Rejected controls should remain replayable, but they must not be silently
+re-spent as fresh pair candidates.
+## Retired fixtures
+Move fixtures that are no longer valid active golden-suite evidence to
+`fixtures/retired/<fixture-id>/`. Retired fixtures are preserved for replay and
+historical diagnosis, but `run-suite.sh` does not auto-discover them. Each
+retired fixture must keep the six fixture files plus `RETIRED.md` explaining
+the run id or concrete reason it left the active suite.
 ## setup.sh
 Deterministic starting state. Runs against a fresh copy of `benchmark/auto-resolve/fixtures/test-repo/` before either arm starts. Common uses:
@@ -139,4 +175,14 @@ A CI lint step (`scripts/lint-fixtures.sh`) verifies:
 - `metadata.intent` substring appears in both `spec.md::Context` and `task.txt` (≥ 60% token overlap using simple tokenization).
 - `spec.md` frontmatter `id` matches directory name.
 - `expected.json` is valid JSON.
+- Active high-risk fixtures include a resolve risk-trigger term in
+  `metadata.intent` or `spec.md`.
+- Active fixtures whose `NOTES.md` records headroom-gate failure or pair-lift
+  rejection are covered by `pair-rejected-fixtures.sh`.
+- Active fixtures whose `NOTES.md` records `pair_evidence_passed` include an
+  actionable solo-headroom hypothesis in `spec.md`, using the same checker as
+  shadow candidates, and the hypothesis observable command must match a
+  `verification_commands[].cmd` entry in `expected.json`.
 - `setup.sh` is executable.
+- Retired fixtures under `fixtures/retired/F*/` keep `RETIRED.md`, preserve the
+  six fixture files, and are excluded from active suite discovery.

package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md ADDED Viewed

@@ -0,0 +1,37 @@
+# F27 CLI subscription proration
+## Failure mode
+This fixture detects billing implementations that look correct on one happy
+path but mishandle date boundaries, per-segment rounding, duplicate credits, or
+hardcode plan and tax rules instead of reading the seeded data file.
+## Pipeline phase target
+PLAN must separate input validation, period segmentation, per-segment proration,
+credit de-duplication, tax calculation, and output formatting. VERIFY should
+probe date boundary and data-source variants because a small example can pass
+while production invoices are off by one day or one cent.
+## Why existing fixtures do not cover it
+F25 covers cart promotions and F26 covers payout ledger events. This fixture
+adds subscription billing proration: effective-date segmentation, period-day
+denominators, credit idempotency, and tax after credits. It was intended to
+cover a pair-risk-probe gap, but the first real headroom smoke showed the
+visible contract was explicit enough for `solo_claude` to solve cleanly.
+## Retirement
+Retire or replace if both bare and solo consistently exceed the headroom
+thresholds, or if a later billing fixture provides the same proration and
+idempotent-credit signal with lower wall time.
+## Measurement notes
+- `20260511-f27-headroom-smoke-061401`: headroom FAIL. Judge scores were
+  bare 33 / solo_claude 94 (`solo_over_bare` +61). Bare passed 1 of 3
+  verification commands; solo passed 3 of 3 with terminal `PASS`.
+- Do not spend a pair arm on this fixture in its current shape. It needs either
+  a harder visible contract that solo misses without pair probes, or rotation
+  out of the pair-candidate set.

package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md ADDED Viewed

@@ -0,0 +1,13 @@
+# Retired: F27 CLI subscription proration
+Retired from the active golden suite after headroom smoke
+`20260511-f27-headroom-smoke-061401`.
+Reason: `solo_claude` scored 94, exceeding the headroom ceiling of 80, while
+`bare` scored 33 and passed only 1 of 3 verification commands. The fixture is
+too explicit for current solo/pair lift measurement and too expensive to keep
+in the default suite.
+Future use: rework the visible contract so it creates a fair pair-risk-probe
+gap, or replace it with a different billing fixture. Do not count this fixture
+as pair evidence in its current form.

package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json ADDED Viewed

@@ -0,0 +1,56 @@
+{
+  "verification_commands": [
+    {
+      "cmd": "node --test tests/cli.test.js",
+      "exit_code": 0,
+      "stdout_contains": [],
+      "stdout_not_contains": ["not ok "]
+    },
+    {
+      "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/exact-proration.js\"",
+      "exit_code": 0,
+      "stdout_contains": ["\"ok\":true"],
+      "stdout_not_contains": [],
+      "contract_refs": [
+        "Dates are interpreted as UTC calendar dates. The billing period start is inclusive and the end is exclusive.",
+        "Each segment amount is `Math.round(plan.monthly_cents * segment_days / period_days)`. Round each segment independently before summing.",
+        "Identical duplicate credits, where both `id` and `amount_cents` match, are idempotent and apply only once.",
+        "Tax is computed after credits: `tax_cents = Math.round((subtotal_cents - credit_cents) * tax_rate)`."
+      ]
+    },
+    {
+      "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/rules-source-and-conflict.js\"",
+      "exit_code": 0,
+      "stdout_contains": ["\"ok\":true"],
+      "stdout_not_contains": [],
+      "contract_refs": [
+        "Plan monthly prices and state tax rates come from `data/subscription-plans.json`. Do not hardcode these values in the command implementation.",
+        "Credits with the same `id` but different `amount_cents` are conflicting duplicates. They exit `2` with exact error shape `{ \"error\": \"conflicting_credit\", \"id\": string }`, write it to stderr, and write nothing to stdout."
+      ]
+    }
+  ],
+  "forbidden_patterns": [
+    {
+      "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
+      "description": "silent catch returning fallback in subscription-invoice path",
+      "files": ["bin/cli.js"],
+      "severity": "disqualifier"
+    },
+    {
+      "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
+      "description": "empty catch block",
+      "files": ["bin/cli.js"],
+      "severity": "disqualifier"
+    },
+    {
+      "pattern": "/\\*\\s*eslint-disable",
+      "description": "eslint-disable without scoped justification",
+      "files": ["bin/cli.js"],
+      "severity": "disqualifier"
+    }
+  ],
+  "required_files": ["bin/cli.js", "tests/cli.test.js"],
+  "forbidden_files": [],
+  "max_deps_added": 0,
+  "spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
+}

package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json ADDED Viewed

@@ -0,0 +1,10 @@
+{
+  "id": "F27-cli-subscription-proration",
+  "category": "high-risk",
+  "difficulty": "high",
+  "timeout_seconds": 1800,
+  "required_tools": ["node"],
+  "browser": false,
+  "deps_change_expected": false,
+  "intent": "Add a subscription-invoice command that prorates plan changes across a billing period, applies idempotent credits, reads plan and tax rules from data/subscription-plans.json, and prints exact integer-cent invoice totals."
+}

package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh ADDED Viewed

@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+mkdir -p data
+cat > data/subscription-plans.json <<'JSON'
+{
+  "plans": {
+    "starter": { "monthly_cents": 1200 },
+    "growth": { "monthly_cents": 3600 },
+    "scale": { "monthly_cents": 9600 }
+  },
+  "tax_rates": {
+    "CA": 0.0825,
+    "NY": 0.04,
+    "OR": 0
+  }
+}
+JSON