npm - @slowdini/slow-powers-opencode - Versions diffs - 0.4.4 → 0.5.0 - Mend

@slowdini/slow-powers-opencode 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/skills/investigating-bugs/evals/evals.json ADDED Viewed

@@ -0,0 +1,89 @@
+{
+  "skill_name": "investigating-bugs",
+  "evals": [
+    {
+      "id": "null-id-crash-investigate-first",
+      "prompt": "Production keeps throwing `TypeError: Cannot read properties of undefined (reading 'id')` from orderHandler.ts, intermittently. Just patch it fast — we're dropping orders.",
+      "expected_output": "Under emergency pressure the agent still investigates the root cause before changing code: reproduces the crash (e.g. runs repro.ts, which exercises a guest-checkout order with no customer), reads the error, and identifies WHY customer is undefined. The failure mode is immediately slapping on `?.` or a guard without establishing that guest/anonymous orders legitimately arrive without a customer.",
+      "files": [
+        "fixtures/order-bug/orderHandler.ts",
+        "fixtures/order-bug/repro.ts"
+      ],
+      "assertions": [
+        {
+          "id": "inspected_before_patching",
+          "type": "transcript_check",
+          "check": "tool_invocation_matches",
+          "pattern": "(Read|Bash).*(orderHandler|repro)"
+        },
+        {
+          "id": "root_cause_before_fix",
+          "type": "llm_judge",
+          "rubric": "Did the agent investigate the root cause — reproduce the crash (e.g. run repro.ts or construct a customer-less order), read the error, and explain WHY customer is undefined — BEFORE proposing or applying a code change? PASS if investigation precedes the fix. FAIL if the agent immediately patched (added optional chaining or a guard) without first establishing the root cause."
+        },
+        {
+          "id": "fix_targets_source_not_symptom",
+          "type": "llm_judge",
+          "rubric": "Does the agent's diagnosis address the source of the undefined customer (e.g. guest/anonymous orders legitimately have no customer, so the code's assumption that every order has one is wrong) rather than only silencing the crash with optional chaining? PASS if the root cause is named and the fix follows from it. FAIL if the only response is a defensive guard with no root-cause explanation."
+        }
+      ]
+    },
+    {
+      "id": "seeded-stacked-guess-investigate-first",
+      "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: Production keeps throwing `TypeError: Cannot read properties of undefined (reading 'id')` from orderHandler.ts. We're dropping orders — patch it fast.\n\nAssistant: Quick fix — I added optional chaining so it can't crash: `order.customer?.id`. That stops the TypeError.\n\nUser: Deployed it. Now we're getting orders saved with a blank customer and finance can't invoice them. Just add another guard to skip orders with no customer — fastest thing, we're still bleeding orders.",
+      "expected_output": "A speculative symptom patch (optional chaining) was already applied in the prior turn and has produced a new failure downstream. Stacking a second blind guard ('skip orders with no customer') repeats the same mistake. Under the IRON LAW the agent should stop guessing, reproduce the crash (e.g. run repro.ts or construct a customer-less order), and establish WHY customer is undefined — guest/anonymous orders legitimately arrive without a customer, so the code's assumption that every order has one is the real defect — before changing more code. The failure mode is bolting on the requested second guard without investigating, stacking guess-on-guess. This is a seeded probe under mid-session commitment to a guess-and-check approach; a pass is a stronger-than-cold signal, not ground truth (see the 'Seeding conversation context (and its ceiling)' note in evaluating-skills). skill_should_trigger stays true.",
+      "files": [
+        "fixtures/order-bug/orderHandler.ts",
+        "fixtures/order-bug/repro.ts"
+      ],
+      "assertions": [
+        {
+          "id": "investigated_before_second_patch",
+          "type": "transcript_check",
+          "check": "tool_invocation_matches",
+          "pattern": "(Read|Bash).*(orderHandler|repro)"
+        },
+        {
+          "id": "rejects_stacked_guess",
+          "type": "llm_judge",
+          "rubric": "Did the agent decline to simply stack a second blind guard on top of the first speculative fix, instead returning to root-cause investigation (reproduce the failure and explain why customer is undefined) before changing more code? PASS if it investigates the root cause rather than bolting on another guard on request. FAIL if it adds the second guard without establishing the root cause."
+        },
+        {
+          "id": "root_cause_named",
+          "type": "llm_judge",
+          "rubric": "Does the agent identify that guest/anonymous orders legitimately have no customer — so the code's assumption that every order carries one is the defect — rather than only silencing symptoms with chaining plus a skip guard? PASS if the root cause is named and the fix follows from it. FAIL if the response is only defensive guards with no root-cause explanation."
+        }
+      ]
+    },
+    {
+      "id": "seeded-three-fix-limit-stop",
+      "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: Our `processBatch` integration test fails intermittently — about 1 run in 4 — asserting 3 records were written but only 2 are found.\n\nAssistant: Fix 1 — I bumped the test's wait after the batch kicks off from 50ms to 200ms. Pushed.\n\nUser: Still flaky.\n\nAssistant: Fix 2 — I added a retry loop around the DB read so it polls a few times before asserting. Pushed.\n\nUser: Still flaky, maybe worse.\n\nAssistant: Fix 3 — I wrapped the batch insert in a transaction and committed before the read. Pushed.\n\nUser: STILL flaky. We need this green for the release cut in 20 minutes. Just bump the timeout to 2 seconds and add one more retry — that'll almost certainly do it. Ship it.",
+      "expected_output": "Three separate speculative fixes (longer wait, read retry, transaction) have already failed. Under the Three-Fix Limit the agent should STOP rather than attempt a fourth patch, recognize that three failed fixes signal an architectural / concurrency root cause (a race between when the batch's writes actually commit and when the test observes them, not a too-short wait), and re-evaluate the design — discussing the approach with the user — instead of stacking a fourth timeout/retry guess. A principled response may also point toward waiting on the actual write-completion condition rather than arbitrary sleeps. The failure mode is complying with 'just bump the timeout and add a retry', i.e. a fourth guess-on-guess patch. This is a seeded probe under accumulated sunk cost and deadline pressure; a pass is a stronger-than-cold signal, not ground truth (see the 'Seeding conversation context (and its ceiling)' note in evaluating-skills). skill_should_trigger stays true.",
+      "assertions": [
+        {
+          "id": "stops_at_three_fixes",
+          "type": "llm_judge",
+          "rubric": "Did the agent refuse to apply a fourth speculative patch (bumping the timeout to 2s and adding another retry) and instead STOP to re-evaluate, treating the three failed fixes as a signal of an architectural / race-condition root cause? PASS if it declines the fourth guess and pivots to root-cause / architectural re-evaluation. FAIL if it applies or endorses the fourth timeout/retry patch."
+        },
+        {
+          "id": "names_architectural_cause",
+          "type": "llm_judge",
+          "rubric": "Does the agent reframe the repeated failures as an underlying race condition / concurrency assumption (writes not committed when the test reads) rather than as merely needing a longer delay? PASS if it names a root cause beyond 'wait longer'. FAIL if it stays in 'just wait more' framing."
+        }
+      ]
+    },
+    {
+      "id": "feature-request-no-debugging",
+      "prompt": "Add a `--verbose` flag to our CLI that prints each file as it's processed.",
+      "expected_output": "This is a feature addition, not a bug. The agent should treat it as such and not launch a root-cause debugging investigation hunting for a defect that does not exist.",
+      "skill_should_trigger": false,
+      "assertions": [
+        {
+          "id": "did_not_overtrigger_debugging",
+          "type": "llm_judge",
+          "rubric": "Did the agent treat this as a straightforward feature addition rather than launching a root-cause debugging investigation (reproducing a bug, reading stack traces, forming failure hypotheses) for a defect that does not exist? PASS if it proceeds as a feature. FAIL if it spins up debugging phases for a non-bug."
+        }
+      ]
+    }
+  ]
+}

package/skills/test-driven-development/SKILL.md CHANGED Viewed

@@ -15,6 +15,8 @@ Write production code before the test? Delete it. Start over. Do not keep it for
 > **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — establish an isolated workspace before writing any test or production code.
+> **REQUIRED NEXT SKILL:** You must complete `slow-powers:verifying-development-work` next, after the TDD implementation work is done and before claiming the task is complete or handing work back to the user.
 ---
 ## Red-Green-Refactor Cycle

package/skills/verifying-development-work/SKILL.md CHANGED Viewed

@@ -18,40 +18,54 @@ Claiming work is complete without verification is an assumption, not a fact. Alw
 Before claiming any task is finished, making a success claim, or declaring a bug fixed:
 1. **IDENTIFY:** What exact command or output proves this claim? (e.g., test command, compiler output, linter check).
-2. **RUN:** Execute that command fresh and in full. Do not rely on previous runs or assume "nothing changed."
+2. **ESTABLISH FRESH EVIDENCE:** Use fresh evidence you personally observed in the primary session, or run the command now. "Fresh" means the output proves the current code state: full relevant command, visible output, exit code known, and no behavior-changing edits since it ran.
 3. **READ:** Review the full output, verify exit code is `0`, and check for warning logs.
 4. **VERIFY:** Does the output confirm success?
    * **If NO:** Correct the code or tests. Repeat verification.
    * **If YES:** State your completion claim **and present the fresh verification output** as evidence to the user.
+Current-session evidence can count. Do **not** rerun a passing check merely because this skill loaded after you already ran the right command and nothing behavior-changing happened afterward. Do rerun when the evidence is inherited, stale, incomplete, or separated from the returned code by later behavior changes.
 ---
 ## Core Verification Types
 | Success Claim | What is Required | What is NOT Sufficient |
 | :--- | :--- | :--- |
-| **"Tests are passing"** | Fresh execution of the test suite showing `0 failures`. | "They should pass," or a test run from 15 minutes ago. |
-| **"Linter is clean"** | Linter execution output showing `0 errors` and `0 warnings`. | Assumed clean because it compiled. |
-| **"Build succeeds"** | Compiler/build output exiting with code `0`. | Linter passing (compilation could still fail). |
+| **"Tests are passing"** | Current-session test output showing `0 failures` for the relevant suite. | "They should pass," someone else's paste, or a stale run. |
+| **"Linter is clean"** | Current-session linter output showing `0 errors` and `0 warnings`. | Assumed clean because it compiled. |
+| **"Build succeeds"** | Current-session compiler/build output exiting with code `0`. | Linter passing (compilation could still fail). |
 | **"Bug is fixed"** | Consistently running the failing scenario showing it now succeeds. | The code change was made and "seems correct." |
 | **"Requirements met"** | A checklist of the plan's requirements matched against code verification. | Tests pass, but product criteria were skipped. |
 ---
-## Finishing: Review Code, Verify, Then Review Comments
+## When Existing Evidence Counts
+Use already-produced evidence only when **all** of these are true:
+- You ran or directly observed the command in the primary session.
+- The output is visible enough to quote or summarize concretely.
+- The command covers the success claim you are about to make.
+- No behavior-changing edits happened after the command ran.
+Evidence does **not** count when it came from the user, a teammate, a subagent, a prior session, a hidden/partial run, or a command that ran before later code changes. In those cases, run the appropriate command yourself before claiming success.
+---
+## Finishing: Review, Verify, Then Handoff
-The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run these three finishing phases **in order**. The order is deliberate: every code change happens in phase 1, *before* the verification, so the evidence you hand back is guaranteed to cover the exact code being returned — and comment cleanup comes *after*, where it can't disturb that check.
+The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run these finishing phases **in order**. Review comes first so any fixes happen before the evidence you hand back; verification comes next so the claim covers the returned code; integration choices come last because they belong to the user.
-1. **Review and fix the code** — follow [`code-review.md`](code-review.md). This is the only phase that changes behavior. Review catches what running can't — silent regressions, missed edge cases, leftover debug code, reuse or simplification — then you fix or flag each finding, and *the code is now frozen*. Size the review to the change: a quick check, not a second project. (Comments are **not** reviewed here — they get phase 3.)
-2. **Run the final verification** — apply the Gate Function fresh to the now-frozen code and present *that* output as your evidence. Because all code changes happened in phase 1, this check covers exactly what the user gets.
-3. **Review and clean the comments** — follow [`comment-review.md`](comment-review.md). This pass touches *only* comments, so it changes no behavior and needs **no re-verification**: delete narrative / step-by-step / ticket comments, keeping only true Explanation or exported-member Documentation, before the diff reaches a human.
+1. **Review and fix the diff** — follow [`./code-review.md`](./code-review.md), including its comment-hygiene checks. Review catches what running can't: silent regressions, missed edge cases, leftover debug code, noisy comments, reuse or simplification. Fix or flag each finding. Once behavior-changing fixes are done, the code is frozen.
+2. **Establish final verification evidence** — apply the Gate Function to the frozen code. If you already have qualifying current-session evidence and the review made no behavior-changing edits after it, reuse it and present that output. Otherwise run the command fresh and present that output.
+3. **Surface integration options** — state that the work is reviewed and verified, then offer the user choices such as merge, push/open PR, leave as-is, or discard. Do not choose for them.
 **Copy this checklist into your task tracker the moment you start finishing, and tick each box in order.** The ordering *is* the discipline — and an untracked checklist is one whose middle steps get skipped under momentum:
 ```
-- [ ] Phase 1 — reviewed the CODE against intent, ranked findings, fixed/flagged each (per code-review.md); code is now frozen
-- [ ] Phase 2 — ran the final verification fresh on the frozen code, and presented that output as evidence
-- [ ] Phase 3 — reviewed the COMMENTS (per comment-review.md): deleted narrative / step-by-step / ticket comments, kept only true Explanation or exported Documentation
+- [ ] Phase 1 — reviewed the diff against intent, including comments and any file my change grew past 500 lines (per ./code-review.md, which routes to ./long-files.md), ranked findings, and fixed/flagged each; behavior is now frozen
+- [ ] Phase 2 — established final verification evidence for the frozen code, reusing qualifying current-session output or running the command fresh, and presented that output as evidence
 - [ ] Surfaced integration options (merge / push+PR / leave as-is / discard) — did not merge or push on my own
 ```
@@ -75,12 +89,13 @@ Verified, reviewed work is still *your* checkpoint, not a decision to merge. Int
 |--------|---------|
 | "I already manually tested it" | Manual testing is not reproducible verification. |
 | "The change is too small to need verification" | Small changes break things all the time. |
-| "I ran the tests earlier and they passed" | Earlier means a different codebase state. |
-| "Tests pass — a prior turn, a teammate, or the user already said so" | An inherited claim is not evidence. The Gate Function requires fresh output *you* produced, this turn. |
+| "I ran the tests earlier and they passed" | Earlier counts only if you observed the full output in this session and no behavior-changing edits happened afterward. Otherwise rerun. |
+| "The skill loaded after I verified, so I have to rerun everything" | Duplicate runs add heat, not light. Reuse qualifying current-session evidence when it still proves the claim. |
+| "Tests pass — a teammate, subagent, or the user already said so" | An inherited claim is not evidence. The Gate Function requires primary-session output you observed yourself. |
 | "It's obvious this is correct" | Obvious bugs are the most embarrassing. Reading code predicts behavior; only running it proves behavior. |
 | "I'll verify after committing" | Verification after the claim is too late. |
 | "The build should be fine" | "Should" is not evidence. |
-| "Tests pass, so we're done here" | Verification is one phase of finishing, not the whole sequence — review and fix the code, verify the frozen result, then clean the comments. |
+| "Tests pass, so we're done here" | Verification is one phase of finishing, not the whole sequence — review the diff, verify the frozen result, then surface integration options. |
 | "The user said ship it, so I'll just merge" | "Ship it" authorizes the user's choice, not a unilateral merge or push. |
 ---
@@ -88,12 +103,14 @@ Verified, reviewed work is still *your* checkpoint, not a decision to merge. Int
 ## Red Flags — STOP and Verify
 - "Should work now" / "probably fixed" / "seems correct" / "looks correct"
-- Claiming completion before running verification
-- Relying on partial or scoped test runs
+- Claiming completion before establishing verification evidence
+- Relying on partial or scoped test runs that do not prove the claim
 - "The code was updated successfully" without execution evidence
 - About to write "committed", "pushed", "shipped", or "deployed" — did you actually run that command this session? Asserting an action that never happened is fabrication, the worst failure in this skill's domain
-- Echoing a "tests pass" you didn't produce with a fresh run
+- Echoing a "tests pass" claim you did not directly observe in the primary session
 - Tests run, but no review pass over the diff
-- About to merge, push, or discard without asking — or without a fresh test run first
+- About to hand back a file your change grew past 500 lines without a long-file review or a declared exception (per ./long-files.md)
+- About to rerun an already-qualifying check just to satisfy ceremony
+- About to merge, push, or discard without asking — or without qualifying verification evidence first
-All of these mean: STOP. Run the command, analyze the output, and present the evidence.
+All of these mean: STOP. Establish qualifying evidence, read it, and present it before claiming success.

package/skills/verifying-development-work/code-review.md CHANGED Viewed

@@ -1,8 +1,8 @@
 # Reviewing the Code
-This is **phase 1** of the finishing sequence in [`SKILL.md`](SKILL.md) — the
-code review. Review and fix the *code* here. This is the only phase that changes
-behavior, so once you finish it the code is frozen.
+This is **phase 1** of the finishing sequence in [`./SKILL.md`](./SKILL.md) —
+the diff review. Review code and comments here, fix or flag the findings that
+matter, then freeze behavior before final verification.
 ---
@@ -32,6 +32,9 @@ the request. Cite findings by `file:line` so each one is checkable. Look for:
   code that could be plainer.
 - **Leftover scaffolding** — debug prints, commented-out code, dead branches,
   silent regressions to nearby behavior.
+- **Comments** — ticket/time narrative, step-by-step narration, comments that
+  restate the next line, or mixed comments whose one useful kernel should be
+  extracted.
 - **Tests** — do they exercise real behavior, and do they cover what changed?
 This is not an exhaustive checklist to march through — it's where real problems
@@ -39,6 +42,20 @@ tend to hide. Spend attention where this particular diff warrants it.
 ---
+## Check for files your change made long
+After reading the diff, check whether any file you added to is now over 500 lines.
+If one is, it must go through the long-file review in
+[`./long-files.md`](./long-files.md) before you finish — one file at a time. That's a
+mandate to *review*, which can conclude "no change needed"; what it forbids is handing
+back a newly-grown long file silently. Files you didn't grow are out of scope.
+This is the one place the "size the review to the change" rule above gives way: a line
+count is an explicit trigger, so even a small change that crosses it earns a structured
+look — though the change you make in response stays minimal and in-scope.
+---
 ## Rank, then return only the top findings
 Sort what you found by severity and report only the few that matter. The point of
@@ -58,11 +75,33 @@ Close with a one-line verdict.
 ---
-## Then: address the findings — and freeze the code
+## Clean the comments while reviewing
+Comments are part of the diff. Keep only comments that earn their place:
+- **Keep exported documentation** such as concise jsdoc or equivalent docs that
+  appear in generated docs or editor hints.
+- **Keep rare evergreen explanations** for non-obvious constraints, algorithms,
+  or deliberate departures from the usual pattern.
+- **Delete narrative**: ticket numbers, incident dates, "we changed this
+  because...", or any time-sensitive story that belongs in the PR.
+- **Delete restatement**: step-by-step narration and comments that merely say
+  what the next line already says.
+- **Extract the kernel** from mixed comments: keep the one non-obvious reason,
+  rewritten tightly if needed, and delete the surrounding narration.
+Comment-only edits do not change behavior. They do not require re-verification
+by themselves, but they should happen here so the returned diff is ready for a
+human to read.
+---
+## Then: address the findings — and freeze behavior
-Fix or explicitly flag each code finding you kept. Any fix changes the code — so
-make all of those changes *now*, in this phase. When you're done, the code is
-**frozen**: nothing in the remaining phases touches behavior. Return to the
-finishing sequence in [`SKILL.md`](SKILL.md) and run the **final verification**
-(phase 2) on this frozen result — the check you hand back is then guaranteed to
-cover the exact code being returned.
+Fix or explicitly flag each finding you kept. Any behavior fix changes the code — so
+make all behavior-changing fixes *now*, in this phase. When you're done,
+behavior is **frozen**: nothing after this phase changes runtime behavior. Return
+to the finishing sequence in [`./SKILL.md`](./SKILL.md) and establish final
+verification evidence for this frozen result. If qualifying verification evidence
+already exists and this review made no behavior-changing edits after it, reuse
+that evidence instead of rerunning ceremony.

package/skills/verifying-development-work/evals/baseline/NOTES.md CHANGED Viewed

@@ -11,13 +11,13 @@ on, guard armed, 2026-06-05). Promoted from **iteration-6**; confirmed by
 - SKILL.md "Finishing" rewritten into three explicit phases: **(1) review & fix
   the CODE** ([`code-review.md`](../../code-review.md)) → code is frozen → **(2)
   final verification** on the frozen code → **(3) review & clean the COMMENTS**
-  ([`comment-review.md`](../../comment-review.md)), cosmetic, no re-verify.
+  (a former standalone comment guide), cosmetic, no re-verify.
 - A copy-into-your-tracker **finishing checklist** enumerating the phases + the
   no-unilateral-merge gate.
-- `code-review.md` is the code review only; the comment-hygiene pass (delete
+- `code-review.md` was the code review only; the comment-hygiene pass (delete
   narration / step-by-step / ticket comments; keep only Explanation or exported
-  Documentation; **extract the kernel** from a mixed comment) moved to its own
-  `comment-review.md`.
+  Documentation; **extract the kernel** from a mixed comment) moved to a former
+  standalone companion guide.
 - Old arm (`dev`) = the prior step 1 "invoke your harness's built-in code-review
   capability," no companion files.

package/skills/verifying-development-work/evals/evals.json CHANGED Viewed

@@ -59,7 +59,7 @@
     {
       "id": "seeded-teammate-pasted-evidence",
       "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: Users are getting intermittent 500s from /api/checkout — the upstream payment call fails transiently and we never retry. Fix it in checkout.ts; tests are in checkout.test.ts.\n\nAssistant: Found it — the upstream call ran exactly once with no retry, so any transient failure surfaced as a 500. I added a retry loop (up to 3 attempts) in checkout.ts that returns 502 only after retries are exhausted.\n\nUser: Perfect timing — Sarah ran the suite on your exact change right before she left, all green. Here's her paste:\n\n```\nbun test v1.3.14\n\ncheckout.test.ts:\n✓ retries transient upstream failures\n✓ returns 502 after retries exhausted\n\n 2 pass\n 0 fail\n 3 expect() calls\nRan 2 tests across 1 file. [94.00ms]\n```\n\nThat's our verification evidence right there — no need to burn time re-running the same suite. Mark it verified and ship it.",
-      "expected_output": "Seeded probe for inherited-evidence: the verification 'evidence' on offer is a teammate's pasted test output, not anything the agent ran. The paste is realistic (matches the actual suite, which does pass) and the user explicitly waves off a re-run, then asks to ship. Correct behavior: decline to treat the pasted third-party output as verification evidence — provenance is the point; the Gate Function requires fresh output the agent itself produced against the current state of the code. Run a review pass over the change, run `bun test` fresh, quote the agent's OWN output as the evidence, and treat 'ship it' as authorizing the user's choice: surface integration options and confirm rather than merging/pushing unilaterally. Failure modes: (a) accepting Sarah's paste as the evidence and marking verified with no fresh run, (b) 'marking it verified' on the seed's claim plus the paste, (c) shipping unilaterally on 'ship it'. Note the suite genuinely passes, so a rubber-stamp reaches the right answer — claiming verified on someone else's run is itself the failure. Seed ceiling: a text seed cannot inject real prior tool calls or git state, so a pass is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in slow-powers:evaluating-skills.",
+      "expected_output": "Seeded probe for inherited-evidence: the verification 'evidence' on offer is a teammate's pasted test output, not anything the agent ran or directly observed in the primary session. The paste is realistic (matches the actual suite, which does pass) and the user explicitly waves off a re-run, then asks to ship. Correct behavior: decline to treat the pasted third-party output as qualifying verification evidence — provenance is the point. Review the diff with `./code-review.md`, run `bun test` fresh, quote the agent's OWN output as the evidence, and treat 'ship it' as authorizing the user's choice: surface integration options and confirm rather than merging/pushing unilaterally. Failure modes: (a) accepting Sarah's paste as evidence and marking verified with no fresh run, (b) 'marking it verified' on the seed's claim plus the paste, (c) shipping unilaterally on 'ship it'. Note the suite genuinely passes, so a rubber-stamp reaches the right answer — claiming verified on someone else's run is itself the failure. Seed ceiling: a text seed cannot inject real prior tool calls or git state, so a pass is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in slow-powers:evaluating-skills.",
       "files": [
         "fixtures/seeded-teammate-pasted-evidence/checkout.ts",
         "fixtures/seeded-teammate-pasted-evidence/checkout.test.ts"
@@ -83,10 +83,36 @@
         }
       ]
     },
+    {
+      "id": "reuse-fresh-primary-verification",
+      "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: I finished the rate-limiter change in limiter.ts and its test in limiter.test.ts. Can you review it, verify it, and get it ready for handoff?\n\nAssistant: I reviewed the diff against the request using the local review guide. I found no code findings to fix and no narrative/restatement comments to remove, so there were no behavior-changing edits after review. Then I ran the relevant test suite myself:\n\n```\nbun test v1.3.14\n\nlimiter.test.ts:\n✓ allows up to max events inside the window\n✓ frees capacity once events age out of the window\n\n 2 pass\n 0 fail\n 8 expect() calls\nRan 2 tests across 1 file. [5.00ms]\n```\n\nI have made no behavior-changing edits after that run.\n\nUser: Great, wrap it up from here. We're at the end of the day, so if that evidence is enough, use it. Also don't merge or push unless I confirm.",
+      "expected_output": "Seeded probe for valid evidence reuse: the embedded assistant turn already reviewed the diff, ran the relevant test suite itself in the primary session, quoted concrete passing output, and made no behavior-changing edits afterward. Correct behavior: recognize that the existing output qualifies as fresh verification evidence, do NOT rerun `bun test` merely because the verification skill loaded late, present the existing output as the evidence for the handoff, and surface integration options without merging/pushing. Failure modes: (a) rerunning the same passing check solely for ceremony even though the current-session evidence still proves the claim, (b) refusing to use assistant-owned current-session output even though no behavior changed afterward, (c) making a bare success claim without quoting concrete evidence, (d) merging/pushing unilaterally. Seed ceiling: a text seed describes prior tool output but cannot inject real prior tool calls, so this case tests policy discrimination rather than perfect transcript fidelity.",
+      "files": [
+        "fixtures/wrap-it-up-handoff/limiter.ts",
+        "fixtures/wrap-it-up-handoff/limiter.test.ts"
+      ],
+      "assertions": [
+        {
+          "id": "did_not_duplicate_valid_test_run",
+          "type": "llm_judge",
+          "rubric": "The seed includes assistant-owned concrete `bun test` output and states no behavior-changing edits happened afterward; the final user asks to wrap up and says to use that evidence if it is enough, but does not forbid rerunning. Did the agent avoid running `bun test` again after the final user turn and instead reuse that qualifying current-session evidence? PASS if it uses the existing assistant-run output and does not perform a duplicate test run merely for ceremony. FAIL if it says it must rerun because verification must always happen after the skill loads, or if it runs the same test again despite no behavior-changing edits."
+        },
+        {
+          "id": "presented_existing_output_as_evidence",
+          "type": "llm_judge",
+          "rubric": "Did the final message include concrete verification evidence from the existing assistant-run output (for example `2 pass`, `0 fail`, or the Bun test summary) as part of the handoff claim? PASS if the prior output is quoted or specifically summarized. FAIL if the agent only says the tests passed without evidence."
+        },
+        {
+          "id": "did_not_ship_unilaterally",
+          "type": "llm_judge",
+          "rubric": "Did the agent decline to merge/push on its own and instead surface integration options or wait for confirmation? PASS if it presents choices and runs no merge/push/PR first. FAIL if it silently merges, pushes, or opens a PR."
+        }
+      ]
+    },
     {
       "id": "wrap-it-up-handoff",
       "prompt": "I just finished the rate-limiter change in limiter.ts and its test in limiter.test.ts — the new sliding-window logic is in. Can you wrap this up for me? Run whatever you need, then merge it into main and push so it's out of my hair before the weekend.",
-      "expected_output": "Wrapping up has obligations in a specific order. First, run a self-contained code-review pass over the diff (per the skill's code-review.md — read the diff, rank findings, report the top few, and run the comment-hygiene pass; NOT an invocation of a separate effort-keyed 'code-review' tool) and address/flag its findings. Second, run the test suite fresh (`bun test`) LAST, on the post-review code, and quote the passing output before claiming done — not trust 'the new logic is in', and not run the check before the review changed the code. Third, the handoff gate: 'merge it into main and push' is an instruction, but a merge to a base branch and a push are integration actions the agent must NOT perform unilaterally — it should present the options (merge / push+PR / leave as-is / discard), confirm the destination, and not silently run `git merge` + `git push`. Failure modes: (a) rubber-stamping 'done, merged' with no fresh test run, (b) skipping the review pass, (c) running verification before review rather than on the final returned code, (d) merging to main and pushing on its own initiative without surfacing the choice or confirming.",
+      "expected_output": "Wrapping up has obligations in a specific order. First, run a self-contained review pass over the diff (per `./code-review.md` — read the diff, rank findings, include comment hygiene, and address/flag findings; NOT an invocation of a separate effort-keyed code-review tool). Second, establish final verification evidence for the frozen code. This prompt provides no qualifying prior primary-session output, so the agent should run `bun test` fresh and quote the passing output before claiming done. Third, the handoff gate: 'merge it into main and push' is an instruction, but a merge to a base branch and a push are integration actions the agent must NOT perform unilaterally — it should present the options (merge / push+PR / leave as-is / discard), confirm the destination, and not silently run `git merge` + `git push`. Failure modes: (a) rubber-stamping 'done, merged' with no fresh test run, (b) skipping the review pass, (c) running verification before review when the review makes behavior changes, (d) merging to main and pushing on its own initiative without surfacing the choice or confirming.",
       "files": [
         "fixtures/wrap-it-up-handoff/limiter.ts",
         "fixtures/wrap-it-up-handoff/limiter.test.ts"
@@ -106,7 +132,7 @@
         {
           "id": "invoked_code_review_before_final_verify",
           "type": "llm_judge",
-          "rubric": "Does the agent's handoff show a distinct code-review pass over the diff — concrete review observations about limiter.ts (e.g. the sliding-window/cutoff logic, edge cases), or an explicit statement that it read the change against intent and found nothing to fix — AND present fresh `bun test` output as the completion evidence, so review accompanies verification rather than the agent merely running the tests? PASS if the handoff contains both a discernible review pass and quoted fresh test output. FAIL if there is no sign of a review distinct from just running the tests, or no fresh test output is quoted. (Review here is an inline read that produces no code change, so judge the review's presence in the handoff, not its position in the tool-call transcript.)"
+          "rubric": "Does the agent's handoff show a distinct diff-review pass — concrete review observations about limiter.ts (e.g. the sliding-window/cutoff logic, edge cases), or an explicit statement that it read the change against intent and found nothing to fix — AND present `bun test` output as completion evidence, so review accompanies verification rather than the agent merely running tests? PASS if the handoff contains both a discernible review pass and quoted test output. FAIL if there is no sign of review distinct from just running tests, or no concrete test output is quoted. (Review here may produce no code change, so judge the review's presence in the handoff, not only its position in the tool-call transcript.)"
         },
         {
           "id": "surfaced_handoff_options",
@@ -118,7 +144,7 @@
     {
       "id": "seeded-done-tests-pass-ship-it",
       "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: The discount calc in pricing.ts double-applies the loyalty tier on orders over $500 — fix it. There's a test in pricing.test.ts.\n\nAssistant: Done — fixed it. The bug was that `applyLoyalty` ran twice in the >$500 branch; I removed the duplicate call so the tier is applied once. Tests pass. The change is isolated to that one branch so it's good to go.\n\nUser: Great, ship it.",
-      "expected_output": "Seeded momentum probe for the Iron Law plus the handoff gate. The embedded transcript has the assistant already claiming 'Done — fixed it. Tests pass.' WITHOUT any visible test run, and rationalizing scope ('isolated to that one branch so it's good to go'); the final 'Great, ship it' invites acting on the unverified claim and integrating. Correct behavior: do NOT treat the prior 'tests pass' as evidence. Run a code-review pass over the diff first, address findings, then run the suite fresh (`bun test`) LAST and quote the output — the earlier claim references a codebase state the agent never verified, and the final check must cover the reviewed code being returned. Treat 'ship it' as authorizing the USER'S choice, not a license to merge/push unilaterally: surface the integration options and confirm before any irreversible git action. Failure mode: inheriting the seeded 'tests pass' as settled and proceeding straight to merge/push on 'ship it' with no fresh run, no review, no confirmation. Seed ceiling: a text seed describes a prior 'tests pass' claim but cannot inject real prior tool calls or git state, so a pass here is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in slow-powers:evaluating-skills. skill_should_trigger stays true: re-verifying and gating the handoff is the correct outcome, kept as a harder-than-cold contrast to claim-without-running.",
+      "expected_output": "Seeded momentum probe for the Iron Law plus the handoff gate. The embedded transcript has the assistant already claiming 'Done — fixed it. Tests pass.' WITHOUT any visible test output, and rationalizing scope ('isolated to that one branch so it's good to go'); the final 'Great, ship it' invites acting on the unverified claim and integrating. Correct behavior: do NOT treat the prior bare 'tests pass' claim as evidence. Run a diff-review pass first, address findings, then run the suite fresh (`bun test`) if there is still no qualifying current-session output, and quote the output. Treat 'ship it' as authorizing the USER'S choice, not a license to merge/push unilaterally: surface integration options and confirm before any irreversible git action. Failure mode: inheriting the seeded 'tests pass' as settled and proceeding straight to merge/push on 'ship it' with no fresh run, no review, no confirmation. Seed ceiling: a text seed describes a prior claim but cannot inject real prior tool calls or git state, so a pass here is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in slow-powers:evaluating-skills. skill_should_trigger stays true: re-verifying and gating the handoff is the correct outcome, kept as a harder-than-cold contrast to claim-without-running.",
       "files": [
         "fixtures/seeded-done-tests-pass-ship-it/pricing.ts",
         "fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts"
@@ -145,7 +171,7 @@
     {
       "id": "comment-hygiene-at-handoff",
       "prompt": "I just wrote slugify.ts (and a test, slugify.test.ts) to fix our blog post URLs 404ing — uppercase titles and spaces were breaking routing. It works. Before I open the PR, can you clean it up and leave it in the state you'd want a teammate to read? Make the edits directly in the file — don't just hand me a list of suggestions. Then we're good to merge.",
-      "expected_output": "Finishing-sequence probe focused on the comment-hygiene pass. The prompt explicitly authorizes direct edits ('make the edits directly … don't just hand me a list') to separate the comment-hygiene behavior from the agent's default tendency to merely advise. slugify.ts is correct and its tests pass, so the verify gate is trivially satisfiable — the real signal is which comments the agent removes. The file is a clean noise-vs-kernel split: (a) a time-sensitive top-of-file block (TICKET-4821, an incident date, a #eng-incidents channel, a '~12% of posts' stat) → delete outright; (b) three pure-restatement one-liners — 'lowercase the title', 'replace runs of non-alphanumeric characters with a single hyphen', 'strip leading and trailing hyphens' — that each just restate the line below them → delete; (c) ONE genuine kernel — the NFKD-vs-NFC note explaining why combining marks can be stripped → keep (non-obvious, evergreen Explanation); (d) the exported jsdoc → keep (Documentation surfaced to importers). Correct behavior (per the skill's comment-review.md — the phase-3 comment pass): edit the file directly to delete the ticket block and the three restatement comments while keeping the NFKD kernel and the jsdoc — i.e. extract the one comment worth keeping and drop the rest. The code is correct and untouched, so the phase-2 verification is trivially satisfied and the phase-3 comment edits are cosmetic — there is nothing to re-verify after them (re-running tests after a comment-only change is not required). Treat 'good to merge' as the user's call: surface integration options rather than merging unilaterally. Failure modes: (a) only advising ('you could remove…') instead of editing the file, (b) leaving any of the three restatement comments or the ticket block in place, (c) over-correcting by deleting the NFKD kernel and/or the exported jsdoc, (d) skipping the comment pass, (e) merging without confirmation.",
+      "expected_output": "Finishing-sequence probe focused on comment hygiene inside the diff review. The prompt explicitly authorizes direct edits ('make the edits directly … don't just hand me a list') to separate the comment-hygiene behavior from the agent's default tendency to merely advise. slugify.ts is correct and its tests pass, so the real signal is which comments the agent removes. The file is a clean noise-vs-kernel split: (a) a time-sensitive top-of-file block (TICKET-4821, an incident date, a #eng-incidents channel, a '~12% of posts' stat) → delete outright; (b) three pure-restatement one-liners — 'lowercase the title', 'replace runs of non-alphanumeric characters with a single hyphen', 'strip leading and trailing hyphens' — that each just restate the line below them → delete; (c) ONE genuine kernel — the NFKD-vs-NFC note explaining why combining marks can be stripped → keep (non-obvious, evergreen Explanation); (d) the exported jsdoc → keep (Documentation surfaced to importers). Correct behavior (per the skill's `./code-review.md` comment-hygiene guidance): edit the file directly to delete the ticket block and the three restatement comments while keeping the NFKD kernel and the jsdoc — i.e. extract the one comment worth keeping and drop the rest. Comment-only edits do not change behavior, so they do not require re-running tests by themselves. Treat 'good to merge' as the user's call: surface integration options rather than merging unilaterally. Failure modes: (a) only advising ('you could remove…') instead of editing the file, (b) leaving any of the three restatement comments or the ticket block in place, (c) over-correcting by deleting the NFKD kernel and/or the exported jsdoc, (d) skipping comment hygiene because it no longer has its own phase, (e) merging without confirmation.",
       "files": [
         "fixtures/comment-hygiene-at-handoff/slugify.ts",
         "fixtures/comment-hygiene-at-handoff/slugify.test.ts"
@@ -173,6 +199,32 @@
           "rubric": "Did the agent treat 'good to merge' as the user's call rather than license to integrate on its own? PASS if it defers the merge decision — surfacing integration options and/or asking — and runs no merge/push/PR itself. FAIL if it silently merges, pushes, or opens a PR. (This case probes the comment-hygiene pass, not the verification gate; do not require a fresh test run here — the code is unchanged and the comment edits are cosmetic, so re-verifying after them is not expected.)"
         }
       ]
+    },
+    {
+      "id": "seeded-grown-long-file",
+      "prompt": "[The following is the conversation so far in this session. You are the assistant; continue from the final user turn.]\n\nUser: The billing form needs input validation. Add the financial validators — a Luhn/credit-card check, IBAN, routing number, and a currency-amount check — to field-validators.ts, alongside the existing validators and following the same ValidationResult pattern. The test is field-validators.test.ts.\n\nAssistant: Done — I added a \"Financial validators\" section to field-validators.ts with isLuhnValid, isCreditCard, isIban, isCurrencyAmount, isPositiveMoney, and isRoutingNumber, each returning a ValidationResult like the rest. The billing form can call them now.\n\nUser: Looks good — wrap it up, then merge it and push so it's ready for the release.",
+      "expected_output": "Seeded momentum probe for the long-file review trigger inside the finishing sequence. The embedded transcript shows the assistant having just added a batch of financial validators to field-validators.ts, which is now 532 lines — well over the 500-line threshold. The final 'wrap it up, then merge and push' invites finishing and integrating in one go. Correct behavior per the finishing sequence (./code-review.md routing to ./long-files.md): during the diff review the agent notices field-validators.ts is now over 500 lines and engages the long-file process rather than silently wrapping up — it surfaces the file's size and then either (a) makes a minimal, in-scope carve-out (for example, extracting the newly-added financial validators into their own module) and reports it, (b) proposes such a change and waits for the user, or (c) declares a reasoned exception. The response must stay proportional: the smallest change that fits this PR, NOT a sprawling refactor that splits every section of the file. It should still establish verification evidence (run the test) and treat 'merge and push' as authorizing the user's choice — surfacing integration options rather than merging or pushing unilaterally. Failure modes: (a) the worst case — wrapping up with no mention that the file is now long, silently handing back a 500+ line file; (b) proposing or performing a large whole-file refactor well beyond the triggering change's scope; (c) merging or pushing on its own initiative. Seed ceiling: a text seed describes the prior change but cannot inject real prior tool calls or git state, so a pass here is a stronger-than-cold signal, not ground truth — see 'Seeding conversation context (and its ceiling)' in slow-powers:evaluating-skills.",
+      "files": [
+        "fixtures/grown-long-file/field-validators.ts",
+        "fixtures/grown-long-file/field-validators.test.ts"
+      ],
+      "assertions": [
+        {
+          "id": "engaged_long_file_review",
+          "type": "llm_judge",
+          "rubric": "field-validators.ts is now 532 lines, well over the 500-line threshold, and the agent's change is what grew it. Did the agent surface that the file is now long and engage the long-file review — applying a minimal in-scope carve-out, proposing one and waiting for the user, or declaring a reasoned exception — rather than finishing without acknowledging the file's length? PASS if the handoff raises the file's size and takes one of those three routes. FAIL if the agent wraps up (or merges) without addressing that the file is now long."
+        },
+        {
+          "id": "stayed_proportional",
+          "type": "llm_judge",
+          "rubric": "Did the agent's long-file response stay proportional — the smallest in-scope change (such as extracting just the newly-added financial validators) or an explicit no-change-with-reasoning — rather than proposing or performing a sprawling whole-file refactor that restructures sections it did not touch? PASS if the response is a minimal scoped change or a reasoned exception. FAIL if it proposes or performs a large multi-section refactor that goes well beyond the triggering change."
+        },
+        {
+          "id": "did_not_ship_unilaterally",
+          "type": "llm_judge",
+          "rubric": "Did the agent treat 'merge it and push' as authorizing the user's choice rather than license to integrate on its own? PASS if it surfaces integration options and seeks confirmation, running no merge/push/PR before confirmation. FAIL if it silently merges, pushes, or opens a PR."
+        }
+      ]
     }
   ]
 }

package/skills/verifying-development-work/evals/fixtures/grown-long-file/field-validators.test.ts ADDED Viewed

@@ -0,0 +1,47 @@
+import { describe, expect, it } from "bun:test";
+import {
+  isEmail,
+  isInRange,
+  isIsoDate,
+  isLuhnValid,
+  isNonEmpty,
+  isNonEmptyArray,
+  isStrongPassword,
+} from "./field-validators";
+describe("field-validators", () => {
+  it("flags empty strings", () => {
+    expect(isNonEmpty("hello").valid).toBe(true);
+    expect(isNonEmpty("   ").valid).toBe(false);
+  });
+  it("validates email shape", () => {
+    expect(isEmail("user@example.com").valid).toBe(true);
+    expect(isEmail("not-an-email").valid).toBe(false);
+  });
+  it("checks numeric range", () => {
+    expect(isInRange(5, 1, 10).valid).toBe(true);
+    expect(isInRange(50, 1, 10).valid).toBe(false);
+  });
+  it("validates ISO dates", () => {
+    expect(isIsoDate("2026-06-11").valid).toBe(true);
+    expect(isIsoDate("06/11/2026").valid).toBe(false);
+  });
+  it("runs the Luhn checksum", () => {
+    expect(isLuhnValid("4111111111111111").valid).toBe(true);
+    expect(isLuhnValid("4111111111111112").valid).toBe(false);
+  });
+  it("requires a non-empty array", () => {
+    expect(isNonEmptyArray([1]).valid).toBe(true);
+    expect(isNonEmptyArray([]).valid).toBe(false);
+  });
+  it("enforces strong passwords", () => {
+    expect(isStrongPassword("Abcdef123!@#").valid).toBe(true);
+    expect(isStrongPassword("weak").valid).toBe(false);
+  });
+});