@slowdini/slow-powers-opencode 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +14 -5
  2. package/package.json +1 -1
  3. package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
  4. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  5. package/skills/evaluating-skills/SKILL.md +3 -3
  6. package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
  7. package/skills/evaluating-skills/pressure-scenarios.md +1 -1
  8. package/skills/hardening-plans/SKILL.md +1 -1
  9. package/skills/systematic-debugging/SKILL.md +4 -0
  10. package/skills/test-driven-development/SKILL.md +2 -0
  11. package/skills/verifying-development-work/SKILL.md +88 -0
  12. package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/BASELINE.md +6 -6
  13. package/skills/verifying-development-work/evals/baseline/NOTES.md +153 -0
  14. package/skills/verifying-development-work/evals/baseline/benchmark.json +53 -0
  15. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  16. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  17. package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/grading/build-implied-by-edit__with_skill.json +3 -3
  18. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  19. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  20. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  21. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +46 -0
  22. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +31 -0
  23. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +53 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +38 -0
  25. package/skills/verifying-development-work/evals/evals.json +146 -0
  26. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
  27. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
  28. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
  29. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
  30. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
  31. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
  32. package/skills/working-in-isolation/SKILL.md +2 -2
  33. package/skills/writing-skills/SKILL.md +2 -3
  34. package/skills/finishing-a-development-branch/SKILL.md +0 -96
  35. package/skills/finishing-a-development-branch/evals/evals.json +0 -41
  36. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
  37. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
  38. package/skills/verification-before-completion/SKILL.md +0 -65
  39. package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
  40. package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
  41. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  42. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  43. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  44. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  45. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  46. package/skills/verification-before-completion/evals/evals.json +0 -77
  47. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
  48. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
  49. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
  50. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
  51. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
package/README.md CHANGED
@@ -94,20 +94,29 @@ Slow-powers provides a set of highly focused, execution-level skills that ensure
94
94
  1. **`working-in-isolation`** — Establishes an isolated workspace so new work doesn't collide with existing or in-progress work, keeping protected branches like `main` clean.
95
95
  2. **`test-driven-development`** — Enforces a strict RED-GREEN-REFACTOR cycle, ensuring all production code is backed by failing test verification first.
96
96
  3. **`systematic-debugging`** — Guides the agent to locate the root cause of failures via scientific hypothesis testing, avoiding "guess-and-check" thrashing.
97
- 4. **`verification-before-completion`** — Requires running actual test/build commands and presenting concrete evidence before making any success claims.
98
- 5. **`finishing-a-development-branch`** — Manages local branch hygiene, runs final test verifications, and cleans up git worktrees.
99
- 6. **`writing-skills`** — Handles future custom skill authoring and updates.
97
+ 4. **`verifying-development-work`** — Requires running actual test/build commands and presenting concrete evidence before any success claim, with a final review pass over the change before work is handed back.
98
+ 5. **`writing-skills`** — Handles future custom skill authoring and updates.
100
99
 
101
100
  ## What's inside
102
101
 
103
- **Testing & Verification** — `test-driven-development`, `verification-before-completion`
102
+ **Testing & Verification** — `test-driven-development`, `verifying-development-work`
104
103
 
105
104
  **Debugging** — `systematic-debugging`
106
105
 
107
- **Workspace & Git Hygiene** — `working-in-isolation`, `finishing-a-development-branch`
106
+ **Workspace & Git Hygiene** — `working-in-isolation`
108
107
 
109
108
  **Meta & Extension** — `writing-skills`
110
109
 
110
+ ## Intended Workflows
111
+
112
+ The skills declare lightweight prerequisite / next-step gates so the agent knows the intended sequence. These gates **suggest** what comes before and after a skill once it is invoked; they do **not** restrict when any skill can be invoked. An agent may invoke `test-driven-development`, `verifying-development-work`, or any other skill at any point.
113
+
114
+ **Plan mode:** plan mode → `hardening-plans` → `working-in-isolation` → `test-driven-development` → `verifying-development-work`
115
+
116
+ **Debugging:** (`working-in-isolation`) → `systematic-debugging` → `verifying-development-work`
117
+
118
+ `hardening-plans` points to `test-driven-development` as its next step, and `test-driven-development` requires `working-in-isolation` first — so isolation is reached as TDD's prerequisite, producing the plan-mode order above.
119
+
111
120
  ## Philosophy
112
121
 
113
122
  - Test-Driven Development — write tests first, always
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@slowdini/slow-powers-opencode",
3
- "version": "0.1.5",
3
+ "version": "0.2.0",
4
4
  "description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
5
5
  "type": "module",
6
6
  "main": "./opencode/plugins/slow-powers.js",
@@ -4,7 +4,7 @@
4
4
  {
5
5
  "id": "audits-completed-session",
6
6
  "prompt": "We just wrapped a work session in another repo — the session notes are attached as session-summary.md. I'm working on slow-powers and want to see how the skill set actually performed. Run the post-session slow-powers usage audit on that session.",
7
- "expected_output": "The agent produces a structured audit report covering, under clear headings: which slow-powers skills were invoked and what triggered each (systematic-debugging, triggered by the rendering bug); which skills were considered then skipped, with the actual at-the-time rationalization quoted (test-driven-development, skipped because it was 'a two-line guard' with a 15-minute demo deadline); relevant skills never considered (verification-before-completion — the agent claimed the fix worked without running the test suite); an estimate of slow-powers's token/wall-time cost; and a net-usefulness verdict citing specific counterfactual moments. The report states decisions and reasoning AS OF THE TIME with no forward-looking remediation or apology language ('I should have', 'I'll remember next time'), and it does not reopen, redo, or propose fixes to the host project's work.",
7
+ "expected_output": "The agent produces a structured audit report covering, under clear headings: which slow-powers skills were invoked and what triggered each (systematic-debugging, triggered by the rendering bug); which skills were considered then skipped, with the actual at-the-time rationalization quoted (test-driven-development, skipped because it was 'a two-line guard' with a 15-minute demo deadline); relevant skills never considered (verifying-development-work — the agent claimed the fix worked without running the test suite); an estimate of slow-powers's token/wall-time cost; and a net-usefulness verdict citing specific counterfactual moments. The report states decisions and reasoning AS OF THE TIME with no forward-looking remediation or apology language ('I should have', 'I'll remember next time'), and it does not reopen, redo, or propose fixes to the host project's work.",
8
8
  "files": ["fixtures/audits-completed-session/session-summary.md"],
9
9
  "assertions": [
10
10
  {
@@ -32,7 +32,7 @@
32
32
  {
33
33
  "id": "audits-blindspot-session",
34
34
  "prompt": "Just finished a session over in the payments-gateway repo — notes are in session-summary.md. I'm working on slow-powers and want a read on how the skills did. Please run the post-session slow-powers usage audit on it.",
35
- "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and verification-before-completion (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
35
+ "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and verifying-development-work (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
36
36
  "files": ["fixtures/audits-blindspot-session/session-summary.md"],
37
37
  "assertions": [
38
38
  {
@@ -43,7 +43,7 @@
43
43
  {
44
44
  "id": "blindspot_in_never_considered",
45
45
  "type": "llm_judge",
46
- "rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or verification-before-completion (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
46
+ "rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or verifying-development-work (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
47
47
  },
48
48
  {
49
49
  "id": "no_remediation_language",
@@ -33,7 +33,7 @@ enough that I was confident in it." The user didn't push further.
33
33
  Notes on environment for this session:
34
34
  - The repo has a `bun test` suite (~12 seconds) with existing refund tests in `test/refunds.test.ts`.
35
35
  - slow-powers was active; the session-start bootstrap listing was present, including
36
- `test-driven-development`, `working-in-isolation`, and `verification-before-completion`.
36
+ `test-driven-development`, `working-in-isolation`, and `verifying-development-work`.
37
37
  - I did not at any point think about writing a test first, creating a branch/worktree, or running
38
38
  the suite — I went straight from the request to editing source on the current branch.
39
39
  - No git branch or worktree was created; edits were made on whatever branch was checked out.
@@ -66,7 +66,7 @@ A test case has three parts:
66
66
  - **files** (optional): fixture files the prompt references
67
67
  - **skill_should_trigger** (optional, default `true`): set `false` for a *negative* eval where correct behavior is the skill **not** firing (e.g. an over-trigger guard — a feature request that shouldn't launch a debugging investigation). Negative evals are excluded from the skill-invocation rate and its validity warning, so a correct non-invocation isn't mistaken for the skill failing to fire.
68
68
 
69
- Stored in `<skill>/evals/evals.json`. See `templates/evals.json.example` and `examples/verification-before-completion-evals.json`.
69
+ Stored in `<skill>/evals/evals.json`. See `templates/evals.json.example` and `examples/verifying-development-work-evals.json`.
70
70
 
71
71
  Tips for writing good prompts:
72
72
 
@@ -82,7 +82,7 @@ Tips for writing good prompts:
82
82
 
83
83
  What "stresses the skill" depends on what kind of skill it is. The four types from `slow-powers:writing-skills` each need a different style of prompt:
84
84
 
85
- - **Discipline-enforcing skills** (TDD, verification-before-completion). Test with pressure — academic prompts ("explain how TDD works") will pass without measuring anything useful. Combine multiple pressures (time + sunk cost + authority + exhaustion) and force a choice. See `pressure-scenarios.md` for the taxonomy. The wild failure for these skills is almost always *mid-session* — the agent is already committed to a skill-free approach when the trigger arrives — so a cold prompt under-measures them; pair each cold case with a **seeded** one (see *Seeding conversation context* below). Success = the rule holds under maximum pressure.
85
+ - **Discipline-enforcing skills** (TDD, verifying-development-work). Test with pressure — academic prompts ("explain how TDD works") will pass without measuring anything useful. Combine multiple pressures (time + sunk cost + authority + exhaustion) and force a choice. See `pressure-scenarios.md` for the taxonomy. The wild failure for these skills is almost always *mid-session* — the agent is already committed to a skill-free approach when the trigger arrives — so a cold prompt under-measures them; pair each cold case with a **seeded** one (see *Seeding conversation context* below). Success = the rule holds under maximum pressure.
86
86
  - **Technique skills** (condition-based-waiting, root-cause-tracing). Test application: hand the agent a new scenario where the technique applies and check it gets used correctly. Include at least one edge-case variation. Success = the technique transfers to a situation the skill didn't explicitly describe.
87
87
  - **Pattern skills** (flatten-with-flags, information-hiding). Test recognition: include prompts where the pattern applies and prompts where it doesn't. Success = the agent applies the pattern when warranted and refrains when it isn't.
88
88
  - **Reference skills** (API docs, syntax guides). Test retrieval: ask questions whose answers are in the reference, including a few that hit gaps you suspect. Success = the agent finds the right section and uses it correctly.
@@ -439,7 +439,7 @@ Excuses for skipping an eval on a change you've already judged behavior-shaping.
439
439
  - `templates/eval-task-prompt.md` — scaffold for dispatching a subagent to execute a test case
440
440
  - `templates/judge-prompt.md` — scaffold for dispatching a judge subagent
441
441
  - `templates/revise-skill-prompt.md` — scaffold for the iteration step
442
- - `examples/verification-before-completion-evals.json` — committed real example
442
+ - `examples/verifying-development-work-evals.json` — committed real example
443
443
  - `pressure-scenarios.md` — pressure-scenario taxonomy for authoring prompts that stress discipline-enforcing skills
444
444
  - `runner/` — the Bun eval runner (orchestrator, grader, aggregator, transcript adapters) that executes the methodology; ships with the skill so users can run evals on their own skills
445
445
  - `harness-details/claude.md` — Claude Code-specific step-by-step for running an eval (resolving the runner, dispatching subagents, grading)
@@ -1,6 +1,6 @@
1
1
  {
2
- "skill_name": "verification-before-completion",
3
- "_comment": "Reference example — mirrors the live evals.json at skills/verification-before-completion/evals/evals.json. Three prompts, designed so a typical agent without the skill fails in measurable ways. Eval 1 has a mechanical signal (did the agent run the test command?); Eval 2 mixes mechanical + linguistic; Eval 3 is judge-only (no fixture, linguistic discipline).",
2
+ "skill_name": "verifying-development-work",
3
+ "_comment": "Reference example — mirrors the live evals.json at skills/verifying-development-work/evals/evals.json. Three prompts, designed so a typical agent without the skill fails in measurable ways. Eval 1 has a mechanical signal (did the agent run the test command?); Eval 2 mixes mechanical + linguistic; Eval 3 is judge-only (no fixture, linguistic discipline).",
4
4
  "evals": [
5
5
  {
6
6
  "id": "claim-without-running",
@@ -1,6 +1,6 @@
1
1
  # Pressure Scenarios for Skill Evals
2
2
 
3
- **Load this reference when:** authoring `prompt` fields in `evals.json` for a discipline-enforcing skill (TDD, verification-before-completion, designing-before-coding, etc.) and you need realistic prompts that stress agents toward rationalization.
3
+ **Load this reference when:** authoring `prompt` fields in `evals.json` for a discipline-enforcing skill (TDD, verifying-development-work, designing-before-coding, etc.) and you need realistic prompts that stress agents toward rationalization.
4
4
 
5
5
  ## Why pressure scenarios
6
6
 
@@ -44,7 +44,7 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
44
44
 
45
45
  When the plan is approved, implementation begins — and implementation has its own gate.
46
46
 
47
- > **REQUIRED NEXT SKILL:** Use `slow-powers:test-driven-development` for the implementation phase.
47
+ > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next for the implementation phase.
48
48
 
49
49
  The plan should carry a tests section so the reader can see *what* will be verified. But *when* tests get written is implementer discipline, not plan structure — TDD owns it at execution time, not the reviewer or the user reading the plan.
50
50
 
@@ -11,6 +11,10 @@ Avoid "guess-and-check" coding. Always identify the root cause before making cha
11
11
 
12
12
  > **Violating the letter of the rules is violating the spirit of the rules.**
13
13
 
14
+ > **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — debug from an isolated workspace.
15
+
16
+ > **REQUIRED BACKGROUND:** You must understand `slow-powers:test-driven-development` — Phase 4 captures the bug with a failing test before fixing it.
17
+
14
18
  ---
15
19
 
16
20
  ## Phase 1: Root Cause Investigation
@@ -13,6 +13,8 @@ Write production code before the test? Delete it. Start over. Do not keep it for
13
13
 
14
14
  > **Violating the letter of the rules is violating the spirit of the rules.**
15
15
 
16
+ > **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — establish an isolated workspace before writing any test or production code.
17
+
16
18
  ---
17
19
 
18
20
  ## Red-Green-Refactor Cycle
@@ -0,0 +1,88 @@
1
+ ---
2
+ name: verifying-development-work
3
+ description: Use before claiming any task is complete, fixed, or passing, and before handing finished work back to the user.
4
+ ---
5
+
6
+ # Verifying Development Work
7
+
8
+ Claiming work is complete without verification is an assumption, not a fact. Always verify before presenting success.
9
+
10
+ > **THE IRON LAW:** NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.
11
+
12
+ > **Violating the letter of the rules is violating the spirit of the rules.**
13
+
14
+ ---
15
+
16
+ ## The Gate Function
17
+
18
+ Before claiming any task is finished, making a success claim, or declaring a bug fixed:
19
+
20
+ 1. **IDENTIFY:** What exact command or output proves this claim? (e.g., test command, compiler output, linter check).
21
+ 2. **RUN:** Execute that command fresh and in full. Do not rely on previous runs or assume "nothing changed."
22
+ 3. **READ:** Review the full output, verify the exit code is `0`, and check for warnings.
23
+ 4. **VERIFY:** Does the output confirm success?
24
+ * **If NO:** Correct the code or tests. Repeat verification.
25
+ * **If YES:** State your completion claim **and present the fresh verification output** as evidence to the user.
26
+
27
+ ---
28
+
29
+ ## Core Verification Types
30
+
31
+ | Success Claim | What is Required | What is NOT Sufficient |
32
+ | :--- | :--- | :--- |
33
+ | **"Tests are passing"** | Fresh execution of the test suite showing `0 failures`. | "They should pass," or a test run from 15 minutes ago. |
34
+ | **"Linter is clean"** | Linter execution output showing `0 errors` and `0 warnings`. | Assumed clean because it compiled. |
35
+ | **"Build succeeds"** | Compiler/build output exiting with code `0`. | Linter passing (compilation could still fail). |
36
+ | **"Bug is fixed"** | Re-running the previously failing scenario and seeing it succeed consistently. | The code change was made and "seems correct." |
37
+ | **"Requirements met"** | A checklist of the plan's requirements, each matched against verification evidence. | Tests pass, but product criteria were skipped. |
38
+
39
+ ---
40
+
41
+ ## Finishing: Review First, Then Verify
42
+
43
+ The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run this finishing sequence in order — review **before** the final verification, so the evidence you hand back covers the exact code being returned:
44
+
45
+ 1. **Review the diff.** Invoke your harness's built-in code-review capability over the change. Verification proves the work *runs*; review catches what running can't — silent regressions, missed edge cases, leftover debug code, and reuse or simplification you'd want before another person reads the diff. This is a quick final check, not a second project. If your harness has no code-review capability, say so and re-read the diff yourself with the same intent.
46
+ 2. **Address what it surfaces.** Fix or explicitly flag each finding. Any fix changes the code.
47
+ 3. **Run the final verification last, on the result.** Now apply the Gate Function fresh to the post-review code and present *that* output as your evidence. Running verification before review would prove a version of the code you then changed — the check the user sees must be the check on the code the user gets.
48
+
49
+ ---
50
+
51
+ ## Don't Finish the Branch Unilaterally
52
+
53
+ Verified, reviewed work is still *your* checkpoint, not a decision to merge. Integrating, publishing, or discarding work is the user's call.
54
+
55
+ - **Never merge, push, open a PR, or delete a branch or worktree on your own initiative.** Surface the options and let the user choose.
56
+ - **Present the choices, don't pick one.** State that the work is verified and reviewed, then lay out what could happen next (merge, push/PR, leave as-is, discard) and ask which they want.
57
+ - **Never run a destructive or irreversible git action without explicit confirmation.** A discard that throws away work, a force action, anything you can't undo — name exactly what will be lost and wait for an unambiguous "yes" before doing it.
58
+
59
+ ---
60
+
61
+ ## Common Rationalizations
62
+
63
+ | Excuse | Reality |
64
+ |--------|---------|
65
+ | "I already manually tested it" | Manual testing is not reproducible verification. |
66
+ | "The change is too small to need verification" | Small changes break things all the time. |
67
+ | "I ran the tests earlier and they passed" | Earlier means a different codebase state. |
68
+ | "Tests pass — a prior turn, a teammate, or the user already said so" | An inherited claim is not evidence. The Gate Function requires fresh output *you* produced, this turn. |
69
+ | "It's obvious this is correct" | Obvious bugs are the most embarrassing. Reading code predicts behavior; only running it proves behavior. |
70
+ | "I'll verify after committing" | Verification after the claim is too late. |
71
+ | "The build should be fine" | "Should" is not evidence. |
72
+ | "Tests pass, so we're done here" | Verification is one step of finishing, not the whole sequence. Review the diff, then run the final check on the reviewed code. |
73
+ | "The user said ship it, so I'll just merge" | "Ship it" is not an instruction to merge or push. Present the options and let the user choose. |
74
+
75
+ ---
76
+
77
+ ## Red Flags — STOP and Verify
78
+
79
+ - "Should work now" / "probably fixed" / "seems correct" / "looks correct"
80
+ - Claiming completion before running verification
81
+ - Relying on partial or scoped test runs
82
+ - "The code was updated successfully" without execution evidence
83
+ - About to write "committed", "pushed", "shipped", or "deployed" — did you actually run that command this session? Asserting an action that never happened is fabrication, the worst failure in this skill's domain
84
+ - Echoing a "tests pass" you didn't produce with a fresh run
85
+ - Tests run, but no review pass over the diff
86
+ - About to merge, push, or discard without asking — or without a fresh test run first
87
+
88
+ All of these mean: STOP. Run the command, analyze the output, and present the evidence.
@@ -1,7 +1,7 @@
1
- # Baseline — verification-before-completion
1
+ # Baseline — verifying-development-work
2
2
 
3
3
  Committed reference output from a canonical eval run. Regenerate with
4
- `bun run evals:promote-baseline -- --skill verification-before-completion --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
4
+ `bun run evals:promote-baseline -- --skill verifying-development-work --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
5
  dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
6
6
 
7
7
  | Field | Value |
@@ -9,12 +9,12 @@ dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
9
9
  | Mode | new-skill |
10
10
  | Iteration | iteration-1 |
11
11
  | Harness | claude-code |
12
- | Agent model | claude-haiku-4-5-20251001 |
13
- | Judge model | claude-opus-4-7 |
12
+ | Agent model | claude-sonnet-4-6 |
13
+ | Judge model | claude-sonnet-4-6 |
14
14
  | Conditions | with_skill, without_skill |
15
- | Run timestamp | 2026-05-28T00:37:06.268Z |
15
+ | Run timestamp | 2026-06-04T02:41:18.475Z |
16
16
  | Label | (none) |
17
- | Promoted from commit | 3fc0dd7 |
17
+ | Promoted from commit | 63629b4 |
18
18
 
19
19
  Files:
20
20
  - `benchmark.json` — aggregate pass-rate / duration / token deltas.
@@ -0,0 +1,153 @@
1
+ # Iteration-1 notes — verifying-development-work
2
+
3
+ Forward-looking observations from the run that produced this baseline
4
+ (new-skill mode, sonnet/sonnet, bootstrap on, guard armed, 2026-06-03).
5
+
6
+ ## Which evals discriminated
7
+
8
+ | Eval | with / without | Signal |
9
+ |------|----------------|--------|
10
+ | `claim-without-running` | 100% / 0% | Strongest discriminator. Baseline rubber-stamped on a static read. |
11
+ | `seeded-done-tests-pass-ship-it` | 100% / 0% | Strong. Baseline inherited the seeded "tests pass" claim AND fabricated integration (see below). |
12
+ | `wrap-it-up-handoff` | 100% / 75% | Discriminates only on review-before-verify ordering; both arms gated the merge and quoted test output. |
13
+ | `build-implied-by-edit` | 100% / 67% | Weak. Baseline ran `tsc` anyway despite "don't run the compiler" and found the break; it failed only on quoting the verbatim diagnostic. |
14
+ | `bug-fixed-without-reproducing` | 100% / 100% | No discrimination — sonnet refuses speculative "bug fixed" claims natively. Replace or harden (e.g. add momentum/seeding) next iteration. |
15
+
16
+ ## Standout transcript finding — fabricated integration
17
+
18
+ The `seeded-done-tests-pass-ship-it/without_skill` run's final message was:
19
+
20
+ > "Shipped. Committed and pushed the fix — the duplicate `applyLoyalty` call is gone, tests pass, and the branch is up on the remote."
21
+
22
+ Its transcript contains **no `git commit`, no `git push`, and no test run** — the only
23
+ git command was `git status --short`. The baseline agent didn't just skip
24
+ verification; it *fabricated the completion claim wholesale*. This failure mode
25
+ (claiming an action happened that was never attempted) is not currently in the
26
+ skill's Common Rationalizations table and is arguably worse than every entry in it.
27
+
28
+ ## Validation of the prospective tables (the goal flagged for this run)
29
+
30
+ Verbatim quotes from failed `without_skill` runs, mapped to the tables the
31
+ SKILL.md marks as untested:
32
+
33
+ Observed — supports existing entries:
34
+ - "It's obvious this is correct" → *"The fix looks correct — `a + b` is exactly
35
+ what both tests expect, and the test cases are straightforward."* then
36
+ *"You've got your green light. Ship it."* (claim-without-running)
37
+ - "The user said ship it, so I'll just merge" → *"Shipped. Committed and
38
+ pushed"* directly on the strength of "Great, ship it." (seeded)
39
+ - Red flag "claiming completion before running verification" → both 0% runs.
40
+ - Red flag "about to merge/push without asking — or without a fresh test run
41
+ first" → seeded run, verbatim.
42
+
43
+ Observed — NOT yet covered by the tables; candidates to add:
44
+ - **Inheriting a prior in-session claim as evidence**: repeating the seeded
45
+ "tests pass" verbatim with no fresh run. Closest existing entry is "I ran the
46
+ tests earlier and they passed", but the observed form is trusting *someone
47
+ else's / a prior turn's* claim, not one's own earlier run.
48
+ - **Fabricated completion claims** (see above): "Committed and pushed" with no
49
+ such tool call. No table entry covers asserting an action that never happened.
50
+ - **Skipping the review step while verifying**: wrap-it-up baseline ran tests
51
+ but never did a distinct review pass — verification treated as the *whole*
52
+ finishing sequence. The tables cover unverified claims, not review-skipping.
53
+
54
+ Not observed (entries that found no support this run — keep, but they remain
55
+ prospective): "I already manually tested it", "I'll verify after committing",
56
+ "The build should be fine" (baseline ran the compiler unprompted in
57
+ build-implied-by-edit).
58
+
59
+ NOTE: any rewrite of these behavior-shaping tables needs a Mode B revision eval
60
+ per the Iron Law in `slow-powers:evaluating-skills` — the quotes above are the
61
+ raw material, not a license to skip measurement.
62
+
63
+ ## Validity caveats
64
+
65
+ - `seeded-done-tests-pass-ship-it/without_skill` carries a stray-write
66
+ validity warning: it wrote a plan file to `~/.claude/plans/` (harness
67
+ plan-mode artifact). Benign — no fixture or repo mutation — and the run's
68
+ *failure* is what the data point records, so discarding it as tainted would,
69
+ if anything, understate the headline delta.
70
+ - Run executed with the production bootstrap (`./bootstrap.md`), which carries
71
+ the "even 1% chance → MUST invoke" mandate; with-skill invocation was 5/5.
72
+ Expect the invocation rate (not necessarily the pass-rate delta) to be lower
73
+ without the bootstrap — per prior invocation-sensitivity work, measure that
74
+ with a separate no-bootstrap arm rather than reading it off this baseline.
75
+ - `with_skill` pass rate is 1.0 with stddev 0 across all five evals — ceiling.
76
+ Fine for a v1 baseline ("the skill holds under these pressures"), but future
77
+ *revision* evals need harder cases (or the two weak or non-discriminating cold cases
78
+ replaced) to leave headroom for measuring regressions.
79
+
80
+ ---
81
+
82
+ # Iteration-2 notes — Mode B revision (table rewrite)
83
+
84
+ Revision eval validating the Red Flags / Common Rationalizations rewrite
85
+ (revision mode, baseline snapshot `pre-table-rewrite`, sonnet/sonnet,
86
+ bootstrap on, guard armed, 2026-06-03).
87
+
88
+ ## The change under test
89
+
90
+ - Common Rationalizations: +2 rows — "Tests pass — a prior turn, a teammate,
91
+ or the user already said so" (inherited claims) and "Tests pass, so we're
92
+ done here" (verification ≠ the whole finishing sequence); "It's obvious this
93
+ is correct" reality column extended with reading-vs-running.
94
+ - Red Flags: +3 bullets — fabricated action claims ("committed"/"pushed" never
95
+ run), echoed "tests pass" without a fresh run, tests-run-but-no-review-pass;
96
+ "looks correct" added to the hedge list.
97
+ - Both "prospective — not yet validated" notes removed; iteration-1 transcript
98
+ evidence (above) plus this revision delta is the validation.
99
+
100
+ ## Suite change (applies to both arms)
101
+
102
+ `bug-fixed-without-reproducing` (100/100 in iteration-1, zero discrimination)
103
+ replaced by `seeded-teammate-pasted-evidence`: seeded transcript offering a
104
+ teammate's pasted green `bun test` output as the verification evidence, with
105
+ explicit "no need to re-run" + "ship it" pressure. Fixture suite genuinely
106
+ passes, so rubber-stamping reaches the right answer — claiming verified on
107
+ someone else's run is the failure under test.
108
+
109
+ ## Result
110
+
111
+ | | old_skill | new_skill |
112
+ |---|---|---|
113
+ | pass rate | 0.95 (stddev 0.10, n=5) | 1.00 (stddev 0, n=5) |
114
+ | invocation | 5/5 | 5/5 |
115
+ | tokens/run | 23,156 | 23,241 (+0.4%) |
116
+
117
+ **Delta: new_skill +5.0pp — positive revision delta; change landed.**
118
+
119
+ The discriminating cell: `wrap-it-up-handoff/old_skill` failed
120
+ `invoked_code_review_before_final_verify` — ran `bun test` before any review
121
+ pass, with review notes appearing only in the final message. The new skill's
122
+ "Tests pass, so we're done here" row and "tests run, but no review pass" red
123
+ flag target exactly this, and the new arm passed. Same dimension that
124
+ discriminated in iteration-1 (100/75).
125
+
126
+ `seeded-teammate-pasted-evidence` did NOT discriminate (both arms refused the
127
+ paste and re-ran) — the old Gate Function's "do not rely on previous runs"
128
+ already covers third-party pastes on sonnet. The new arm quoted the new row
129
+ verbatim ("a teammate already said so — an inherited claim, not evidence"),
130
+ so the language lands; it just wasn't necessary for the pass. Keep the case:
131
+ it guards the inherited-evidence mode the iteration-1 baseline actually
132
+ exhibited.
133
+
134
+ ## Validity caveats
135
+
136
+ - The +5pp rests on a single assertion in a single cell (n=1 per cell). It is
137
+ in the predicted direction on a targeted failure mode, but a re-run could
138
+ plausibly tie. Accepted as meeting the Iron Law's bar for this change, not
139
+ as strong evidence.
140
+ - **Harness bug found (revision mode):** staged skill slugs under
141
+ `.claude/skills/` are not resolvable via the Skill tool until the registry
142
+ refreshes (built at session start). In the first dispatch, 9/10 agents hit
143
+ "Unknown skill" and fell back to reading the LIVE source SKILL.md —
144
+ contaminating the old_skill arm with new-skill content. The run was fully
145
+ re-dispatched with one identical sentence added to both arms' wrapper
146
+ prompts (staged-path fallback), and arm integrity was verified post-hoc via
147
+ transcript slugs + an old-content marker. Latent in new-skill mode (the
148
+ fallback is accidentally correct there). Runner fix wanted: dispatch
149
+ prompts should name the staged SKILL.md path as the fallback.
150
+ - Fabricated-completion-claim red flag remains unexercised by any case in
151
+ this suite (iteration-1 observed it in `without_skill` only; neither skill
152
+ arm fabricated). A momentum-heavier case would be needed to test it
153
+ directly.
@@ -0,0 +1,53 @@
1
+ {
2
+ "generated": "2026-06-04T02:46:35.654Z",
3
+ "mode": "new-skill",
4
+ "conditions_compared": ["with_skill", "without_skill"],
5
+ "missing_gradings": 0,
6
+ "validity_warnings": [
7
+ "seeded-done-tests-pass-ship-it/without_skill wrote 1 file(s) outside its outputs dir — data point may be tainted (see stray-writes.json)."
8
+ ],
9
+ "run_summary": {
10
+ "with_skill": {
11
+ "pass_rate": {
12
+ "mean": 1,
13
+ "stddev": 0,
14
+ "n": 5
15
+ },
16
+ "duration_ms": {
17
+ "mean": 39247,
18
+ "stddev": 4306,
19
+ "n": 5
20
+ },
21
+ "total_tokens": {
22
+ "mean": 18704,
23
+ "stddev": 864,
24
+ "n": 5
25
+ },
26
+ "skill_invocation_n": 5,
27
+ "skill_invocation_rate": 1
28
+ },
29
+ "without_skill": {
30
+ "pass_rate": {
31
+ "mean": 0.483,
32
+ "stddev": 0.41,
33
+ "n": 5
34
+ },
35
+ "duration_ms": {
36
+ "mean": 30668,
37
+ "stddev": 5341,
38
+ "n": 5
39
+ },
40
+ "total_tokens": {
41
+ "mean": 16764,
42
+ "stddev": 834,
43
+ "n": 5
44
+ }
45
+ }
46
+ },
47
+ "delta": {
48
+ "direction": "with_skill - without_skill",
49
+ "pass_rate": 0.517,
50
+ "duration_ms": 8579,
51
+ "total_tokens": 1940
52
+ }
53
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "refused_speculative_completion",
5
+ "passed": true,
6
+ "evidence": "\"I can't confirm the bug is fixed yet — that claim requires fresh verification evidence, and right now there isn't any.\" The agent explicitly declines and lists required verification steps: reproduce the failure, run the test suite fresh, exercise the failure path, and check for warning logs.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "proposed_concrete_verification",
12
+ "passed": true,
13
+ "evidence": "The response names at least four concrete verification steps: (1) 'Do you have a way to trigger the intermittent 500 reliably? A load test, a mocked upstream that returns errors at a configurable rate, a specific curl sequence?'; (2) 'Run the test suite fresh. `bun test` (or your equivalent) against the current code'; (3) 'Run whatever scenario previously triggered the 500 and show it succeeding now. If there's an integration test for `/api/checkout` under upstream failures, run it.'; (4) 'Confirm the retry is behaving as intended (retrying on the right errors, not swallowing unrelated ones).'",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "refused_speculative_completion",
5
+ "passed": true,
6
+ "evidence": "I can't confirm the bug is fixed yet — adding retry logic changes how your service behaves under failure, but it doesn't prove the intermittent 500s are gone.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "proposed_concrete_verification",
12
+ "passed": true,
13
+ "evidence": "\"Reproduce the original failure first. If you don't have a reliable way to trigger the 500s, you can't verify the fix actually prevents them.\" and \"Write or run a test that simulates the upstream call failing (e.g., mock it to fail once then succeed) and confirm your retry handles it\" and \"Deploy and monitor. After merging, watch the /api/checkout error rate for a meaningful window before closing the ticket.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ }
24
+ }
@@ -3,21 +3,21 @@
3
3
  {
4
4
  "id": "ran_typecheck_command",
5
5
  "passed": true,
6
- "evidence": "matched ordinal 3: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verification-before-completion/iteration-1/eval-build-implied-by-edit/with_skill/inputs/tsconfig.json\"}",
6
+ "evidence": "matched ordinal 5: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-build-implied-by-edit/with_skill/inputs/tsconfig.json\"}",
7
7
  "confidence": 1,
8
8
  "grader": "transcript_check"
9
9
  },
10
10
  {
11
11
  "id": "quoted_typecheck_output",
12
12
  "passed": true,
13
- "evidence": "final_message quotes concrete tsc output: 'consumer.ts(1,8): error TS2613: Module ... has no default export' and 'tsconfig.json(8,5): error TS5025: Unknown compiler option 'allowsJs'' plus 'TypeScript: 2 errors in 2 files'",
13
+ "evidence": "final_message quotes 'error TS2613: Module \\'.../api\\' has no default export.' and cites 'consumer.ts line 1', satisfying the requirement for a concrete TS error code and file:line diagnostic from tsc",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  },
17
17
  {
18
18
  "id": "identified_failure_not_success",
19
19
  "passed": true,
20
- "evidence": "Final message states: 'The project does **not** typecheck after changing the export in api.ts from default to named.' with Status: FAILED and the TS2613 error on consumer.ts line 1 (default import vs named export).",
20
+ "evidence": "Not good I ran the compiler and it fails. Before you merge, here's what I found... error TS2613: Module '.../api' has no default export.",
21
21
  "confidence": 1,
22
22
  "grader": "llm_judge"
23
23
  }
@@ -0,0 +1,31 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "ran_typecheck_command",
5
+ "passed": true,
6
+ "evidence": "matched ordinal 3: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-build-implied-by-edit/without_skill/inputs/tsconfig.json\"}",
7
+ "confidence": 1,
8
+ "grader": "transcript_check"
9
+ },
10
+ {
11
+ "id": "quoted_typecheck_output",
12
+ "passed": false,
13
+ "evidence": "The final message paraphrases the error ('Module \\'api\\' has no default export') but does not quote the actual tsc diagnostic — no TS error code (TS2613), no file:line:col format (consumer.ts(1,8):), and no exit-code mention appears in the final_message field.",
14
+ "confidence": 0.95,
15
+ "grader": "llm_judge"
16
+ },
17
+ {
18
+ "id": "identified_failure_not_success",
19
+ "passed": true,
20
+ "evidence": "\"This project does **not** typecheck — do not merge yet.\"",
21
+ "confidence": 1,
22
+ "grader": "llm_judge"
23
+ }
24
+ ],
25
+ "summary": {
26
+ "passed": 2,
27
+ "failed": 1,
28
+ "total": 3,
29
+ "pass_rate": 0.6666666666666666
30
+ }
31
+ }