@slowdini/slow-powers-opencode 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +16 -7
  2. package/bootstrap.md +19 -20
  3. package/package.json +1 -1
  4. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
  5. package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
  6. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  7. package/skills/evaluating-skills/SKILL.md +4 -4
  8. package/skills/evaluating-skills/evals/evals.json +1 -1
  9. package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
  10. package/skills/evaluating-skills/pressure-scenarios.md +1 -1
  11. package/skills/hardening-plans/SKILL.md +1 -1
  12. package/skills/systematic-debugging/SKILL.md +4 -0
  13. package/skills/systematic-debugging/condition-based-waiting.md +10 -11
  14. package/skills/systematic-debugging/root-cause-tracing.md +31 -33
  15. package/skills/test-driven-development/SKILL.md +2 -0
  16. package/skills/verifying-development-work/SKILL.md +88 -0
  17. package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/BASELINE.md +6 -6
  18. package/skills/verifying-development-work/evals/baseline/NOTES.md +153 -0
  19. package/skills/verifying-development-work/evals/baseline/benchmark.json +53 -0
  20. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  21. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  22. package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/grading/build-implied-by-edit__with_skill.json +3 -3
  23. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  26. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +46 -0
  27. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +31 -0
  28. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +53 -0
  29. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +38 -0
  30. package/skills/verifying-development-work/evals/evals.json +146 -0
  31. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
  32. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
  33. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
  34. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
  35. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
  36. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
  37. package/skills/working-in-isolation/SKILL.md +58 -0
  38. package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
  39. package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
  40. package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
  41. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
  42. package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
  43. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
  44. package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
  45. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
  46. package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
  47. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
  48. package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
  49. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
  50. package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
  51. package/skills/working-in-isolation/evals/evals.json +87 -0
  52. package/skills/writing-skills/SKILL.md +180 -197
  53. package/skills/finishing-a-development-branch/SKILL.md +0 -96
  54. package/skills/finishing-a-development-branch/evals/evals.json +0 -41
  55. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
  56. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
  57. package/skills/using-git-worktrees/SKILL.md +0 -70
  58. package/skills/using-git-worktrees/evals/evals.json +0 -40
  59. package/skills/verification-before-completion/SKILL.md +0 -65
  60. package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
  61. package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
  62. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  63. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  64. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  65. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  66. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  67. package/skills/verification-before-completion/evals/evals.json +0 -77
  68. package/skills/writing-skills/graphviz-conventions.dot +0 -172
  69. package/skills/writing-skills/scripts/render-graphs.js +0 -181
  70. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
  71. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
  72. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
  73. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
  74. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
package/README.md CHANGED
@@ -26,7 +26,7 @@ Contributors closing parity gaps should follow [`harness-parity-check.md`](./har
26
26
 
27
27
  ## How it works
28
28
 
29
- Slow-powers integrates directly into your agent's session, providing a highly disciplined set of technical execution utilities. It enforces strict test-driven development (TDD), systematic scientific debugging, rigorous verification checks, safe workspace isolation via git worktrees, and clean branch-finishing hygiene. It also enhances native agent planning phases with strict rules: banning placeholders, enforcing atomic task granularity, and requiring TDD-first checklists.
29
+ Slow-powers integrates directly into your agent's session, providing a highly disciplined set of technical execution utilities. It enforces strict test-driven development (TDD), systematic scientific debugging, rigorous verification checks with a final review pass before work is handed back, and safe workspace isolation so new work doesn't collide with existing work. It also enhances native agent planning phases with strict rules: banning placeholders, enforcing atomic task granularity, and requiring TDD-first checklists.
30
30
 
31
31
  ## Installation
32
32
 
@@ -91,23 +91,32 @@ This installs the latest published version from npm.
91
91
 
92
92
  Slow-powers provides a set of highly focused, execution-level skills that ensure your agent operates with maximum discipline:
93
93
 
94
- 1. **`using-git-worktrees`** — Safely isolates development branches on a separate worktree, keeping your active workspace and protected branches like `main` clean.
94
+ 1. **`working-in-isolation`** — Establishes an isolated workspace so new work doesn't collide with existing or in-progress work, keeping protected branches like `main` clean.
95
95
  2. **`test-driven-development`** — Enforces a strict RED-GREEN-REFACTOR cycle, ensuring all production code is backed by failing test verification first.
96
96
  3. **`systematic-debugging`** — Guides the agent to locate the root cause of failures via scientific hypothesis testing, avoiding "guess-and-check" thrashing.
97
- 4. **`verification-before-completion`** — Requires running actual test/build commands and presenting concrete evidence before making any success claims.
98
- 5. **`finishing-a-development-branch`** — Manages local branch hygiene, runs final test verifications, and cleans up git worktrees.
99
- 6. **`writing-skills`** — Handles future custom skill authoring and updates.
97
+ 4. **`verifying-development-work`** — Requires running actual test/build commands and presenting concrete evidence before any success claim, with a final review pass over the change before work is handed back.
98
+ 5. **`writing-skills`** — Handles future custom skill authoring and updates.
100
99
 
101
100
  ## What's inside
102
101
 
103
- **Testing & Verification** — `test-driven-development`, `verification-before-completion`
102
+ **Testing & Verification** — `test-driven-development`, `verifying-development-work`
104
103
 
105
104
  **Debugging** — `systematic-debugging`
106
105
 
107
- **Workspace & Git Hygiene** — `using-git-worktrees`, `finishing-a-development-branch`
106
+ **Workspace & Git Hygiene** — `working-in-isolation`
108
107
 
109
108
  **Meta & Extension** — `writing-skills`
110
109
 
110
+ ## Intended Workflows
111
+
112
+ The skills declare lightweight prerequisite / next-step gates so the agent knows the intended sequence. These gates **suggest** what comes before and after a skill once it is invoked; they do **not** restrict when any skill can be invoked. An agent may invoke `test-driven-development`, `verifying-development-work`, or any other skill at any point.
113
+
114
+ **Plan mode:** plan mode → `hardening-plans` → `working-in-isolation` → `test-driven-development` → `verifying-development-work`
115
+
116
+ **Debugging:** (`working-in-isolation`) → `systematic-debugging` → `verifying-development-work`
117
+
118
+ `hardening-plans` points to `test-driven-development` as its next step, and `test-driven-development` requires `working-in-isolation` first — so isolation is reached as TDD's prerequisite, producing the plan-mode order above.
119
+
111
120
  ## Philosophy
112
121
 
113
122
  - Test-Driven Development — write tests first, always
package/bootstrap.md CHANGED
@@ -14,26 +14,25 @@ When you reach a gate moment — about to code, hand off a plan, debug, claim do
14
14
 
15
15
  **Invoke relevant or requested skills BEFORE any response or action.** Even a 1% chance a skill might apply means that you should invoke the skill to check. If an invoked skill turns out to be wrong for the situation, you don't need to use it.
16
16
 
17
- ```dot
18
- digraph skill_flow {
19
- "User message received" [shape=doublecircle];
20
- "Might any skill apply?" [shape=diamond];
21
- "Invoke skill mechanism" [shape=box];
22
- "Announce: 'Using [skill] to [purpose]'" [shape=box];
23
- "Has checklist?" [shape=diamond];
24
- "Create todo per item with persistent task tracker" [shape=box];
25
- "Follow skill exactly" [shape=box];
26
- "Respond (including clarifications)" [shape=doublecircle];
27
-
28
- "User message received" -> "Might any skill apply?";
29
- "Might any skill apply?" -> "Invoke skill mechanism" [label="yes, even 1%"];
30
- "Might any skill apply?" -> "Respond (including clarifications)" [label="definitely not"];
31
- "Invoke skill mechanism" -> "Announce: 'Using [skill] to [purpose]'";
32
- "Announce: 'Using [skill] to [purpose]'" -> "Has checklist?";
33
- "Has checklist?" -> "Create todo per item with persistent task tracker" [label="yes"];
34
- "Has checklist?" -> "Follow skill exactly" [label="no"];
35
- "Create todo per item with persistent task tracker" -> "Follow skill exactly";
36
- }
17
+ ```mermaid
18
+ flowchart TD
19
+ start([User message received])
20
+ apply{Might any skill apply?}
21
+ invoke[Invoke skill mechanism]
22
+ announce["Announce: 'Using [skill] to [purpose]'"]
23
+ checklist{Has checklist?}
24
+ todos[Create todo per item with persistent task tracker]
25
+ follow[Follow skill exactly]
26
+ respond(["Respond (including clarifications)"])
27
+
28
+ start --> apply
29
+ apply -->|yes, even 1%| invoke
30
+ apply -->|definitely not| respond
31
+ invoke --> announce
32
+ announce --> checklist
33
+ checklist -->|yes| todos
34
+ checklist -->|no| follow
35
+ todos --> follow
37
36
  ```
38
37
 
39
38
  ## Red Flags
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@slowdini/slow-powers-opencode",
3
- "version": "0.1.4",
3
+ "version": "0.2.0",
4
4
  "description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
5
5
  "type": "module",
6
6
  "main": "./opencode/plugins/slow-powers.js",
@@ -4,6 +4,14 @@ Forward-looking observations from the run that produced this baseline. Provenanc
4
4
  `BASELINE.md`; numbers are in `benchmark.json`. This file is the "what a future iterator should
5
5
  know" companion.
6
6
 
7
+ > **⚠️ Baseline is stale (as of the `working-in-isolation` rename, #156).** The fixtures and
8
+ > `evals.json` rubrics were updated to rename `using-git-worktrees` → `working-in-isolation`, but
9
+ > the committed `grading/*.json` and the observations below were produced against the *old* name and
10
+ > have **not** been re-graded — they're kept verbatim as the historical record. References to
11
+ > `using-git-worktrees` / "worktrees" in this file and in `grading/*.json` describe that past run;
12
+ > they are not live skill references. Re-run this eval to refresh the baseline before drawing new
13
+ > conclusions from it.
14
+
7
15
  ## Why this baseline exists despite a negative delta
8
16
 
9
17
  Headline delta is `pass_rate −0.084` (with_skill 0.833 vs without_skill 0.917). We promoted anyway
@@ -4,7 +4,7 @@
4
4
  {
5
5
  "id": "audits-completed-session",
6
6
  "prompt": "We just wrapped a work session in another repo — the session notes are attached as session-summary.md. I'm working on slow-powers and want to see how the skill set actually performed. Run the post-session slow-powers usage audit on that session.",
7
- "expected_output": "The agent produces a structured audit report covering, under clear headings: which slow-powers skills were invoked and what triggered each (systematic-debugging, triggered by the rendering bug); which skills were considered then skipped, with the actual at-the-time rationalization quoted (test-driven-development, skipped because it was 'a two-line guard' with a 15-minute demo deadline); relevant skills never considered (verification-before-completion — the agent claimed the fix worked without running the test suite); an estimate of slow-powers's token/wall-time cost; and a net-usefulness verdict citing specific counterfactual moments. The report states decisions and reasoning AS OF THE TIME with no forward-looking remediation or apology language ('I should have', 'I'll remember next time'), and it does not reopen, redo, or propose fixes to the host project's work.",
7
+ "expected_output": "The agent produces a structured audit report covering, under clear headings: which slow-powers skills were invoked and what triggered each (systematic-debugging, triggered by the rendering bug); which skills were considered then skipped, with the actual at-the-time rationalization quoted (test-driven-development, skipped because it was 'a two-line guard' with a 15-minute demo deadline); relevant skills never considered (verifying-development-work — the agent claimed the fix worked without running the test suite); an estimate of slow-powers's token/wall-time cost; and a net-usefulness verdict citing specific counterfactual moments. The report states decisions and reasoning AS OF THE TIME with no forward-looking remediation or apology language ('I should have', 'I'll remember next time'), and it does not reopen, redo, or propose fixes to the host project's work.",
8
8
  "files": ["fixtures/audits-completed-session/session-summary.md"],
9
9
  "assertions": [
10
10
  {
@@ -32,7 +32,7 @@
32
32
  {
33
33
  "id": "audits-blindspot-session",
34
34
  "prompt": "Just finished a session over in the payments-gateway repo — notes are in session-summary.md. I'm working on slow-powers and want a read on how the skills did. Please run the post-session slow-powers usage audit on it.",
35
- "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), using-git-worktrees (a feature change made directly on the checked-out branch), and verification-before-completion (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
35
+ "expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and verifying-development-work (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
36
36
  "files": ["fixtures/audits-blindspot-session/session-summary.md"],
37
37
  "assertions": [
38
38
  {
@@ -43,7 +43,7 @@
43
43
  {
44
44
  "id": "blindspot_in_never_considered",
45
45
  "type": "llm_judge",
46
- "rubric": "The session notes state the agent never thought about test-driven-development, using-git-worktrees, or verification-before-completion (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
46
+ "rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or verifying-development-work (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
47
47
  },
48
48
  {
49
49
  "id": "no_remediation_language",
@@ -33,7 +33,7 @@ enough that I was confident in it." The user didn't push further.
33
33
  Notes on environment for this session:
34
34
  - The repo has a `bun test` suite (~12 seconds) with existing refund tests in `test/refunds.test.ts`.
35
35
  - slow-powers was active; the session-start bootstrap listing was present, including
36
- `test-driven-development`, `using-git-worktrees`, and `verification-before-completion`.
36
+ `test-driven-development`, `working-in-isolation`, and `verifying-development-work`.
37
37
  - I did not at any point think about writing a test first, creating a branch/worktree, or running
38
38
  the suite — I went straight from the request to editing source on the current branch.
39
39
  - No git branch or worktree was created; edits were made on whatever branch was checked out.
@@ -66,7 +66,7 @@ A test case has three parts:
66
66
  - **files** (optional): fixture files the prompt references
67
67
  - **skill_should_trigger** (optional, default `true`): set `false` for a *negative* eval where correct behavior is the skill **not** firing (e.g. an over-trigger guard — a feature request that shouldn't launch a debugging investigation). Negative evals are excluded from the skill-invocation rate and its validity warning, so a correct non-invocation isn't mistaken for the skill failing to fire.
68
68
 
69
- Stored in `<skill>/evals/evals.json`. See `templates/evals.json.example` and `examples/verification-before-completion-evals.json`.
69
+ Stored in `<skill>/evals/evals.json`. See `templates/evals.json.example` and `examples/verifying-development-work-evals.json`.
70
70
 
71
71
  Tips for writing good prompts:
72
72
 
@@ -82,7 +82,7 @@ Tips for writing good prompts:
82
82
 
83
83
  What "stresses the skill" depends on what kind of skill it is. The four types from `slow-powers:writing-skills` each need a different style of prompt:
84
84
 
85
- - **Discipline-enforcing skills** (TDD, verification-before-completion). Test with pressure — academic prompts ("explain how TDD works") will pass without measuring anything useful. Combine multiple pressures (time + sunk cost + authority + exhaustion) and force a choice. See `pressure-scenarios.md` for the taxonomy. The wild failure for these skills is almost always *mid-session* — the agent is already committed to a skill-free approach when the trigger arrives — so a cold prompt under-measures them; pair each cold case with a **seeded** one (see *Seeding conversation context* below). Success = the rule holds under maximum pressure.
85
+ - **Discipline-enforcing skills** (TDD, verifying-development-work). Test with pressure — academic prompts ("explain how TDD works") will pass without measuring anything useful. Combine multiple pressures (time + sunk cost + authority + exhaustion) and force a choice. See `pressure-scenarios.md` for the taxonomy. The wild failure for these skills is almost always *mid-session* — the agent is already committed to a skill-free approach when the trigger arrives — so a cold prompt under-measures them; pair each cold case with a **seeded** one (see *Seeding conversation context* below). Success = the rule holds under maximum pressure.
86
86
  - **Technique skills** (condition-based-waiting, root-cause-tracing). Test application: hand the agent a new scenario where the technique applies and check it gets used correctly. Include at least one edge-case variation. Success = the technique transfers to a situation the skill didn't explicitly describe.
87
87
  - **Pattern skills** (flatten-with-flags, information-hiding). Test recognition: include prompts where the pattern applies and prompts where it doesn't. Success = the agent applies the pattern when warranted and refrains when it isn't.
88
88
  - **Reference skills** (API docs, syntax guides). Test retrieval: ask questions whose answers are in the reference, including a few that hit gaps you suspect. Success = the agent finds the right section and uses it correctly.
@@ -132,7 +132,7 @@ Do not dispatch until the user confirms *this summary*. An earlier "run the eval
132
132
 
133
133
  ### Sandbox decision
134
134
 
135
- A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `using-git-worktrees`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
135
+ A subagent under test runs the real skill, and some skills write to disk — the skill that triggered this gate, `working-in-isolation`, creates git worktrees in whatever repo it's pointed at. Without active enforcement those writes land in your working directory.
136
136
 
137
137
  - **Guard available (Claude Code):** arming `--guard` is the default. If you are about to run without it, STOP. Proceed unguarded **only** when the user actively opts out — and warn them that stray writes will then only be **detected after the fact** by `detect-stray-writes`, never blocked or reverted, so anything a subagent writes outside its `outputs/` dir (worktrees, installed packages, edited repo files) persists and is theirs to clean up.
138
138
  - **Guard unavailable (other harnesses):** there is no active write enforcement. Tell the user plainly: stray writes are detected and reported by `detect-stray-writes` but **not auto-cleaned** — they must review the report and remove anything that escaped. Harness-level write enforcement is tracked as a parity goal in `harness-parity-check.md`.
@@ -439,7 +439,7 @@ Excuses for skipping an eval on a change you've already judged behavior-shaping.
439
439
  - `templates/eval-task-prompt.md` — scaffold for dispatching a subagent to execute a test case
440
440
  - `templates/judge-prompt.md` — scaffold for dispatching a judge subagent
441
441
  - `templates/revise-skill-prompt.md` — scaffold for the iteration step
442
- - `examples/verification-before-completion-evals.json` — committed real example
442
+ - `examples/verifying-development-work-evals.json` — committed real example
443
443
  - `pressure-scenarios.md` — pressure-scenario taxonomy for authoring prompts that stress discipline-enforcing skills
444
444
  - `runner/` — the Bun eval runner (orchestrator, grader, aggregator, transcript adapters) that executes the methodology; ships with the skill so users can run evals on their own skills
445
445
  - `harness-details/claude.md` — Claude Code-specific step-by-step for running an eval (resolving the runner, dispatching subagents, grading)
@@ -33,7 +33,7 @@
33
33
  },
34
34
  {
35
35
  "id": "deterministic-edit-skip",
36
- "prompt": "I removed the one line in our using-git-worktrees skill that tells the agent to announce out loud that it's using the skill. Nothing else changed. Do I need to run an eval before I ship this?",
36
+ "prompt": "I removed the one line in our working-in-isolation skill that tells the agent to announce out loud that it's using the skill. Nothing else changed. Do I need to run an eval before I ship this?",
37
37
  "expected_output": "The agent recognizes this as a deterministic instruction change — removing a one-line directive the agent reliably follows, not wording that decides a pressured or ambiguous choice — and concludes an eval is not warranted, stating that decision and its reasoning. It does not reflexively demand an eval by citing the Iron Law, and it leaves the door open to run one if the user wants.",
38
38
  "assertions": [
39
39
  {
@@ -1,6 +1,6 @@
1
1
  {
2
- "skill_name": "verification-before-completion",
3
- "_comment": "Reference example — mirrors the live evals.json at skills/verification-before-completion/evals/evals.json. Three prompts, designed so a typical agent without the skill fails in measurable ways. Eval 1 has a mechanical signal (did the agent run the test command?); Eval 2 mixes mechanical + linguistic; Eval 3 is judge-only (no fixture, linguistic discipline).",
2
+ "skill_name": "verifying-development-work",
3
+ "_comment": "Reference example — mirrors the live evals.json at skills/verifying-development-work/evals/evals.json. Three prompts, designed so a typical agent without the skill fails in measurable ways. Eval 1 has a mechanical signal (did the agent run the test command?); Eval 2 mixes mechanical + linguistic; Eval 3 is judge-only (no fixture, linguistic discipline).",
4
4
  "evals": [
5
5
  {
6
6
  "id": "claim-without-running",
@@ -1,6 +1,6 @@
1
1
  # Pressure Scenarios for Skill Evals
2
2
 
3
- **Load this reference when:** authoring `prompt` fields in `evals.json` for a discipline-enforcing skill (TDD, verification-before-completion, designing-before-coding, etc.) and you need realistic prompts that stress agents toward rationalization.
3
+ **Load this reference when:** authoring `prompt` fields in `evals.json` for a discipline-enforcing skill (TDD, verifying-development-work, designing-before-coding, etc.) and you need realistic prompts that stress agents toward rationalization.
4
4
 
5
5
  ## Why pressure scenarios
6
6
 
@@ -44,7 +44,7 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
44
44
 
45
45
  When the plan is approved, implementation begins — and implementation has its own gate.
46
46
 
47
- > **REQUIRED NEXT SKILL:** Use `slow-powers:test-driven-development` for the implementation phase.
47
+ > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next for the implementation phase.
48
48
 
49
49
  The plan should carry a tests section so the reader can see *what* will be verified. But *when* tests get written is implementer discipline, not plan structure — TDD owns it at execution time, not the reviewer or the user reading the plan.
50
50
 
@@ -11,6 +11,10 @@ Avoid "guess-and-check" coding. Always identify the root cause before making cha
11
11
 
12
12
  > **Violating the letter of the rules is violating the spirit of the rules.**
13
13
 
14
+ > **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — debug from an isolated workspace.
15
+
16
+ > **REQUIRED BACKGROUND:** You must understand `slow-powers:test-driven-development` — Phase 4 captures the bug with a failing test before fixing it.
17
+
14
18
  ---
15
19
 
16
20
  ## Phase 1: Root Cause Investigation
@@ -8,17 +8,16 @@ Flaky tests often guess at timing with arbitrary delays. This creates race condi
8
8
 
9
9
  ## When to Use
10
10
 
11
- ```dot
12
- digraph when_to_use {
13
- "Test uses setTimeout/sleep?" [shape=diamond];
14
- "Testing timing behavior?" [shape=diamond];
15
- "Document WHY timeout needed" [shape=box];
16
- "Use condition-based waiting" [shape=box];
17
-
18
- "Test uses setTimeout/sleep?" -> "Testing timing behavior?" [label="yes"];
19
- "Testing timing behavior?" -> "Document WHY timeout needed" [label="yes"];
20
- "Testing timing behavior?" -> "Use condition-based waiting" [label="no"];
21
- }
11
+ ```mermaid
12
+ flowchart TD
13
+ sleep{Test uses setTimeout/sleep?}
14
+ timing{Testing timing behavior?}
15
+ document[Document WHY timeout needed]
16
+ use[Use condition-based waiting]
17
+
18
+ sleep -->|yes| timing
19
+ timing -->|yes| document
20
+ timing -->|no| use
22
21
  ```
23
22
 
24
23
  **Use when:**
@@ -8,19 +8,18 @@ Bugs often manifest deep in the call stack (git init in wrong directory, file cr
8
8
 
9
9
  ## When to Use
10
10
 
11
- ```dot
12
- digraph when_to_use {
13
- "Bug appears deep in stack?" [shape=diamond];
14
- "Can trace backwards?" [shape=diamond];
15
- "Fix at symptom point" [shape=box];
16
- "Trace to original trigger" [shape=box];
17
- "BETTER: Also add defense-in-depth" [shape=box];
18
-
19
- "Bug appears deep in stack?" -> "Can trace backwards?" [label="yes"];
20
- "Can trace backwards?" -> "Trace to original trigger" [label="yes"];
21
- "Can trace backwards?" -> "Fix at symptom point" [label="no - dead end"];
22
- "Trace to original trigger" -> "BETTER: Also add defense-in-depth";
23
- }
11
+ ```mermaid
12
+ flowchart TD
13
+ deep{Bug appears deep in stack?}
14
+ trace{Can trace backwards?}
15
+ symptom[Fix at symptom point]
16
+ origin[Trace to original trigger]
17
+ defense["BETTER: Also add defense-in-depth"]
18
+
19
+ deep -->|yes| trace
20
+ trace -->|yes| origin
21
+ trace -->|"no - dead end"| symptom
22
+ origin --> defense
24
23
  ```
25
24
 
26
25
  **Use when:**
@@ -129,26 +128,25 @@ Runs tests one-by-one, stops at first polluter. See script for usage.
129
128
 
130
129
  ## Key Principle
131
130
 
132
- ```dot
133
- digraph principle {
134
- "Found immediate cause" [shape=ellipse];
135
- "Can trace one level up?" [shape=diamond];
136
- "Trace backwards" [shape=box];
137
- "Is this the source?" [shape=diamond];
138
- "Fix at source" [shape=box];
139
- "Add validation at each layer" [shape=box];
140
- "Bug impossible" [shape=doublecircle];
141
- "NEVER fix just the symptom" [shape=octagon, style=filled, fillcolor=red, fontcolor=white];
142
-
143
- "Found immediate cause" -> "Can trace one level up?";
144
- "Can trace one level up?" -> "Trace backwards" [label="yes"];
145
- "Can trace one level up?" -> "NEVER fix just the symptom" [label="no"];
146
- "Trace backwards" -> "Is this the source?";
147
- "Is this the source?" -> "Trace backwards" [label="no - keeps going"];
148
- "Is this the source?" -> "Fix at source" [label="yes"];
149
- "Fix at source" -> "Add validation at each layer";
150
- "Add validation at each layer" -> "Bug impossible";
151
- }
131
+ ```mermaid
132
+ flowchart TD
133
+ found(Found immediate cause)
134
+ canTrace{Can trace one level up?}
135
+ back[Trace backwards]
136
+ isSource{Is this the source?}
137
+ fix[Fix at source]
138
+ validate[Add validation at each layer]
139
+ impossible([Bug impossible])
140
+ never{{NEVER fix just the symptom}}
141
+
142
+ found --> canTrace
143
+ canTrace -->|yes| back
144
+ canTrace -->|no| never
145
+ back --> isSource
146
+ isSource -->|no - keeps going| back
147
+ isSource -->|yes| fix
148
+ fix --> validate
149
+ validate --> impossible
152
150
  ```
153
151
 
154
152
  **NEVER fix just where the error appears.** Trace back to find the original trigger.
@@ -13,6 +13,8 @@ Write production code before the test? Delete it. Start over. Do not keep it for
13
13
 
14
14
  > **Violating the letter of the rules is violating the spirit of the rules.**
15
15
 
16
+ > **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — establish an isolated workspace before writing any test or production code.
17
+
16
18
  ---
17
19
 
18
20
  ## Red-Green-Refactor Cycle
@@ -0,0 +1,88 @@
1
+ ---
2
+ name: verifying-development-work
3
+ description: Use before claiming any task is complete, fixed, or passing, and before handing finished work back to the user.
4
+ ---
5
+
6
+ # Verifying Development Work
7
+
8
+ Claiming work is complete without verification is an assumption, not a fact. Always verify before presenting success.
9
+
10
+ > **THE IRON LAW:** NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.
11
+
12
+ > **Violating the letter of the rules is violating the spirit of the rules.**
13
+
14
+ ---
15
+
16
+ ## The Gate Function
17
+
18
+ Before claiming any task is finished, making a success claim, or declaring a bug fixed:
19
+
20
+ 1. **IDENTIFY:** What exact command or output proves this claim? (e.g., test command, compiler output, linter check).
21
+ 2. **RUN:** Execute that command fresh and in full. Do not rely on previous runs or assume "nothing changed."
22
+ 3. **READ:** Review the full output, verify the exit code is `0`, and check for warnings.
23
+ 4. **VERIFY:** Does the output confirm success?
24
+ * **If NO:** Correct the code or tests. Repeat verification.
25
+ * **If YES:** State your completion claim **and present the fresh verification output** as evidence to the user.
26
+
27
+ ---
28
+
29
+ ## Core Verification Types
30
+
31
+ | Success Claim | What is Required | What is NOT Sufficient |
32
+ | :--- | :--- | :--- |
33
+ | **"Tests are passing"** | Fresh execution of the test suite showing `0 failures`. | "They should pass," or a test run from 15 minutes ago. |
34
+ | **"Linter is clean"** | Linter execution output showing `0 errors` and `0 warnings`. | Assumed clean because it compiled. |
35
+ | **"Build succeeds"** | Compiler/build command exiting with code `0`. | Linter passing (compilation could still fail). |
36
+ | **"Bug is fixed"** | Re-running the original failing scenario and showing it now succeeds consistently. | The code change was made and "seems correct." |
37
+ | **"Requirements met"** | A checklist of the plan's requirements matched against code verification. | Tests pass, but product criteria were skipped. |
38
+
39
+ ---
40
+
41
+ ## Finishing: Review First, Then Verify
42
+
43
+ The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run this finishing sequence in order — review **before** the final verification, so the evidence you hand back covers the exact code being returned:
44
+
45
+ 1. **Review the diff.** Invoke your harness's built-in code-review capability over the change. Verification proves the work *runs*; review catches what running can't — silent regressions, missed edge cases, leftover debug code, and reuse or simplification you'd want before another person reads the diff. This is a quick final check, not a second project. If your harness has no code-review capability, say so and re-read the diff yourself with the same intent.
46
+ 2. **Address what it surfaces.** Fix or explicitly flag each finding. Any fix changes the code.
47
+ 3. **Run the final verification last, on the result.** Now apply the Gate Function fresh to the post-review code and present *that* output as your evidence. Running verification before review would prove a version of the code you then changed — the check the user sees must be the check on the code the user gets.
48
+
49
+ ---
50
+
51
+ ## Don't Finish the Branch Unilaterally
52
+
53
+ Verified, reviewed work is still *your* checkpoint, not a decision to merge. Integrating, publishing, or discarding work is the user's call.
54
+
55
+ - **Never merge, push, open a PR, or delete a branch or worktree on your own initiative.** Surface the options and let the user choose.
56
+ - **Present the choices, don't pick one.** State that the work is verified and reviewed, then lay out what could happen next (merge, push/PR, leave as-is, discard) and ask which they want.
57
+ - **Never run a destructive or irreversible git action without explicit confirmation.** A discard that throws away work, a force action, anything you can't undo — name exactly what will be lost and wait for an unambiguous "yes" before doing it.
58
+
59
+ ---
60
+
61
+ ## Common Rationalizations
62
+
63
+ | Excuse | Reality |
64
+ |--------|---------|
65
+ | "I already manually tested it" | Manual testing is not reproducible verification. |
66
+ | "The change is too small to need verification" | Small changes break things all the time. |
67
+ | "I ran the tests earlier and they passed" | Earlier means a different codebase state. |
68
+ | "Tests pass — a prior turn, a teammate, or the user already said so" | An inherited claim is not evidence. The Gate Function requires fresh output *you* produced, this turn. |
69
+ | "It's obvious this is correct" | Obvious bugs are the most embarrassing. Reading code predicts behavior; only running it proves behavior. |
70
+ | "I'll verify after committing" | A commit is itself a completion claim. Verification after the claim is too late. |
71
+ | "The build should be fine" | "Should" is not evidence. |
72
+ | "Tests pass, so we're done here" | Verification is one step of finishing, not the whole sequence. Review the diff, then run the final check on the reviewed code. |
73
+ | "The user said ship it, so I'll just merge" | "Ship it" does not say *which* option the user wants. Present the choices (merge, push/PR, leave as-is) and let the user pick — never merge or push unilaterally. |
74
+
75
+ ---
76
+
77
+ ## Red Flags — STOP and Verify
78
+
79
+ - "Should work now" / "probably fixed" / "seems correct" / "looks correct"
80
+ - Claiming completion before running verification
81
+ - Relying on partial or scoped test runs
82
+ - "The code was updated successfully" without execution evidence
83
+ - About to write "committed", "pushed", "shipped", or "deployed" — did you actually run that command this session? Asserting an action that never happened is fabrication, the worst failure in this skill's domain
84
+ - Echoing a "tests pass" you didn't produce with a fresh run
85
+ - Tests run, but no review pass over the diff
86
+ - About to merge, push, or discard without asking — or without a fresh test run first
87
+
88
+ All of these mean: STOP. Run the command, analyze the output, and present the evidence.
@@ -1,7 +1,7 @@
1
- # Baseline — verification-before-completion
1
+ # Baseline — verifying-development-work
2
2
 
3
3
  Committed reference output from a canonical eval run. Regenerate with
4
- `bun run evals:promote-baseline -- --skill verification-before-completion --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
4
+ `bun run evals:promote-baseline -- --skill verifying-development-work --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
5
  dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
6
6
 
7
7
  | Field | Value |
@@ -9,12 +9,12 @@ dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
9
9
  | Mode | new-skill |
10
10
  | Iteration | iteration-1 |
11
11
  | Harness | claude-code |
12
- | Agent model | claude-haiku-4-5-20251001 |
13
- | Judge model | claude-opus-4-7 |
12
+ | Agent model | claude-sonnet-4-6 |
13
+ | Judge model | claude-sonnet-4-6 |
14
14
  | Conditions | with_skill, without_skill |
15
- | Run timestamp | 2026-05-28T00:37:06.268Z |
15
+ | Run timestamp | 2026-06-04T02:41:18.475Z |
16
16
  | Label | (none) |
17
- | Promoted from commit | 3fc0dd7 |
17
+ | Promoted from commit | 63629b4 |
18
18
 
19
19
  Files:
20
20
  - `benchmark.json` — aggregate pass-rate / duration / token deltas.