@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -0,0 +1,54 @@
1
+ {
2
+ "generated": "2026-06-06T05:28:23.426Z",
3
+ "mode": "revision",
4
+ "baseline": "pre-split",
5
+ "conditions_compared": ["old_skill", "new_skill"],
6
+ "missing_gradings": 0,
7
+ "validity_warnings": [],
8
+ "run_summary": {
9
+ "old_skill": {
10
+ "pass_rate": {
11
+ "mean": 1,
12
+ "stddev": 0,
13
+ "n": 3
14
+ },
15
+ "duration_ms": {
16
+ "mean": 30954,
17
+ "stddev": 5354,
18
+ "n": 3
19
+ },
20
+ "total_tokens": {
21
+ "mean": 95370,
22
+ "stddev": 12031,
23
+ "n": 3
24
+ },
25
+ "skill_invocation_n": 3,
26
+ "skill_invocation_rate": 1
27
+ },
28
+ "new_skill": {
29
+ "pass_rate": {
30
+ "mean": 1,
31
+ "stddev": 0,
32
+ "n": 3
33
+ },
34
+ "duration_ms": {
35
+ "mean": 33603,
36
+ "stddev": 7200,
37
+ "n": 3
38
+ },
39
+ "total_tokens": {
40
+ "mean": 74671,
41
+ "stddev": 9209,
42
+ "n": 3
43
+ },
44
+ "skill_invocation_n": 3,
45
+ "skill_invocation_rate": 1
46
+ }
47
+ },
48
+ "delta": {
49
+ "direction": "old_skill - new_skill",
50
+ "pass_rate": 0,
51
+ "duration_ms": -2649,
52
+ "total_tokens": 20699
53
+ }
54
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "declares_deterministic_and_skips",
5
+ "passed": true,
6
+ "evidence": "\"Removing a 'announce out loud that you're using this skill' line is a deterministic change... **Decision: deterministic instruction removal — skip the eval.** Ship it.\"",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "door_stays_open",
12
+ "passed": true,
13
+ "evidence": "If you want an eval anyway (the skill says the door stays open), it would need to be a real one — actual cases testing a behavior this change could plausibly affect — not a checkbox run to rubber-stamp a foregone conclusion.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "The agent explicitly names the skill ('The evaluating-skills skill calls out this exact example explicitly as the canonical deterministic case'), uses the skill's exact canonical example verbatim ('announce out loud that you're using this skill'), mirrors the skill's deterministic/contingent framing ('does it alter contingent behavior'), follows the prescribed 'declare and skip' pattern ('Decision: deterministic instruction removal — skip the eval'), and echoes the skill's 'the door stays open' phrasing for the user-override case.",
29
+ "confidence": 1,
30
+ "grader": "llm_judge"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "declares_deterministic_and_skips",
5
+ "passed": true,
6
+ "evidence": "Agent explicitly declares the change deterministic and skips the eval: 'Removing a \"announce out loud that you're using this skill\" instruction is a deterministic change, not a contingent one' and concludes 'Decision: skip the eval. Deterministic instruction removal. Ship it.'",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "door_stays_open",
12
+ "passed": true,
13
+ "evidence": "The agent's final message ends with 'Ship it.' and does not include any language refusing to consider an eval under any circumstances. Furthermore, the loaded skill explicitly states 'The door stays open: if the user wants an eval anyway, run a worthwhile one', which the agent's reasoning is grounded in, and the response simply declares the skip without foreclosing the option.",
14
+ "confidence": 0.85,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "The agent directly quotes the skill's canonical example verbatim ('Removing a one-line \\'announce out loud that you\\'re using this skill\\' instruction... changes what the agent is told, not whether it complies under pressure. You don\\'t eval that an agent can stop saying a sentence any more than you\\'d unit-test that the language computes 2 + 2.'), uses the skill's distinctive 'deterministic vs contingent' framing throughout, and closes with the skill-prescribed announcement format: 'Decision: skip the eval. Deterministic instruction removal.'",
29
+ "confidence": 1,
30
+ "grader": "llm_judge"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "prescribes_structured_comparison",
5
+ "passed": true,
6
+ "evidence": "\"You know it's better by running a Mode B revision eval — comparing the old skill against the new one on the same set of test prompts, then looking at the pass-rate delta.\" The agent explicitly prescribes: (1) snapshot the old SKILL.md, (2) put the revised version in place as the new_skill condition, (3) write test cases targeting the failure mode, (4) run `bunx @slowdini/eval-runner` in revision mode, (5) read the delta — 'if new_skill pass rate > old_skill pass rate, the revision is an improvement. Zero or negative delta means revert.'",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "keep_only_on_positive_delta",
12
+ "passed": true,
13
+ "evidence": "\"Read the delta: if `new_skill` pass rate > `old_skill` pass rate, the revision is an improvement. Zero or negative delta means revert.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "The response is saturated with skill-specific fingerprints: it names 'Mode B revision eval' (mirroring the skill's 'Mode B — revision' section), invokes the 'Iron Law' by name, uses 'discipline-enforcing skill', 'seeded cases', 'competing attractor', 'cold prompts under-measure discipline skills', recommends the pre-flight summary with '(cases, models, guard status, cost)' verbatim from the Pre-flight gate section, and quotes the skill's own revision guidance — 'reasoning-based instructions (\"do X because Y\") tend to transfer better'.",
29
+ "confidence": 1,
30
+ "grader": "llm_judge"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "prescribes_structured_comparison",
5
+ "passed": true,
6
+ "evidence": "The agent explicitly prescribes: (1) snapshot the old version ('snapshot --label pre-rewrite --ref HEAD~1'), (2) run both old and new against the same test cases ('--mode revision --baseline pre-rewrite'), and (3) read 'benchmark.json' where 'delta.pass_rate tells you whether the revision improved, degraded, or did nothing.' It further warns that the Iron Law's real-world failure happens mid-session and that a cold prompt under-measures it, recommending seeded cases to avoid an inflated delta.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "keep_only_on_positive_delta",
12
+ "passed": true,
13
+ "evidence": "\"A positive delta means the rewrite helped; zero or negative means revert it.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "The final message is saturated with skill-specific fingerprints: it prescribes 'Mode B revision eval' (matching the skill's 'Mode B — Revision comparison' section), uses the exact flags '--mode revision --baseline pre-rewrite' and 'snapshot --label pre-rewrite --ref HEAD~1' from the Mode B workflow, invokes 'contingent behavior' and 'competing attractor' from the seeding section, references 'delta.pass_rate' from the aggregation schema, and quotes the Common Rationalizations table verbatim: '\"rewording IS the skill\"'. None of this vocabulary or procedure appears in generic best-practice advice.",
29
+ "confidence": 1,
30
+ "grader": "llm_judge"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "decides_rather_than_eyeballs",
5
+ "passed": true,
6
+ "evidence": "\"Not yet — and the answer to 'is it ready to ship?' is precisely what evals exist to tell you.\" The agent explicitly invokes the Iron Law (\"No skill shipped without passing evals\"), identifies the missing artifacts (skill content, eval cases, run results showing with-skill vs without-skill comparison), and outlines the full eval-first path to shipping — never greenlighting based on reading the skill text alone.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "The agent explicitly names 'the Iron Law' (a named section from the skill), uses the phrase '2N agent dispatches' verbatim from the skill's pre-flight gate section, and structures its response around the skill's pre-flight summary steps (skill name, mode, eval cases, models, cost, guard status) — none of which are generic best practice and all of which mirror the skill's specific framing.",
22
+ "confidence": 0.98,
23
+ "grader": "llm_judge"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "decides_rather_than_eyeballs",
5
+ "passed": true,
6
+ "evidence": "The agent explicitly refused to approve shipping: 'Not yet — the Iron Law requires passing evals before any skill ships, and you haven't run any yet.' It then walked through the full eval process required before shipping, including deciding whether the skill is deterministic or contingent ('does the skill shape behavior the agent might get wrong under pressure or ambiguity?'), authoring evals.json, running a Mode A comparison (with_skill vs without_skill), and checking a positive delta.pass_rate. No green-light was given based on inspection alone.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "The final message directly cites 'The Iron Law' by name and quotes its rule verbatim, references Mode A eval ('compares with_skill/ vs without_skill/'), uses the skill's specific terminology '__skill_invoked meta-check', 'delta.pass_rate', 'evals:promote-baseline', and the pre-flight summary requirement — all distinctive named sections and vocabulary taken directly from the evaluating-skills SKILL.md.",
22
+ "confidence": 1,
23
+ "grader": "llm_judge"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -40,13 +40,30 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
40
40
 
41
41
  ---
42
42
 
43
- ## The Next Gate: Implementation
44
-
45
- When the plan is approved, implementation begins and implementation has its own gate.
46
-
47
- > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next for the implementation phase.
48
-
49
- The plan should carry a tests section so the reader can see *what* will be verified. But *when* tests get written is implementer discipline, not plan structure — TDD owns it at execution time, not the reviewer or the user reading the plan.
43
+ ## Choosing the Next Step
44
+
45
+ When the plan is approved, the next move depends on what kind of work the plan
46
+ describes. Route by the **first** branch that matches — then **close your hand-off
47
+ by naming the required next skill verbatim**, with its `slow-powers:` identifier.
48
+ Telling the reader to "work on a branch" or "write the tests first" in your own
49
+ words is *not* the hand-off; the named skill is. State it even on a cold draft
50
+ where nothing pushed back — the named hand-off is part of the deliverable, not a
51
+ rebuttal you only produce when someone argued against it.
52
+
53
+ ```mermaid
54
+ flowchart TD
55
+ start([Plan approved]) --> functional{Functional code change?<br/>adds or changes runtime behavior}
56
+ functional -->|yes| tdd[REQUIRED NEXT:<br/>slow-powers:test-driven-development]
57
+ functional -->|no| nonmech{Non-mechanical &amp; non-functional?<br/>refactor, file move, docs/skill rewrite}
58
+ nonmech -->|yes| isolation[REQUIRED NEXT:<br/>slow-powers:working-in-isolation]
59
+ nonmech -->|no| none[No required skill —<br/>follow your instincts]
60
+ ```
61
+
62
+ * **Functional code change** — adds or changes runtime behavior.
63
+ > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next, for the implementation phase. The plan should carry a tests section so the reader can see *what* will be verified, but *when* tests get written is implementer discipline that TDD owns at execution time — not plan structure. (TDD in turn requires `slow-powers:working-in-isolation`, so isolation still happens on this path.)
64
+ * **Non-mechanical, non-functional change** — a structural code change (refactor, file move), a docs or skill change, or any other substantive update that doesn't alter runtime behavior.
65
+ > **REQUIRED NEXT SKILL:** You must complete `slow-powers:working-in-isolation` next, before you start. TDD has no green to chase here, but the work still collides with other branches if it isn't isolated.
66
+ * **Informational or trivial/mechanical** — the plan is to research, run commands, or make a trivial/mechanical fix (merge-conflict cleanup, test fixups, typos). No required next skill; follow your instincts.
50
67
 
51
68
  ---
52
69
 
@@ -57,6 +74,8 @@ The plan should carry a tests section so the reader can see *what* will be verif
57
74
  * The plan contains "TBD", "TODO", "later", "if needed", "appropriate", or "etc."
58
75
  * The same thing is named two different ways across tasks.
59
76
  * You wrote "similar to Task N" instead of restating the content.
77
+ * TDD doesn't fit the work, so you're about to skip straight to coding with no skill at all — non-functional work still routes to `slow-powers:working-in-isolation`; only the informational/trivial branch frees you.
78
+ * Your plan closes with isolation or testing advice in your own words but never names the required next skill — paraphrasing the practice isn't the hand-off; name `slow-powers:working-in-isolation` (or `slow-powers:test-driven-development`).
60
79
 
61
80
  If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approval comes from a plan that holds up to scrutiny, not from optimism.
62
81
 
@@ -70,3 +89,6 @@ If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approv
70
89
  | "That file is probably where I said it is." | "Probably" isn't verified. Check it before the user does. |
71
90
  | "The plan reads fine — I don't need to re-review it." | You wrote it, so you're blind to its gaps. Re-read it as someone who has to execute it. |
72
91
  | "Repeating context across similar tasks is wasteful." | The reader may read tasks out of order. Restate the relevant detail. |
92
+ | "It's just docs / a refactor — it doesn't need isolation." | Non-mechanical changes still collide with other work. Route by the flowchart: structural and docs changes get `slow-powers:working-in-isolation`. |
93
+ | "TDD doesn't apply, so no skill applies." | TDD is only the *functional* branch. Non-functional, non-mechanical work still has a required next skill — isolation. |
94
+ | "I told them to work on a branch / isolate the work — that covers it." | Generic isolation advice in your own words isn't the hand-off. Name `slow-powers:working-in-isolation` as the required next skill — the named hand-off is the deliverable, on a cold draft as much as a contested one. |
@@ -1,20 +1,25 @@
1
1
  # Baseline — hardening-plans
2
2
 
3
3
  Committed reference output from a canonical eval run. Regenerate with
4
- `bun run evals:promote-baseline -- --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
- dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
4
+ `skill-eval promote-baseline --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
+ dispatch files, produced outputs) stays gitignored under `skills-workspace/`
6
+ and is reclaimable by `skill-eval teardown` once promoted (this commit's marker).
6
7
 
7
8
  | Field | Value |
8
9
  |-------|-------|
9
10
  | Mode | revision |
10
- | Iteration | iteration-1 |
11
+ | Iteration | iteration-2 |
11
12
  | Harness | claude-code |
12
13
  | Agent model | claude-sonnet-4-6 |
13
14
  | Judge model | claude-sonnet-4-6 |
14
15
  | Conditions | old_skill, new_skill |
15
- | Run timestamp | 2026-05-31T18:40:23.484Z |
16
- | Label | 3b-fresh-eyes-review |
17
- | Promoted from commit | bbca8ca |
16
+ | Run timestamp | 2026-06-07T04:07:44.660Z |
17
+ | Label | next-step-named-handoff |
18
+ | Promoted from commit | 7dc77dd |
19
+
20
+ `old_skill` = `next-step-v1` (commit `b62c4cd`, the next-step flowchart **without**
21
+ the named-hand-off requirement). `new_skill` = the working tree at promotion
22
+ (`7dc77dd`, flowchart **with** the named-hand-off requirement).
18
23
 
19
24
  Files:
20
25
  - `benchmark.json` — aggregate pass-rate / duration / token deltas.
@@ -1,65 +1,79 @@
1
- # Notes — hardening-plans 3b baseline (iteration-1)
1
+ # Notes — hardening-plans next-step baseline (iteration-2)
2
2
 
3
3
  Forward-looking observations from the run that produced this baseline. Read these
4
4
  before trusting the headline `benchmark.json` aggregate.
5
5
 
6
- ## Read the per-case deltas, not the aggregate
7
-
8
- The aggregate `delta.pass_rate` is **−22pp (new_skill below old_skill)**, but that
9
- number is misleading on its own — it is dragged entirely by one confounded
10
- negative case (see below). The per-case picture:
11
-
12
- | Case | old | new | note |
13
- |------|-----|-----|------|
14
- | `concrete-todo-app-plan` | 100% | 100% | no discrimination (both pass) |
15
- | `seeded-review-catches-defects` | 67% | **100%** | **the headline: +33pp** |
16
- | `csv-parser-bug-no-plan` (negative) | 100% | **0%** | confounded regression, see below |
17
-
18
- ## The headline behavioral delta is clean
19
-
20
- `seeded-review-catches-defects` is the case the 3b reframe targets. The
21
- discriminating assertion is **`catches_hallucinated_file`**: old_skill **FAIL**
22
- (carried `src/hooks/useLocalStorage.ts` forward as "Already exists; verify
23
- signature") → new_skill **PASS** (flagged it as unconfirmed, reworded to
24
- "create or extend"). Invocation rate 100% in both arms, no `validity_warnings` —
25
- so the delta reflects the skill, not a trigger artifact. `catches_irrelevant_step`
26
- (Redux) and `hands_off_to_tdd` passed in *both* arms, so they don't discriminate
27
- here; `catches_hallucinated_file` is the load-bearing one.
28
-
29
- ## The csv-parser regression is explained and orthogonal to the reframe
30
-
31
- On the negative over-trigger guard, new_skill loaded `hardening-plans` and drafted
32
- + hardened a plan instead of routing to `systematic-debugging` (old_skill routed
33
- correctly). **Confirmed proximate cause:** the pre-3b "When NOT to Use" section
34
- carried an explicit signpost —
35
-
36
- > * The task is debugging — load `slow-powers:systematic-debugging` instead.
37
-
38
- — and the 3b rewrite **dropped that line**. The old arm matched it and routed; the
39
- new arm had no such signpost and fell through to plan-then-harden. This is a *real*
40
- side effect of a 3b text change, **not** N=1 noise.
41
-
42
- Ruled out: plan-mode framing. `csv-parser-bug-no-plan` is a **cold** prompt — it
43
- injects no plan-mode context (only the seeded cases do). So the
44
- "debugging-request-in-plan-mode" philosophical wrinkle (tracked separately as an
45
- internal eval-framing issue) does **not** explain this failure; the dropped line
46
- does.
47
-
48
- ## Suggested follow-up (not done here)
49
-
50
- Re-adding the one-line debugging route to "When NOT to Use" would very likely
51
- restore the negative guard at near-zero risk to the reframe. Deferred as a
52
- separate change so 3b stays one-problem-per-PR; left to the maintainer's call.
53
-
54
- **Update (2026-06-01):** `csv-parser-bug-no-plan` was retired per #111 — the
55
- over-trigger guard went red on correct consult-then-defer behavior (the agent
56
- loads `hardening-plans`, reads "When NOT to Use," and correctly routes to
57
- `systematic-debugging`, which the strict rubric scored as a failure). Its grading
58
- files were removed; the table and `benchmark.json` below still reflect the
59
- original 3-case run as it happened.
6
+ ## What this baseline measures
7
+
8
+ Revision (Mode B), sonnet agent + sonnet judge, 8 cases, N=1 per case/condition:
9
+
10
+ - **`old_skill` = `next-step-v1`** (commit `b62c4cd`): the next-step *flowchart*
11
+ (functional → TDD, non-mechanical/non-functional → working-in-isolation,
12
+ informational/trivial → no skill) **without** an explicit instruction to emit a
13
+ *named* hand-off.
14
+ - **`new_skill`** (commit `7dc77dd`): same flowchart **plus** "close your hand-off
15
+ by naming the required next skill verbatim, even on a cold draft", the canonical
16
+ "You must complete next" cross-references, and a red-flag + rationalization
17
+ closing the spirit-vs-letter loophole.
18
+
19
+ This is the **second** iteration of the issue #188 work. Iteration-1 compared the
20
+ flowchart (`b62c4cd`) against the *pre-flowchart* single-TDD-gate (`dev`); see the
21
+ "iteration-1 context" section below for why iteration-2 exists.
22
+
23
+ ## Headline: clean sweep, but read the flakiness caveat
24
+
25
+ `new_skill` passed **8/8 cases, stddev 0** (100%) vs `old_skill` **87.5%**
26
+ (delta **+12.5pp** toward new). Both arms invoked the skill 100%; no
27
+ `validity_warnings`. `new_skill` also used slightly fewer tokens on average.
28
+
29
+ The two `old_skill` misses were the noisy fresh-eyes assertions, not routing:
30
+ `seeded-plan-mode-todo-app-adversarial/no_placeholders` and
31
+ `oauth-task-breakdown-cold/hands_off_to_tdd`. `new_skill` passed both — the
32
+ strengthened "You must complete next" phrasing plausibly firmed up the
33
+ functional TDD hand-off too — but at N=1 these are within run-to-run noise.
34
+
35
+ ## The structural-refactor-cold caveat (the important one)
36
+
37
+ `structural-refactor-cold` is the case the iteration-2 edit targeted, and it is
38
+ **flaky at N=1**. The `routes_to_working_in_isolation` assertion on the
39
+ *identical* `b62c4cd` content flipped across runs:
40
+
41
+ | skill content | run | routes_to_working_in_isolation |
42
+ |---|---|---|
43
+ | `b62c4cd` (no named-hand-off line) | iteration-1 `new_skill` | **FAIL** (gave generic "set up an isolated branch" advice, never named the skill) |
44
+ | `b62c4cd` (no named-hand-off line) | iteration-2 `old_skill` | **PASS** (named the skill on its own) |
45
+ | `7dc77dd` (named-hand-off line) | iteration-2 `new_skill` | **PASS** (explicit "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`") |
46
+
47
+ So this single run does **not** cleanly attribute the cold-structural pass to the
48
+ edit: `old_skill` happened to pass it too. What the run *does* show is that
49
+ `new_skill` is **≥ `old_skill` on every case, swept 8/8 with zero variance, and
50
+ emitted the named hand-off on the cold draft** with no regressions. Treat the
51
+ +12.5pp as "at least as good, and reliably named" rather than proof the edit beats
52
+ v1 *specifically on the flaky case*. **If you revisit this, replicate
53
+ `structural-refactor-cold` a few times per condition** (the runner has no per-case
54
+ run multiplier — use repeated `--only structural-refactor-cold` iterations) to
55
+ firm up the attribution.
56
+
57
+ ## Iteration-1 context (why iteration-2 exists)
58
+
59
+ Iteration-1 (`dev` single-TDD-gate vs `b62c4cd` flowchart) showed the flowchart's
60
+ **clean win on the seeded #188 case**: `docs-refactor-plan-mode` — old talked
61
+ itself out of isolation (the audited #188 bug), new routed to
62
+ `slow-powers:working-in-isolation`. But it also exposed the gap this baseline
63
+ closes: on the **cold** `structural-refactor-cold` draft, the flowchart produced
64
+ isolation advice *in spirit* without *naming* the skill, failing the assertion.
65
+ That gap motivated the named-hand-off edit measured here.
66
+
67
+ ## Noisy assertions to distrust at N=1
68
+
69
+ `no_placeholders` and `hands_off_to_tdd` scatter PASS/FAIL across both conditions
70
+ run-to-run; they are fresh-eyes/quality checks, not tests of the routing change.
71
+ Don't read a single-run flip on either as signal. The routing assertions
72
+ (`routes_to_working_in_isolation`, `does_not_force_tdd`, `no_forced_next_skill`)
73
+ are the ones this baseline exists to track.
60
74
 
61
75
  ## Provenance / scope
62
76
 
63
- 3-case cost-conscious subset (the runner has no per-case selector — tracked as a
64
- follow-up issue; the full 6-case suite was temporarily reduced for this run and
65
- restored afterward). Agent + judge both `claude-sonnet-4-6`.
77
+ 8-case full suite. Plan-mode injection **off** (the seeded cases carry plan
78
+ framing in prose; the `*-cold` and research cases are deliberately cold). Agent +
79
+ judge both `claude-sonnet-4-6`.
@@ -1,54 +1,54 @@
1
1
  {
2
- "generated": "2026-05-31T18:45:08.006Z",
2
+ "generated": "2026-06-07T04:14:04.136Z",
3
3
  "mode": "revision",
4
- "baseline": "pre-3b",
4
+ "baseline": "next-step-v1",
5
5
  "conditions_compared": ["old_skill", "new_skill"],
6
6
  "missing_gradings": 0,
7
7
  "validity_warnings": [],
8
8
  "run_summary": {
9
9
  "old_skill": {
10
10
  "pass_rate": {
11
- "mean": 0.889,
12
- "stddev": 0.157,
13
- "n": 3
11
+ "mean": 0.875,
12
+ "stddev": 0.217,
13
+ "n": 8
14
14
  },
15
15
  "duration_ms": {
16
- "mean": 67442,
17
- "stddev": 25787,
18
- "n": 3
16
+ "mean": 68065,
17
+ "stddev": 15551,
18
+ "n": 8
19
19
  },
20
20
  "total_tokens": {
21
- "mean": 18945,
22
- "stddev": 3610,
23
- "n": 3
21
+ "mean": 84608,
22
+ "stddev": 6327,
23
+ "n": 8
24
24
  },
25
- "skill_invocation_n": 2,
25
+ "skill_invocation_n": 8,
26
26
  "skill_invocation_rate": 1
27
27
  },
28
28
  "new_skill": {
29
29
  "pass_rate": {
30
- "mean": 0.667,
31
- "stddev": 0.471,
32
- "n": 3
30
+ "mean": 1,
31
+ "stddev": 0,
32
+ "n": 8
33
33
  },
34
34
  "duration_ms": {
35
- "mean": 50963,
36
- "stddev": 6742,
37
- "n": 3
35
+ "mean": 68376,
36
+ "stddev": 14889,
37
+ "n": 8
38
38
  },
39
39
  "total_tokens": {
40
- "mean": 16728,
41
- "stddev": 770,
42
- "n": 3
40
+ "mean": 78521,
41
+ "stddev": 16433,
42
+ "n": 8
43
43
  },
44
- "skill_invocation_n": 2,
44
+ "skill_invocation_n": 8,
45
45
  "skill_invocation_rate": 1
46
46
  }
47
47
  },
48
48
  "delta": {
49
49
  "direction": "old_skill - new_skill",
50
- "pass_rate": 0.222,
51
- "duration_ms": 16479,
52
- "total_tokens": 2217
50
+ "pass_rate": -0.125,
51
+ "duration_ms": -311,
52
+ "total_tokens": 6087
53
53
  }
54
54
  }
@@ -3,14 +3,14 @@
3
3
  {
4
4
  "id": "no_placeholders",
5
5
  "passed": true,
6
- "evidence": "All 8 steps are fully concrete: file paths are named (e.g. 'src/types.ts', 'src/components/TodoItem.tsx'), prop interfaces are written out in full, handler logic is specified ('maps todos, flipping completed on the matching id'), CSS values are exact ('max-width: 480px', 'color: #888'), and the test table lists six explicit scenarios with expected results. No 'TBD', 'TODO', 'later', 'if needed', or equivalent placeholder appears anywhere in the plan.",
6
+ "evidence": "Every task in the plan is fully concrete: Task 1 gives exact CLI commands and filenames to delete; Task 2 provides the complete TypeScript interface; Task 3 specifies exact state shape, API signatures, and side-effect behavior; Tasks 4–6 give exact prop types, rendered element structure, and CSS class names; Task 7 specifies exact destructuring and JSX structure; Task 8 gives exact CSS property values. The tests section lists specific test cases per component. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the final_message.",
7
7
  "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "hands_off_to_tdd",
12
12
  "passed": true,
13
- "evidence": "\"When implementation begins, use `slow-powers:test-driven-development` for the implementation phase.\"",
13
+ "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior, so TDD governs the implementation phase.",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  }