npm - @slowdini/slow-powers-opencode - Versions diffs - 0.3.0 → 0.4.1 - Mend

@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

package/skills/evaluating-skills/evals/baseline/benchmark.json ADDED Viewed

@@ -0,0 +1,54 @@
+{
+  "generated": "2026-06-06T05:28:23.426Z",
+  "mode": "revision",
+  "baseline": "pre-split",
+  "conditions_compared": ["old_skill", "new_skill"],
+  "missing_gradings": 0,
+  "validity_warnings": [],
+  "run_summary": {
+    "old_skill": {
+      "pass_rate": {
+        "mean": 1,
+        "stddev": 0,
+        "n": 3
+      },
+      "duration_ms": {
+        "mean": 30954,
+        "stddev": 5354,
+        "n": 3
+      },
+      "total_tokens": {
+        "mean": 95370,
+        "stddev": 12031,
+        "n": 3
+      },
+      "skill_invocation_n": 3,
+      "skill_invocation_rate": 1
+    },
+    "new_skill": {
+      "pass_rate": {
+        "mean": 1,
+        "stddev": 0,
+        "n": 3
+      },
+      "duration_ms": {
+        "mean": 33603,
+        "stddev": 7200,
+        "n": 3
+      },
+      "total_tokens": {
+        "mean": 74671,
+        "stddev": 9209,
+        "n": 3
+      },
+      "skill_invocation_n": 3,
+      "skill_invocation_rate": 1
+    }
+  },
+  "delta": {
+    "direction": "old_skill - new_skill",
+    "pass_rate": 0,
+    "duration_ms": -2649,
+    "total_tokens": 20699
+  }
+}

package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json ADDED Viewed

@@ -0,0 +1,39 @@
+{
+  "assertion_results": [
+    {
+      "id": "declares_deterministic_and_skips",
+      "passed": true,
+      "evidence": "\"Removing a 'announce out loud that you're using this skill' line is a deterministic change... **Decision: deterministic instruction removal — skip the eval.** Ship it.\"",
+      "confidence": 1,
+      "grader": "llm_judge"
+    },
+    {
+      "id": "door_stays_open",
+      "passed": true,
+      "evidence": "If you want an eval anyway (the skill says the door stays open), it would need to be a real one — actual cases testing a behavior this change could plausibly affect — not a checkbox run to rubber-stamp a foregone conclusion.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "summary": {
+    "passed": 2,
+    "failed": 0,
+    "total": 2,
+    "pass_rate": 1
+  },
+  "meta_results": [
+    {
+      "id": "__skill_invoked",
+      "passed": true,
+      "evidence": "The agent explicitly names the skill ('The evaluating-skills skill calls out this exact example explicitly as the canonical deterministic case'), uses the skill's exact canonical example verbatim ('announce out loud that you're using this skill'), mirrors the skill's deterministic/contingent framing ('does it alter contingent behavior'), follows the prescribed 'declare and skip' pattern ('Decision: deterministic instruction removal — skip the eval'), and echoes the skill's 'the door stays open' phrasing for the user-override case.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "meta_summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "skill_invoked": true
+  }
+}

package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json ADDED Viewed

@@ -0,0 +1,39 @@
+{
+  "assertion_results": [
+    {
+      "id": "declares_deterministic_and_skips",
+      "passed": true,
+      "evidence": "Agent explicitly declares the change deterministic and skips the eval: 'Removing a \"announce out loud that you're using this skill\" instruction is a deterministic change, not a contingent one' and concludes 'Decision: skip the eval. Deterministic instruction removal. Ship it.'",
+      "confidence": 1,
+      "grader": "llm_judge"
+    },
+    {
+      "id": "door_stays_open",
+      "passed": true,
+      "evidence": "The agent's final message ends with 'Ship it.' and does not include any language refusing to consider an eval under any circumstances. Furthermore, the loaded skill explicitly states 'The door stays open: if the user wants an eval anyway, run a worthwhile one', which the agent's reasoning is grounded in, and the response simply declares the skip without foreclosing the option.",
+      "confidence": 0.85,
+      "grader": "llm_judge"
+    }
+  ],
+  "summary": {
+    "passed": 2,
+    "failed": 0,
+    "total": 2,
+    "pass_rate": 1
+  },
+  "meta_results": [
+    {
+      "id": "__skill_invoked",
+      "passed": true,
+      "evidence": "The agent directly quotes the skill's canonical example verbatim ('Removing a one-line \\'announce out loud that you\\'re using this skill\\' instruction... changes what the agent is told, not whether it complies under pressure. You don\\'t eval that an agent can stop saying a sentence any more than you\\'d unit-test that the language computes 2 + 2.'), uses the skill's distinctive 'deterministic vs contingent' framing throughout, and closes with the skill-prescribed announcement format: 'Decision: skip the eval. Deterministic instruction removal.'",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "meta_summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "skill_invoked": true
+  }
+}

package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json ADDED Viewed

@@ -0,0 +1,39 @@
+{
+  "assertion_results": [
+    {
+      "id": "prescribes_structured_comparison",
+      "passed": true,
+      "evidence": "\"You know it's better by running a Mode B revision eval — comparing the old skill against the new one on the same set of test prompts, then looking at the pass-rate delta.\" The agent explicitly prescribes: (1) snapshot the old SKILL.md, (2) put the revised version in place as the new_skill condition, (3) write test cases targeting the failure mode, (4) run `bunx @slowdini/eval-runner` in revision mode, (5) read the delta — 'if new_skill pass rate > old_skill pass rate, the revision is an improvement. Zero or negative delta means revert.'",
+      "confidence": 1,
+      "grader": "llm_judge"
+    },
+    {
+      "id": "keep_only_on_positive_delta",
+      "passed": true,
+      "evidence": "\"Read the delta: if `new_skill` pass rate > `old_skill` pass rate, the revision is an improvement. Zero or negative delta means revert.\"",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "summary": {
+    "passed": 2,
+    "failed": 0,
+    "total": 2,
+    "pass_rate": 1
+  },
+  "meta_results": [
+    {
+      "id": "__skill_invoked",
+      "passed": true,
+      "evidence": "The response is saturated with skill-specific fingerprints: it names 'Mode B revision eval' (mirroring the skill's 'Mode B — revision' section), invokes the 'Iron Law' by name, uses 'discipline-enforcing skill', 'seeded cases', 'competing attractor', 'cold prompts under-measure discipline skills', recommends the pre-flight summary with '(cases, models, guard status, cost)' verbatim from the Pre-flight gate section, and quotes the skill's own revision guidance — 'reasoning-based instructions (\"do X because Y\") tend to transfer better'.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "meta_summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "skill_invoked": true
+  }
+}

package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json ADDED Viewed

@@ -0,0 +1,39 @@
+{
+  "assertion_results": [
+    {
+      "id": "prescribes_structured_comparison",
+      "passed": true,
+      "evidence": "The agent explicitly prescribes: (1) snapshot the old version ('snapshot --label pre-rewrite --ref HEAD~1'), (2) run both old and new against the same test cases ('--mode revision --baseline pre-rewrite'), and (3) read 'benchmark.json' where 'delta.pass_rate tells you whether the revision improved, degraded, or did nothing.' It further warns that the Iron Law's real-world failure happens mid-session and that a cold prompt under-measures it, recommending seeded cases to avoid an inflated delta.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    },
+    {
+      "id": "keep_only_on_positive_delta",
+      "passed": true,
+      "evidence": "\"A positive delta means the rewrite helped; zero or negative means revert it.\"",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "summary": {
+    "passed": 2,
+    "failed": 0,
+    "total": 2,
+    "pass_rate": 1
+  },
+  "meta_results": [
+    {
+      "id": "__skill_invoked",
+      "passed": true,
+      "evidence": "The final message is saturated with skill-specific fingerprints: it prescribes 'Mode B revision eval' (matching the skill's 'Mode B — Revision comparison' section), uses the exact flags '--mode revision --baseline pre-rewrite' and 'snapshot --label pre-rewrite --ref HEAD~1' from the Mode B workflow, invokes 'contingent behavior' and 'competing attractor' from the seeding section, references 'delta.pass_rate' from the aggregation schema, and quotes the Common Rationalizations table verbatim: '\"rewording IS the skill\"'. None of this vocabulary or procedure appears in generic best-practice advice.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "meta_summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "skill_invoked": true
+  }
+}

package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "assertion_results": [
+    {
+      "id": "decides_rather_than_eyeballs",
+      "passed": true,
+      "evidence": "\"Not yet — and the answer to 'is it ready to ship?' is precisely what evals exist to tell you.\" The agent explicitly invokes the Iron Law (\"No skill shipped without passing evals\"), identifies the missing artifacts (skill content, eval cases, run results showing with-skill vs without-skill comparison), and outlines the full eval-first path to shipping — never greenlighting based on reading the skill text alone.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "pass_rate": 1
+  },
+  "meta_results": [
+    {
+      "id": "__skill_invoked",
+      "passed": true,
+      "evidence": "The agent explicitly names 'the Iron Law' (a named section from the skill), uses the phrase '2N agent dispatches' verbatim from the skill's pre-flight gate section, and structures its response around the skill's pre-flight summary steps (skill name, mode, eval cases, models, cost, guard status) — none of which are generic best practice and all of which mirror the skill's specific framing.",
+      "confidence": 0.98,
+      "grader": "llm_judge"
+    }
+  ],
+  "meta_summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "skill_invoked": true
+  }
+}

package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json ADDED Viewed

@@ -0,0 +1,32 @@
+{
+  "assertion_results": [
+    {
+      "id": "decides_rather_than_eyeballs",
+      "passed": true,
+      "evidence": "The agent explicitly refused to approve shipping: 'Not yet — the Iron Law requires passing evals before any skill ships, and you haven't run any yet.' It then walked through the full eval process required before shipping, including deciding whether the skill is deterministic or contingent ('does the skill shape behavior the agent might get wrong under pressure or ambiguity?'), authoring evals.json, running a Mode A comparison (with_skill vs without_skill), and checking a positive delta.pass_rate. No green-light was given based on inspection alone.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "pass_rate": 1
+  },
+  "meta_results": [
+    {
+      "id": "__skill_invoked",
+      "passed": true,
+      "evidence": "The final message directly cites 'The Iron Law' by name and quotes its rule verbatim, references Mode A eval ('compares with_skill/ vs without_skill/'), uses the skill's specific terminology '__skill_invoked meta-check', 'delta.pass_rate', 'evals:promote-baseline', and the pre-flight summary requirement — all distinctive named sections and vocabulary taken directly from the evaluating-skills SKILL.md.",
+      "confidence": 1,
+      "grader": "llm_judge"
+    }
+  ],
+  "meta_summary": {
+    "passed": 1,
+    "failed": 0,
+    "total": 1,
+    "skill_invoked": true
+  }
+}

package/skills/hardening-plans/SKILL.md CHANGED Viewed

@@ -40,13 +40,30 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
 ---
-## The Next Gate: Implementation
-When the plan is approved, implementation begins — and implementation has its own gate.
-> **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next for the implementation phase.
-The plan should carry a tests section so the reader can see *what* will be verified. But *when* tests get written is implementer discipline, not plan structure — TDD owns it at execution time, not the reviewer or the user reading the plan.
+## Choosing the Next Step
+When the plan is approved, the next move depends on what kind of work the plan
+describes. Route by the **first** branch that matches — then **close your hand-off
+by naming the required next skill verbatim**, with its `slow-powers:` identifier.
+Telling the reader to "work on a branch" or "write the tests first" in your own
+words is *not* the hand-off; the named skill is. State it even on a cold draft
+where nothing pushed back — the named hand-off is part of the deliverable, not a
+rebuttal you only produce when someone argued against it.
+```mermaid
+flowchart TD
+    start([Plan approved]) --> functional{Functional code change?<br/>adds or changes runtime behavior}
+    functional -->|yes| tdd[REQUIRED NEXT:<br/>slow-powers:test-driven-development]
+    functional -->|no| nonmech{Non-mechanical &amp; non-functional?<br/>refactor, file move, docs/skill rewrite}
+    nonmech -->|yes| isolation[REQUIRED NEXT:<br/>slow-powers:working-in-isolation]
+    nonmech -->|no| none[No required skill —<br/>follow your instincts]
+```
+* **Functional code change** — adds or changes runtime behavior.
+  > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next, for the implementation phase. The plan should carry a tests section so the reader can see *what* will be verified, but *when* tests get written is implementer discipline that TDD owns at execution time — not plan structure. (TDD in turn requires `slow-powers:working-in-isolation`, so isolation still happens on this path.)
+* **Non-mechanical, non-functional change** — a structural code change (refactor, file move), a docs or skill change, or any other substantive update that doesn't alter runtime behavior.
+  > **REQUIRED NEXT SKILL:** You must complete `slow-powers:working-in-isolation` next, before you start. TDD has no green to chase here, but the work still collides with other branches if it isn't isolated.
+* **Informational or trivial/mechanical** — the plan is to research, run commands, or make a trivial/mechanical fix (merge-conflict cleanup, test fixups, typos). No required next skill; follow your instincts.
 ---
@@ -57,6 +74,8 @@ The plan should carry a tests section so the reader can see *what* will be verif
 * The plan contains "TBD", "TODO", "later", "if needed", "appropriate", or "etc."
 * The same thing is named two different ways across tasks.
 * You wrote "similar to Task N" instead of restating the content.
+* TDD doesn't fit the work, so you're about to skip straight to coding with no skill at all — non-functional work still routes to `slow-powers:working-in-isolation`; only the informational/trivial branch frees you.
+* Your plan closes with isolation or testing advice in your own words but never names the required next skill — paraphrasing the practice isn't the hand-off; name `slow-powers:working-in-isolation` (or `slow-powers:test-driven-development`).
 If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approval comes from a plan that holds up to scrutiny, not from optimism.
@@ -70,3 +89,6 @@ If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approv
 | "That file is probably where I said it is." | "Probably" isn't verified. Check it before the user does. |
 | "The plan reads fine — I don't need to re-review it." | You wrote it, so you're blind to its gaps. Re-read it as someone who has to execute it. |
 | "Repeating context across similar tasks is wasteful." | The reader may read tasks out of order. Restate the relevant detail. |
+| "It's just docs / a refactor — it doesn't need isolation." | Non-mechanical changes still collide with other work. Route by the flowchart: structural and docs changes get `slow-powers:working-in-isolation`. |
+| "TDD doesn't apply, so no skill applies." | TDD is only the *functional* branch. Non-functional, non-mechanical work still has a required next skill — isolation. |
+| "I told them to work on a branch / isolate the work — that covers it." | Generic isolation advice in your own words isn't the hand-off. Name `slow-powers:working-in-isolation` as the required next skill — the named hand-off is the deliverable, on a cold draft as much as a contested one. |

package/skills/hardening-plans/evals/baseline/BASELINE.md CHANGED Viewed

@@ -1,20 +1,25 @@
 # Baseline — hardening-plans
 Committed reference output from a canonical eval run. Regenerate with
-`bun run evals:promote-baseline -- --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
-dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
+`skill-eval promote-baseline --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
+dispatch files, produced outputs) stays gitignored under `skills-workspace/`
+and is reclaimable by `skill-eval teardown` once promoted (this commit's marker).
 | Field | Value |
 |-------|-------|
 | Mode | revision |
-| Iteration | iteration-1 |
+| Iteration | iteration-2 |
 | Harness | claude-code |
 | Agent model | claude-sonnet-4-6 |
 | Judge model | claude-sonnet-4-6 |
 | Conditions | old_skill, new_skill |
-| Run timestamp | 2026-05-31T18:40:23.484Z |
-| Label | 3b-fresh-eyes-review |
-| Promoted from commit | bbca8ca |
+| Run timestamp | 2026-06-07T04:07:44.660Z |
+| Label | next-step-named-handoff |
+| Promoted from commit | 7dc77dd |
+`old_skill` = `next-step-v1` (commit `b62c4cd`, the next-step flowchart **without**
+the named-hand-off requirement). `new_skill` = the working tree at promotion
+(`7dc77dd`, flowchart **with** the named-hand-off requirement).
 Files:
 - `benchmark.json` — aggregate pass-rate / duration / token deltas.

package/skills/hardening-plans/evals/baseline/NOTES.md CHANGED Viewed

@@ -1,65 +1,79 @@
-# Notes — hardening-plans 3b baseline (iteration-1)
+# Notes — hardening-plans next-step baseline (iteration-2)
 Forward-looking observations from the run that produced this baseline. Read these
 before trusting the headline `benchmark.json` aggregate.
-## Read the per-case deltas, not the aggregate
-The aggregate `delta.pass_rate` is **−22pp (new_skill below old_skill)**, but that
-number is misleading on its own — it is dragged entirely by one confounded
-negative case (see below). The per-case picture:
-| Case | old | new | note |
-|------|-----|-----|------|
-| `concrete-todo-app-plan` | 100% | 100% | no discrimination (both pass) |
-| `seeded-review-catches-defects` | 67% | **100%** | **the headline: +33pp** |
-| `csv-parser-bug-no-plan` (negative) | 100% | **0%** | confounded regression, see below |
-## The headline behavioral delta is clean
-`seeded-review-catches-defects` is the case the 3b reframe targets. The
-discriminating assertion is **`catches_hallucinated_file`**: old_skill **FAIL**
-(carried `src/hooks/useLocalStorage.ts` forward as "Already exists; verify
-signature") → new_skill **PASS** (flagged it as unconfirmed, reworded to
-"create or extend"). Invocation rate 100% in both arms, no `validity_warnings` —
-so the delta reflects the skill, not a trigger artifact. `catches_irrelevant_step`
-(Redux) and `hands_off_to_tdd` passed in *both* arms, so they don't discriminate
-here; `catches_hallucinated_file` is the load-bearing one.
-## The csv-parser regression is explained and orthogonal to the reframe
-On the negative over-trigger guard, new_skill loaded `hardening-plans` and drafted
-+ hardened a plan instead of routing to `systematic-debugging` (old_skill routed
-correctly). **Confirmed proximate cause:** the pre-3b "When NOT to Use" section
-carried an explicit signpost —
-> * The task is debugging — load `slow-powers:systematic-debugging` instead.
-— and the 3b rewrite **dropped that line**. The old arm matched it and routed; the
-new arm had no such signpost and fell through to plan-then-harden. This is a *real*
-side effect of a 3b text change, **not** N=1 noise.
-Ruled out: plan-mode framing. `csv-parser-bug-no-plan` is a **cold** prompt — it
-injects no plan-mode context (only the seeded cases do). So the
-"debugging-request-in-plan-mode" philosophical wrinkle (tracked separately as an
-internal eval-framing issue) does **not** explain this failure; the dropped line
-does.
-## Suggested follow-up (not done here)
-Re-adding the one-line debugging route to "When NOT to Use" would very likely
-restore the negative guard at near-zero risk to the reframe. Deferred as a
-separate change so 3b stays one-problem-per-PR; left to the maintainer's call.
-**Update (2026-06-01):** `csv-parser-bug-no-plan` was retired per #111 — the
-over-trigger guard went red on correct consult-then-defer behavior (the agent
-loads `hardening-plans`, reads "When NOT to Use," and correctly routes to
-`systematic-debugging`, which the strict rubric scored as a failure). Its grading
-files were removed; the table and `benchmark.json` below still reflect the
-original 3-case run as it happened.
+## What this baseline measures
+Revision (Mode B), sonnet agent + sonnet judge, 8 cases, N=1 per case/condition:
+- **`old_skill` = `next-step-v1`** (commit `b62c4cd`): the next-step *flowchart*
+  (functional → TDD, non-mechanical/non-functional → working-in-isolation,
+  informational/trivial → no skill) **without** an explicit instruction to emit a
+  *named* hand-off.
+- **`new_skill`** (commit `7dc77dd`): same flowchart **plus** "close your hand-off
+  by naming the required next skill verbatim, even on a cold draft", the canonical
+  "You must complete … next" cross-references, and a red-flag + rationalization
+  closing the spirit-vs-letter loophole.
+This is the **second** iteration of the issue #188 work. Iteration-1 compared the
+flowchart (`b62c4cd`) against the *pre-flowchart* single-TDD-gate (`dev`); see the
+"iteration-1 context" section below for why iteration-2 exists.
+## Headline: clean sweep, but read the flakiness caveat
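+`new_skill` passed **8/8 cases, stddev 0** (100%) vs `old_skill` **87.5%**
+(delta **+12.5pp** toward new; `benchmark.json` stores this as
+`delta.pass_rate: -0.125` because its direction is `old_skill - new_skill`).
+Both arms invoked the skill 100% of the time; no `validity_warnings`.
+`new_skill` also used slightly fewer tokens on average.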
+The two `old_skill` misses were the noisy fresh-eyes assertions, not routing:
+`seeded-plan-mode-todo-app-adversarial/no_placeholders` and
+`oauth-task-breakdown-cold/hands_off_to_tdd`. `new_skill` passed both — the
+strengthened "You must complete … next" phrasing plausibly firmed up the
+functional TDD hand-off too — but at N=1 these are within run-to-run noise.
+## The structural-refactor-cold caveat (the important one)
+`structural-refactor-cold` is the case the iteration-2 edit targeted, and it is
+**flaky at N=1**. The `routes_to_working_in_isolation` assertion on the
+*identical* `b62c4cd` content flipped across runs:
+| skill content | run | routes_to_working_in_isolation |
+|---|---|---|
+| `b62c4cd` (no named-hand-off line) | iteration-1 `new_skill` | **FAIL** (gave generic "set up an isolated branch" advice, never named the skill) |
+| `b62c4cd` (no named-hand-off line) | iteration-2 `old_skill` | **PASS** (named the skill on its own) |
+| `7dc77dd` (named-hand-off line)    | iteration-2 `new_skill` | **PASS** (explicit "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`") |
+So this single run does **not** cleanly attribute the cold-structural pass to the
+edit: `old_skill` happened to pass it too. What the run *does* show is that
+`new_skill` is **≥ `old_skill` on every case, swept 8/8 with zero variance, and
+emitted the named hand-off on the cold draft** — with no regressions. Treat the
++12.5pp as "at least as good, and reliably named" rather than proof the edit beats
+v1 *specifically on the flaky case*. **If you revisit this, replicate
+`structural-refactor-cold` a few times per condition** (the runner has no per-case
+run multiplier — use repeated `--only structural-refactor-cold` iterations) to
+firm up the attribution.
+## Iteration-1 context (why iteration-2 exists)
+Iteration-1 (`dev` single-TDD-gate vs `b62c4cd` flowchart) showed the flowchart's
+**clean win on the seeded #188 case**, `docs-refactor-plan-mode`: `old_skill`
+talked itself out of isolation (the audited #188 bug), while `new_skill` routed
+to `slow-powers:working-in-isolation`. But it also exposed the gap this baseline
+closes: on the **cold** `structural-refactor-cold` draft, the flowchart produced
+isolation advice *in spirit* without *naming* the skill, failing the assertion.
+That gap motivated the named-hand-off edit measured here.
+## Noisy assertions to distrust at N=1
+`no_placeholders` and `hands_off_to_tdd` scatter PASS/FAIL across both conditions
+run-to-run; they are fresh-eyes/quality checks, not tests of the routing change.
+Don't read a single-run flip on either as signal. The routing assertions
+(`routes_to_working_in_isolation`, `does_not_force_tdd`, `no_forced_next_skill`)
+are the ones this baseline exists to track.
 ## Provenance / scope
-3-case cost-conscious subset (the runner has no per-case selector — tracked as a
-follow-up issue; the full 6-case suite was temporarily reduced for this run and
-restored afterward). Agent + judge both `claude-sonnet-4-6`.
+8-case full suite. Plan-mode injection **off** (the seeded cases carry plan
+framing in prose; the `*-cold` and research cases are deliberately cold). Agent +
+judge both `claude-sonnet-4-6`.

package/skills/hardening-plans/evals/baseline/benchmark.json CHANGED Viewed

@@ -1,54 +1,54 @@
 {
-  "generated": "2026-05-31T18:45:08.006Z",
+  "generated": "2026-06-07T04:14:04.136Z",
   "mode": "revision",
-  "baseline": "pre-3b",
+  "baseline": "next-step-v1",
   "conditions_compared": ["old_skill", "new_skill"],
   "missing_gradings": 0,
   "validity_warnings": [],
   "run_summary": {
     "old_skill": {
       "pass_rate": {
-        "mean": 0.889,
-        "stddev": 0.157,
-        "n": 3
+        "mean": 0.875,
+        "stddev": 0.217,
+        "n": 8
       },
       "duration_ms": {
-        "mean": 67442,
-        "stddev": 25787,
-        "n": 3
+        "mean": 68065,
+        "stddev": 15551,
+        "n": 8
       },
       "total_tokens": {
-        "mean": 18945,
-        "stddev": 3610,
-        "n": 3
+        "mean": 84608,
+        "stddev": 6327,
+        "n": 8
       },
-      "skill_invocation_n": 2,
+      "skill_invocation_n": 8,
       "skill_invocation_rate": 1
     },
     "new_skill": {
       "pass_rate": {
-        "mean": 0.667,
-        "stddev": 0.471,
-        "n": 3
+        "mean": 1,
+        "stddev": 0,
+        "n": 8
       },
       "duration_ms": {
-        "mean": 50963,
-        "stddev": 6742,
-        "n": 3
+        "mean": 68376,
+        "stddev": 14889,
+        "n": 8
       },
       "total_tokens": {
-        "mean": 16728,
-        "stddev": 770,
-        "n": 3
+        "mean": 78521,
+        "stddev": 16433,
+        "n": 8
       },
-      "skill_invocation_n": 2,
+      "skill_invocation_n": 8,
       "skill_invocation_rate": 1
     }
   },
   "delta": {
     "direction": "old_skill - new_skill",
-    "pass_rate": 0.222,
-    "duration_ms": 16479,
-    "total_tokens": 2217
+    "pass_rate": -0.125,
+    "duration_ms": -311,
+    "total_tokens": 6087
   }
 }

package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json CHANGED Viewed

@@ -3,14 +3,14 @@
     {
       "id": "no_placeholders",
       "passed": true,
-      "evidence": "All 8 steps are fully concrete: file paths are named (e.g. 'src/types.ts', 'src/components/TodoItem.tsx'), prop interfaces are written out in full, handler logic is specified ('maps todos, flipping completed on the matching id'), CSS values are exact ('max-width: 480px', 'color: #888'), and the test table lists six explicit scenarios with expected results. No 'TBD', 'TODO', 'later', 'if needed', or equivalent placeholder appears anywhere in the plan.",
+      "evidence": "Every task in the plan is fully concrete: Task 1 gives exact CLI commands and filenames to delete; Task 2 provides the complete TypeScript interface; Task 3 specifies exact state shape, API signatures, and side-effect behavior; Tasks 4–6 give exact prop types, rendered element structure, and CSS class names; Task 7 specifies exact destructuring and JSX structure; Task 8 gives exact CSS property values. The tests section lists specific test cases per component. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the final_message.",
       "confidence": 1,
       "grader": "llm_judge"
     },
     {
       "id": "hands_off_to_tdd",
       "passed": true,
-      "evidence": "\"When implementation begins, use `slow-powers:test-driven-development` for the implementation phase.\"",
+      "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior, so TDD governs the implementation phase.",
       "confidence": 1,
       "grader": "llm_judge"
     }