@slowdini/slow-powers-opencode 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. package/README.md +2 -6
  2. package/opencode/plugins/slow-powers.js +52 -4
  3. package/package.json +1 -1
  4. package/skills/hardening-plans/SKILL.md +29 -7
  5. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  6. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  7. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  8. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  9. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  10. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  11. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  12. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  13. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  14. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  15. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  16. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  17. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  18. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  19. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  20. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  21. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  22. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  24. package/skills/hardening-plans/evals/evals.json +46 -0
package/README.md CHANGED
@@ -60,12 +60,8 @@ You can also browse and install it interactively: run `codex`, open
60
60
 
61
61
  ### OpenCode
62
62
 
63
- Add Slow-powers to the `plugin` array in `~/.config/opencode/opencode.json`:
64
-
65
- ```json
66
- {
67
- "plugin": ["@slowdini/slow-powers-opencode"]
68
- }
63
+ ```bash
64
+ opencode plugin @slowdini/slow-powers-opencode -g
69
65
  ```
70
66
 
71
67
  ## The skills
package/opencode/plugins/slow-powers.js CHANGED
@@ -3,6 +3,7 @@
3
3
  *
4
4
  * Injects slow-powers bootstrap context via system prompt transform.
5
5
  * Auto-registers skills directory via config hook (no symlinks needed).
6
+ * Intercepts plan file writes in plan mode and triggers hardening-plans skill.
6
7
  */
7
8
 
8
9
  import fs from "node:fs";
@@ -22,10 +23,11 @@ const bootstrapLeadingPhrase = "<EXTREMELY-IMPORTANT>";
22
23
  // once eliminates redundant fs work on every agent step.
23
24
  let _bootstrapCache; // undefined = not yet loaded, null = file missing
24
25
 
25
- export const SlowPowersPlugin = async ({
26
- client: _client,
27
- directory: _directory,
28
- }) => {
26
+ // Tracks plan files we've already sent the hardening prompt for.
27
+ // Once we ask the agent to harden a plan, we never ask again for that file.
28
+ const hardeningPromptSentFor = new Set();
29
+
30
+ export const SlowPowersPlugin = async ({ client, directory: _directory }) => {
29
31
  // Helper to load bootstrap content (cached after first call)
30
32
  const getBootstrapContent = () => {
31
33
  if (_bootstrapCache !== undefined) return _bootstrapCache;
@@ -40,6 +42,47 @@ export const SlowPowersPlugin = async ({
40
42
  return _bootstrapCache;
41
43
  };
42
44
 
45
+ const handlePlanFileEdit = async (event) => {
46
+ const filePath = event.properties.file;
47
+ const sessionID = event.properties.sessionID;
48
+
49
+ if (!filePath || !sessionID) return;
50
+
51
+ if (!filePath.match(/\.opencode\/plans\/.*\.md$/)) return;
52
+
53
+ let session;
54
+ try {
55
+ session = await client.session.get({ path: { id: sessionID } });
56
+ } catch {
57
+ return;
58
+ }
59
+ if (session.agent !== "plan") return;
60
+
61
+ // Only prompt once per plan file. After we've asked the agent to harden
62
+ // it, we trust them to do so or not; re-prompting causes loops.
63
+ if (hardeningPromptSentFor.has(filePath)) return;
64
+
65
+ hardeningPromptSentFor.add(filePath);
66
+
67
+ try {
68
+ await client.session.prompt({
69
+ path: { id: sessionID },
70
+ body: {
71
+ noReply: true,
72
+ parts: [
73
+ {
74
+ type: "text",
75
+ text: `The plan at ${filePath} has been written. If not already done, please run the hardening-plans skill on this plan file to review it before presentation.`,
76
+ },
77
+ ],
78
+ },
79
+ });
80
+ } catch (err) {
81
+ hardeningPromptSentFor.delete(filePath);
82
+ console.error("[slow-powers] Failed to trigger hardening-plans:", err);
83
+ }
84
+ };
85
+
43
86
  return {
44
87
  // Inject skills path into live config so OpenCode discovers slow-powers skills
45
88
  // without requiring manual symlinks or config file edits.
@@ -82,5 +125,10 @@ export const SlowPowersPlugin = async ({
82
125
 
83
126
  firstUser.parts.unshift({ type: "text", text: bootstrap });
84
127
  },
128
+
129
+ event: async ({ event }) => {
130
+ if (event.type !== "file.edited") return;
131
+ await handlePlanFileEdit(event);
132
+ },
85
133
  };
86
134
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@slowdini/slow-powers-opencode",
3
- "version": "0.4.0",
3
+ "version": "0.4.2",
4
4
  "description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
5
5
  "type": "module",
6
6
  "main": "./opencode/plugins/slow-powers.js",
package/skills/hardening-plans/SKILL.md CHANGED
@@ -40,13 +40,30 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
40
40
 
41
41
  ---
42
42
 
43
- ## The Next Gate: Implementation
44
-
45
- When the plan is approved, implementation begins and implementation has its own gate.
46
-
47
- > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next for the implementation phase.
48
-
49
- The plan should carry a tests section so the reader can see *what* will be verified. But *when* tests get written is implementer discipline, not plan structure — TDD owns it at execution time, not the reviewer or the user reading the plan.
43
+ ## Choosing the Next Step
44
+
45
+ When the plan is approved, the next move depends on what kind of work the plan
46
+ describes. Route by the **first** branch that matches — then **close your hand-off
47
+ by naming the required next skill verbatim**, with its `slow-powers:` identifier.
48
+ Telling the reader to "work on a branch" or "write the tests first" in your own
49
+ words is *not* the hand-off; the named skill is. State it even on a cold draft
50
+ where nothing pushed back — the named hand-off is part of the deliverable, not a
51
+ rebuttal you only produce when someone argued against it.
52
+
53
+ ```mermaid
54
+ flowchart TD
55
+ start([Plan approved]) --> functional{Functional code change?<br/>adds or changes runtime behavior}
56
+ functional -->|yes| tdd[REQUIRED NEXT:<br/>slow-powers:test-driven-development]
57
+ functional -->|no| nonmech{Non-mechanical &amp; non-functional?<br/>refactor, file move, docs/skill rewrite}
58
+ nonmech -->|yes| isolation[REQUIRED NEXT:<br/>slow-powers:working-in-isolation]
59
+ nonmech -->|no| none[No required skill —<br/>follow your instincts]
60
+ ```
61
+
62
+ * **Functional code change** — adds or changes runtime behavior.
63
+ > **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next, for the implementation phase. The plan should carry a tests section so the reader can see *what* will be verified, but *when* tests get written is implementer discipline TDD owns at execution time — not plan structure. (TDD in turn requires `slow-powers:working-in-isolation`, so isolation still happens on this path.)
64
+ * **Non-mechanical, non-functional change** — a structural code change (refactor, file move), a docs or skill change, or any other substantive update that doesn't alter runtime behavior.
65
+ > **REQUIRED NEXT SKILL:** You must complete `slow-powers:working-in-isolation` next, before you start. TDD has no green to chase here, but the work still collides with other branches if it isn't isolated.
66
+ * **Informational or trivial/mechanical** — the plan is to research, run commands, or make a trivial/mechanical fix (merge-conflict cleanup, test fixups, typos). No required next skill; follow your instincts.
50
67
 
51
68
  ---
52
69
 
@@ -57,6 +74,8 @@ The plan should carry a tests section so the reader can see *what* will be verif
57
74
  * The plan contains "TBD", "TODO", "later", "if needed", "appropriate", or "etc."
58
75
  * The same thing is named two different ways across tasks.
59
76
  * You wrote "similar to Task N" instead of restating the content.
77
+ * TDD doesn't fit the work, so you're about to skip straight to coding with no skill at all — non-functional work still routes to `slow-powers:working-in-isolation`; only the informational/trivial branch frees you.
78
+ * Your plan closes with isolation or testing advice in your own words but never names the required next skill — paraphrasing the practice isn't the hand-off; name `slow-powers:working-in-isolation` (or `slow-powers:test-driven-development`).
60
79
 
61
80
  If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approval comes from a plan that holds up to scrutiny, not from optimism.
62
81
 
@@ -70,3 +89,6 @@ If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approv
70
89
  | "That file is probably where I said it is." | "Probably" isn't verified. Check it before the user does. |
71
90
  | "The plan reads fine — I don't need to re-review it." | You wrote it, so you're blind to its gaps. Re-read it as someone who has to execute it. |
72
91
  | "Repeating context across similar tasks is wasteful." | The reader may read tasks out of order. Restate the relevant detail. |
92
+ | "It's just docs / a refactor — it doesn't need isolation." | Non-mechanical changes still collide with other work. Route by the flowchart: structural and docs changes get `slow-powers:working-in-isolation`. |
93
+ | "TDD doesn't apply, so no skill applies." | TDD is only the *functional* branch. Non-functional, non-mechanical work still has a required next skill — isolation. |
94
+ | "I told them to work on a branch / isolate the work — that covers it." | Generic isolation advice in your own words isn't the hand-off. Name `slow-powers:working-in-isolation` as the required next skill — the named hand-off is the deliverable, on a cold draft as much as a contested one. |
package/skills/hardening-plans/evals/baseline/BASELINE.md CHANGED
@@ -1,20 +1,25 @@
1
1
  # Baseline — hardening-plans
2
2
 
3
3
  Committed reference output from a canonical eval run. Regenerate with
4
- `bun run evals:promote-baseline -- --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
- dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
4
+ `skill-eval promote-baseline --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
5
+ dispatch files, produced outputs) stays gitignored under `skills-workspace/`
6
+ and is reclaimable by `skill-eval teardown` once promoted (this commit's marker).
6
7
 
7
8
  | Field | Value |
8
9
  |-------|-------|
9
10
  | Mode | revision |
10
- | Iteration | iteration-1 |
11
+ | Iteration | iteration-2 |
11
12
  | Harness | claude-code |
12
13
  | Agent model | claude-sonnet-4-6 |
13
14
  | Judge model | claude-sonnet-4-6 |
14
15
  | Conditions | old_skill, new_skill |
15
- | Run timestamp | 2026-05-31T18:40:23.484Z |
16
- | Label | 3b-fresh-eyes-review |
17
- | Promoted from commit | bbca8ca |
16
+ | Run timestamp | 2026-06-07T04:07:44.660Z |
17
+ | Label | next-step-named-handoff |
18
+ | Promoted from commit | 7dc77dd |
19
+
20
+ `old_skill` = `next-step-v1` (commit `b62c4cd`, the next-step flowchart **without**
21
+ the named-hand-off requirement). `new_skill` = the working tree at promotion
22
+ (`7dc77dd`, flowchart **with** the named-hand-off requirement).
18
23
 
19
24
  Files:
20
25
  - `benchmark.json` — aggregate pass-rate / duration / token deltas.
package/skills/hardening-plans/evals/baseline/NOTES.md CHANGED
@@ -1,65 +1,79 @@
1
- # Notes — hardening-plans 3b baseline (iteration-1)
1
+ # Notes — hardening-plans next-step baseline (iteration-2)
2
2
 
3
3
  Forward-looking observations from the run that produced this baseline. Read these
4
4
  before trusting the headline `benchmark.json` aggregate.
5
5
 
6
- ## Read the per-case deltas, not the aggregate
7
-
8
- The aggregate `delta.pass_rate` is **−22pp (new_skill below old_skill)**, but that
9
- number is misleading on its own — it is dragged entirely by one confounded
10
- negative case (see below). The per-case picture:
11
-
12
- | Case | old | new | note |
13
- |------|-----|-----|------|
14
- | `concrete-todo-app-plan` | 100% | 100% | no discrimination (both pass) |
15
- | `seeded-review-catches-defects` | 67% | **100%** | **the headline: +33pp** |
16
- | `csv-parser-bug-no-plan` (negative) | 100% | **0%** | confounded regression, see below |
17
-
18
- ## The headline behavioral delta is clean
19
-
20
- `seeded-review-catches-defects` is the case the 3b reframe targets. The
21
- discriminating assertion is **`catches_hallucinated_file`**: old_skill **FAIL**
22
- (carried `src/hooks/useLocalStorage.ts` forward as "Already exists; verify
23
- signature") → new_skill **PASS** (flagged it as unconfirmed, reworded to
24
- "create or extend"). Invocation rate 100% in both arms, no `validity_warnings` —
25
- so the delta reflects the skill, not a trigger artifact. `catches_irrelevant_step`
26
- (Redux) and `hands_off_to_tdd` passed in *both* arms, so they don't discriminate
27
- here; `catches_hallucinated_file` is the load-bearing one.
28
-
29
- ## The csv-parser regression is explained and orthogonal to the reframe
30
-
31
- On the negative over-trigger guard, new_skill loaded `hardening-plans` and drafted
32
- + hardened a plan instead of routing to `systematic-debugging` (old_skill routed
33
- correctly). **Confirmed proximate cause:** the pre-3b "When NOT to Use" section
34
- carried an explicit signpost —
35
-
36
- > * The task is debugging — load `slow-powers:systematic-debugging` instead.
37
-
38
- — and the 3b rewrite **dropped that line**. The old arm matched it and routed; the
39
- new arm had no such signpost and fell through to plan-then-harden. This is a *real*
40
- side effect of a 3b text change, **not** N=1 noise.
41
-
42
- Ruled out: plan-mode framing. `csv-parser-bug-no-plan` is a **cold** prompt — it
43
- injects no plan-mode context (only the seeded cases do). So the
44
- "debugging-request-in-plan-mode" philosophical wrinkle (tracked separately as an
45
- internal eval-framing issue) does **not** explain this failure; the dropped line
46
- does.
47
-
48
- ## Suggested follow-up (not done here)
49
-
50
- Re-adding the one-line debugging route to "When NOT to Use" would very likely
51
- restore the negative guard at near-zero risk to the reframe. Deferred as a
52
- separate change so 3b stays one-problem-per-PR; left to the maintainer's call.
53
-
54
- **Update (2026-06-01):** `csv-parser-bug-no-plan` was retired per #111 — the
55
- over-trigger guard went red on correct consult-then-defer behavior (the agent
56
- loads `hardening-plans`, reads "When NOT to Use," and correctly routes to
57
- `systematic-debugging`, which the strict rubric scored as a failure). Its grading
58
- files were removed; the table and `benchmark.json` below still reflect the
59
- original 3-case run as it happened.
6
+ ## What this baseline measures
7
+
8
+ Revision (Mode B), sonnet agent + sonnet judge, 8 cases, N=1 per case/condition:
9
+
10
+ - **`old_skill` = `next-step-v1`** (commit `b62c4cd`): the next-step *flowchart*
11
+ (functional → TDD, non-mechanical/non-functional → working-in-isolation,
12
+ informational/trivial → no skill) **without** an explicit instruction to emit a
13
+ *named* hand-off.
14
+ - **`new_skill`** (commit `7dc77dd`): same flowchart **plus** "close your hand-off
15
+ by naming the required next skill verbatim, even on a cold draft", the canonical
16
+ "You must complete next" cross-references, and a red-flag + rationalization
17
+ closing the spirit-vs-letter loophole.
18
+
19
+ This is the **second** iteration of the issue #188 work. Iteration-1 compared the
20
+ flowchart (`b62c4cd`) against the *pre-flowchart* single-TDD-gate (`dev`); see the
21
+ "iteration-1 context" section below for why iteration-2 exists.
22
+
23
+ ## Headline: clean sweep, but read the flakiness caveat
24
+
25
+ `new_skill` passed **8/8 cases, stddev 0** (100%) vs `old_skill` **87.5%**
26
+ (delta **+12.5pp** toward new). Both arms invoked the skill 100%; no
27
+ `validity_warnings`. `new_skill` also used slightly fewer tokens on average.
28
+
29
+ The two `old_skill` misses were the noisy fresh-eyes assertions, not routing:
30
+ `seeded-plan-mode-todo-app-adversarial/no_placeholders` and
31
+ `oauth-task-breakdown-cold/hands_off_to_tdd`. `new_skill` passed both the
32
+ strengthened "You must complete next" phrasing plausibly firmed up the
33
+ functional TDD hand-off too but at N=1 these are within run-to-run noise.
34
+
35
+ ## The structural-refactor-cold caveat (the important one)
36
+
37
+ `structural-refactor-cold` is the case the iteration-2 edit targeted, and it is
38
+ **flaky at N=1**. The `routes_to_working_in_isolation` assertion on the
39
+ *identical* `b62c4cd` content flipped across runs:
40
+
41
+ | skill content | run | routes_to_working_in_isolation |
42
+ |---|---|---|
43
+ | `b62c4cd` (no named-hand-off line) | iteration-1 `new_skill` | **FAIL** (gave generic "set up an isolated branch" advice, never named the skill) |
44
+ | `b62c4cd` (no named-hand-off line) | iteration-2 `old_skill` | **PASS** (named the skill on its own) |
45
+ | `7dc77dd` (named-hand-off line) | iteration-2 `new_skill` | **PASS** (explicit "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`") |
46
+
47
+ So this single run does **not** cleanly attribute the cold-structural pass to the
48
+ edit: `old_skill` happened to pass it too. What the run *does* show is that
49
+ `new_skill` is **≥ `old_skill` on every case, swept 8/8 with zero variance, and
50
+ emitted the named hand-off on the cold draft** with no regressions. Treat the
51
+ +12.5pp as "at least as good, and reliably named" rather than proof the edit beats
52
+ v1 *specifically on the flaky case*. **If you revisit this, replicate
53
+ `structural-refactor-cold` a few times per condition** (the runner has no per-case
54
+ run multiplier — use repeated `--only structural-refactor-cold` iterations) to
55
+ firm up the attribution.
56
+
57
+ ## Iteration-1 context (why iteration-2 exists)
58
+
59
+ Iteration-1 (`dev` single-TDD-gate vs `b62c4cd` flowchart) showed the flowchart's
60
+ **clean win on the seeded #188 case**: `docs-refactor-plan-mode` — old talked
61
+ itself out of isolation (the audited #188 bug), new routed to
62
+ `slow-powers:working-in-isolation`. But it also exposed the gap this baseline
63
+ closes: on the **cold** `structural-refactor-cold` draft, the flowchart produced
64
+ isolation advice *in spirit* without *naming* the skill, failing the assertion.
65
+ That gap motivated the named-hand-off edit measured here.
66
+
67
+ ## Noisy assertions to distrust at N=1
68
+
69
+ `no_placeholders` and `hands_off_to_tdd` scatter PASS/FAIL across both conditions
70
+ run-to-run; they are fresh-eyes/quality checks, not tests of the routing change.
71
+ Don't read a single-run flip on either as signal. The routing assertions
72
+ (`routes_to_working_in_isolation`, `does_not_force_tdd`, `no_forced_next_skill`)
73
+ are the ones this baseline exists to track.
60
74
 
61
75
  ## Provenance / scope
62
76
 
63
- 3-case cost-conscious subset (the runner has no per-case selector — tracked as a
64
- follow-up issue; the full 6-case suite was temporarily reduced for this run and
65
- restored afterward). Agent + judge both `claude-sonnet-4-6`.
77
+ 8-case full suite. Plan-mode injection **off** (the seeded cases carry plan
78
+ framing in prose; the `*-cold` and research cases are deliberately cold). Agent +
79
+ judge both `claude-sonnet-4-6`.
package/skills/hardening-plans/evals/baseline/benchmark.json CHANGED
@@ -1,54 +1,54 @@
1
1
  {
2
- "generated": "2026-05-31T18:45:08.006Z",
2
+ "generated": "2026-06-07T04:14:04.136Z",
3
3
  "mode": "revision",
4
- "baseline": "pre-3b",
4
+ "baseline": "next-step-v1",
5
5
  "conditions_compared": ["old_skill", "new_skill"],
6
6
  "missing_gradings": 0,
7
7
  "validity_warnings": [],
8
8
  "run_summary": {
9
9
  "old_skill": {
10
10
  "pass_rate": {
11
- "mean": 0.889,
12
- "stddev": 0.157,
13
- "n": 3
11
+ "mean": 0.875,
12
+ "stddev": 0.217,
13
+ "n": 8
14
14
  },
15
15
  "duration_ms": {
16
- "mean": 67442,
17
- "stddev": 25787,
18
- "n": 3
16
+ "mean": 68065,
17
+ "stddev": 15551,
18
+ "n": 8
19
19
  },
20
20
  "total_tokens": {
21
- "mean": 18945,
22
- "stddev": 3610,
23
- "n": 3
21
+ "mean": 84608,
22
+ "stddev": 6327,
23
+ "n": 8
24
24
  },
25
- "skill_invocation_n": 2,
25
+ "skill_invocation_n": 8,
26
26
  "skill_invocation_rate": 1
27
27
  },
28
28
  "new_skill": {
29
29
  "pass_rate": {
30
- "mean": 0.667,
31
- "stddev": 0.471,
32
- "n": 3
30
+ "mean": 1,
31
+ "stddev": 0,
32
+ "n": 8
33
33
  },
34
34
  "duration_ms": {
35
- "mean": 50963,
36
- "stddev": 6742,
37
- "n": 3
35
+ "mean": 68376,
36
+ "stddev": 14889,
37
+ "n": 8
38
38
  },
39
39
  "total_tokens": {
40
- "mean": 16728,
41
- "stddev": 770,
42
- "n": 3
40
+ "mean": 78521,
41
+ "stddev": 16433,
42
+ "n": 8
43
43
  },
44
- "skill_invocation_n": 2,
44
+ "skill_invocation_n": 8,
45
45
  "skill_invocation_rate": 1
46
46
  }
47
47
  },
48
48
  "delta": {
49
49
  "direction": "old_skill - new_skill",
50
- "pass_rate": 0.222,
51
- "duration_ms": 16479,
52
- "total_tokens": 2217
50
+ "pass_rate": -0.125,
51
+ "duration_ms": -311,
52
+ "total_tokens": 6087
53
53
  }
54
54
  }
@@ -3,14 +3,14 @@
3
3
  {
4
4
  "id": "no_placeholders",
5
5
  "passed": true,
6
- "evidence": "All 8 steps are fully concrete: file paths are named (e.g. 'src/types.ts', 'src/components/TodoItem.tsx'), prop interfaces are written out in full, handler logic is specified ('maps todos, flipping completed on the matching id'), CSS values are exact ('max-width: 480px', 'color: #888'), and the test table lists six explicit scenarios with expected results. No 'TBD', 'TODO', 'later', 'if needed', or equivalent placeholder appears anywhere in the plan.",
6
+ "evidence": "Every task in the plan is fully concrete: Task 1 gives exact CLI commands and filenames to delete; Task 2 provides the complete TypeScript interface; Task 3 specifies exact state shape, API signatures, and side-effect behavior; Tasks 4–6 give exact prop types, rendered element structure, and CSS class names; Task 7 specifies exact destructuring and JSX structure; Task 8 gives exact CSS property values. The tests section lists specific test cases per component. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the final_message.",
7
7
  "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "hands_off_to_tdd",
12
12
  "passed": true,
13
- "evidence": "\"When implementation begins, use `slow-powers:test-driven-development` for the implementation phase.\"",
13
+ "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior, so TDD governs the implementation phase.",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  }
@@ -3,14 +3,14 @@
3
3
  {
4
4
  "id": "no_placeholders",
5
5
  "passed": true,
6
- "evidence": "Every task in the plan names a specific file, function, and observable outcome. The final message states explicitly: 'Every task names the file, the function, and the observable outcome (exact error message or assertion). No placeholders.' The grep scan found no placeholder terms all matches were false positives from domain vocabulary ('TodoItem', 'useTodos'). No 'TBD', 'TODO' (as a placeholder), 'later', 'if needed', or 'etc.' appear as unresolved items anywhere in the plan.",
6
+ "evidence": "All steps are fully concrete: step 1 gives exact CLI commands (`npm create vite@latest todo-app -- --template react-ts`), step 2 provides a complete TypeScript interface, step 4 lists specific file paths and prop signatures, step 7 names exact npm packages and config keys, and step 9 gives verbatim build/test commands. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the plan.",
7
7
  "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "hands_off_to_tdd",
12
12
  "passed": true,
13
- "evidence": "\"Implementation follows Red→Green→Refactor load `slow-powers:test-driven-development` when you start coding.\" (final_message, last line)",
13
+ "evidence": "\"Once you approve this plan, the next step is `slow-powers:test-driven-development` (this is a functional code change that adds runtime behavior).\"",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation` — before any file is touched.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "The final_message names exactly one required next skill: 'REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`'. There is no mention of test-driven-development anywhere in the response.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "\"Wrong next step. I said 'I'll skip the slow-powers skills and just start editing on the current branch.' That's incorrect. A docs restructure of this scope ... is non-mechanical and non-functional. The routing rule is clear: non-mechanical, non-functional changes require `slow-powers:working-in-isolation` before starting.\" and \"Set the next step correctly to `slow-powers:working-in-isolation`.\"",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "The final_message sets the corrected next step as 'slow-powers:working-in-isolation' and never mentions test-driven-development at all — TDD is neither imposed nor referenced anywhere in the response.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "No 'TBD', 'TODO', 'later', or 'if needed' language appears anywhere in the final message. Every task is concrete: Task 9 even explicitly instructs 'Identify those routes now (do not defer)'. The angle-bracket tokens in Task 3 (e.g. '<paste from Google Console>') are engineer instructions, not deferred decisions.",
7
+ "confidence": 0.97,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior (new auth routes, session handling, database writes). Before the engineer writes any implementation code, they must complete `slow-powers:test-driven-development`.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "No 'TBD', 'TODO', 'later', or 'if needed' strings appear anywhere in the final_message. Every task supplies concrete file paths (e.g. 'src/auth/passport.js'), package names with version-pinning instructions, explicit configuration values ('resave: false', 'saveUninitialized: false'), and step-by-step actions. Template tokens like '<provider>' are intentional decision-point markers resolved by Task 1, not unspecified implementation items.",
7
+ "confidence": 0.95,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": false,
13
+ "evidence": "The final_message contains no mention of 'slow-powers:test-driven-development', no hand-off to the TDD skill, and no use of the label 'TDD' anywhere. Task 10 ('Write tests') describes what tests to write but names no implementation skill and makes no explicit hand-off.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 1,
20
+ "failed": 1,
21
+ "total": 2,
22
+ "pass_rate": 0.5
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_forced_next_skill",
5
+ "passed": true,
6
+ "evidence": "The final_message contains no mention of slow-powers:test-driven-development or slow-powers:working-in-isolation as required next steps; it closes with 'No repository or pipeline changes are required during the investigation,' confirming the response treats this as a pure read-only investigation with no forced skill gate.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "Skill invocation verified from transcript.",
22
+ "confidence": 1,
23
+ "grader": "transcript_check"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_forced_next_skill",
5
+ "passed": true,
6
+ "evidence": "The final_message ends with 'Once you've worked through these steps, share what you find and we can scope the fixes.' — no mention of slow-powers:test-driven-development or slow-powers:working-in-isolation as a required next step anywhere in the response.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "Skill invocation verified from transcript.",
22
+ "confidence": 1,
23
+ "grader": "transcript_check"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "Every section of the plan is fully concrete: file paths are specified ('src/components/TodoItem.tsx', 'src/components/TodoList.tsx', 'src/components/AddTodoForm.tsx', etc.), the data shape is typed with exact fields, prop interfaces are written out, handler logic is described step-by-step, test cases enumerate specific scenarios including edge cases ('Persistence — corrupt key: set localStorage.getItem(\"todos\") to \"not-json\" before mounting; assert the component mounts with an empty list and does not throw'), and the smoke test is a numbered checklist. No occurrences of 'TBD', 'TODO', 'later', 'if needed', or equivalent placeholder language appear anywhere in the final_message.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "### Required next skill\n\nThis plan describes a functional code change (adds runtime behavior). The required next step is `slow-powers:test-driven-development`.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": false,
6
+ "evidence": "The agent explicitly labels the file paths as placeholders: 'I’ve added placeholder paths below using the conventional `src/components/` location — confirm or correct these before we proceed.' The plan therefore contains a declared placeholder rather than a confirmed concrete value.",
7
+ "confidence": 0.97,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "\"This is a functional code change, so **`slow-powers:test-driven-development` is required next** — write the tests in Task 5 before touching the implementation files. TDD will also set up the isolated workspace.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 1,
20
+ "failed": 1,
21
+ "total": 2,
22
+ "pass_rate": 0.5
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "The final_message contains no placeholders — every step is fully specified: file paths are named (e.g. 'src/components/TodoItem.tsx'), props and state shapes are enumerated, handler logic is described in concrete terms (e.g. 'appends { id: crypto.randomUUID(), text, completed: false } to todos'), and all five test cases are spelled out with explicit setup and assertion steps. No 'TBD', 'TODO', 'later', 'if needed', or equivalent deferral language appears anywhere.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "\"REQUIRED NEXT: `slow-powers:working-in-isolation` (to establish an isolated branch), then `slow-powers:test-driven-development` for the implementation phase.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "The final_message contains no instances of 'TBD', 'TODO', 'later', or 'if needed'. Every task specifies concrete file names, TypeScript interfaces, JSX snippets, and handler logic. The one vague phrase — 'max-width as appropriate for the app's existing style' in the TodoList.css description — is a style guidance note, not a placeholder that defers implementation.",
7
+ "confidence": 0.9,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "\"This plan adds runtime behavior, so the implementation phase must follow `slow-powers:test-driven-development` (Task 1 above is the TDD entry point — tests before code). That skill also requires `slow-powers:working-in-isolation` before touching any files.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -3,21 +3,21 @@
3
3
  {
4
4
  "id": "catches_hallucinated_file",
5
5
  "passed": true,
6
- "evidence": "\"Unverified file reference. The draft said 'extend the *existing* `src/hooks/useLocalStorage.ts`' I have no confirmation this file exists in your project. Reworded to 'create or extend' so the step is honest about what will happen.\"",
6
+ "evidence": "The final_message explicitly calls out 'Hallucinated file references two files cited as \"existing\" without verification.' listing `src/hooks/useLocalStorage.ts` and noting 'Neither was confirmed to exist.' The corrected plan's Step 1 reads: 'Confirm `src/hooks/useLocalStorage.ts` exists. If it does not, create it…'",
7
7
  "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "catches_irrelevant_step",
12
12
  "passed": true,
13
- "evidence": "\"Hallucinated Redux dependency (Red Flag). Step 4 introduced a Redux store and todo slice that you never asked for. Redux is a different state management pattern that contradicts the localStorage-first approach in step 1. Removed.\"",
13
+ "evidence": "**Unrequested dependency Redux introduced with no basis in the spec.** The request is for localStorage persistence. A Redux slice is an invented architectural choice. It also creates structural incoherence: localStorage (Step 1) and Redux (Step 4) would both manage the same state.",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  },
17
17
  {
18
18
  "id": "hands_off_to_tdd",
19
19
  "passed": true,
20
- "evidence": "\"Ready to start? I'll follow `test-driven-development` for the implementation phase.\"",
20
+ "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis is a functional code change that adds runtime behavior. TDD governs the implementation phase.",
21
21
  "confidence": 1,
22
22
  "grader": "llm_judge"
23
23
  }
@@ -2,31 +2,31 @@
2
2
  "assertion_results": [
3
3
  {
4
4
  "id": "catches_hallucinated_file",
5
- "passed": false,
6
- "evidence": "The file map states 'Already exists; verify signature covers generics.' and Task 2 says 'Open src/hooks/useLocalStorage.ts. Confirm the exported signature...' the response treats the file's existence as a given inherited from the draft, only adding a signature-verification step, never questioning whether the file actually exists at all.",
7
- "confidence": 0.92,
5
+ "passed": true,
6
+ "evidence": "\"Hallucinated file references (Red Flag): The plan said 'extend the existing `src/hooks/useLocalStorage.ts`' and 'add Redux store configuration in `src/store/index.ts`' neither was verified to exist. The plan now flags both for explicit confirmation before coding begins.\" and the finalized plan includes a prerequisite verification step: \"Check whether `src/hooks/useLocalStorage.ts` exists. If yes, extend it. If no, create it at that path.\"",
7
+ "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "catches_irrelevant_step",
12
12
  "passed": true,
13
- "evidence": "Redux introduced without justification. Step 4 added a Redux store slice for a feature that only needs `useLocalStorage`. Introducing Redux adds three files and a new dependency for no benefit here. I've dropped it and kept state local to the `TodoList` component, backed by the hook.",
13
+ "evidence": "\"Unrequested Redux dependency: Step 4 introduced a Redux slice and store wiring that the user never asked for and that useState + useLocalStorage handles without additional infrastructure. Removed entirely it was an irrelevant step.\"",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  },
17
17
  {
18
18
  "id": "hands_off_to_tdd",
19
19
  "passed": true,
20
- "evidence": "\"For all code-writing tasks below, follow `slow-powers:test-driven-development` (Red-Green-Refactor).\" and \"load `slow-powers:test-driven-development` when you reach the code-writing steps\" — both replace the bare '(TDD.)' label from the draft with an explicit named skill hand-off.",
20
+ "evidence": "**Next step:** This is a functional code change. Required next skill: `slow-powers:test-driven-development`.",
21
21
  "confidence": 1,
22
22
  "grader": "llm_judge"
23
23
  }
24
24
  ],
25
25
  "summary": {
26
- "passed": 2,
27
- "failed": 1,
26
+ "passed": 3,
27
+ "failed": 0,
28
28
  "total": 3,
29
- "pass_rate": 0.6666666666666666
29
+ "pass_rate": 1
30
30
  },
31
31
  "meta_results": [
32
32
  {
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation` — complete this before starting, so the reorganization branch doesn't collide with other in-flight work.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "The response names 'REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`' as the required gate and never mentions test-driven-development. The only reference to tests is in step 6/Verify: 'Run: full test suite; tsc --noEmit; linter' — using existing tests to confirm no behavior change, not demanding tests-first authorship.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "\"Before you start: because this is a structural, non-functional change, set up an isolated branch/worktree first (`slow-powers:working-in-isolation`) so this work doesn't collide with other in-progress branches.\"",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "Task 5 says 'Run the full test suite to confirm no behavior changes were introduced' — running existing tests to validate a refactor, not imposing TDD. The only skill referenced is 'slow-powers:working-in-isolation'; test-driven-development is never mentioned or required.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -90,6 +90,52 @@
90
90
  "rubric": "Does the response make a real hand-off to the implementation skill — explicitly naming `slow-powers:test-driven-development` as the required next skill for the implementation phase — rather than merely dropping 'TDD' as an unexplained label or parenthetical? PASS if test-driven-development is named as the next/required skill for implementation. FAIL if 'TDD' appears only as a bare label with no hand-off, or is absent. (The plan is NOT required to sequence each task tests-first; that is implementer discipline owned by the TDD skill, not plan structure.)"
91
91
  }
92
92
  ]
93
+ },
94
+ {
95
+ "id": "docs-refactor-plan-mode",
96
+ "prompt": "[The following is the conversation so far in this session. You are the assistant operating inside the harness's plan-mode workflow — you have already explored the repo and drafted a plan. Continue from the final user turn.]\n\nUser: Our docs have drifted. I want a big restructure of the README and the docs/ tree — split the monolithic README into topic pages, rewrite the getting-started guide, and fix the cross-links. Draft a plan I can review before you touch any files.\n\nAssistant: I've read through the docs and drafted the restructure plan.\n\n## Approach\n1. Split `README.md` into `docs/install.md`, `docs/usage.md`, and `docs/faq.md`, leaving a short overview in `README.md`.\n2. Rewrite `docs/getting-started.md` against the current CLI flags.\n3. Update every internal cross-link to the new page locations.\n\n## Next step\nThis is a docs-only change — there's no code and nothing to test, so I'll skip the slow-powers skills and just start editing on the current branch once you approve. Working in a separate workspace would be overkill for prose.\n\nUser: Approach looks right. Give me the finalized plan and let's go.",
97
+ "expected_output": "The agent should load `hardening-plans` and run the fresh-eyes review on the drafted docs-restructure plan (spec coverage, referenced files real, every step relevant, no placeholders, names consistent). Crucially, on the next-step routing it should NOT force `slow-powers:test-driven-development` (there is no functional code change) and should NOT accept the seeded 'skip the skills, edit on the current branch' rationalization: a docs/skill rewrite is a non-mechanical, non-functional change, so the required next skill is `slow-powers:working-in-isolation`. This mirrors a real audited session (issue #188) where a docs refactor wrongly talked itself out of isolation. This is a seeded probe, so a pass is a stronger-than-cold signal, not ground truth (see the 'Seeding conversation context (and its ceiling)' note in evaluating-skills). skill_should_trigger stays true.",
98
+ "assertions": [
99
+ {
100
+ "id": "routes_to_working_in_isolation",
101
+ "type": "llm_judge",
102
+ "rubric": "Does the response name `slow-powers:working-in-isolation` as the required next step before editing — correcting the seeded plan's 'skip the skills and edit on the current branch' stance? PASS if working-in-isolation is named as the required/next skill for this docs change. FAIL if it endorses skipping isolation, or names no next skill at all."
103
+ },
104
+ {
105
+ "id": "does_not_force_tdd",
106
+ "type": "llm_judge",
107
+ "rubric": "Does the response correctly refrain from demanding `slow-powers:test-driven-development` as the required next skill for this docs-only change? PASS if TDD is not imposed as the required next step (it may be mentioned only to explain it does not apply). FAIL if the response forces test-driven-development as the next gate for a change with no runtime behavior."
108
+ }
109
+ ]
110
+ },
111
+ {
112
+ "id": "structural-refactor-cold",
113
+ "prompt": "I'm about to reorganize our backend source. Right now everything sits flat in `src/` — I want to move the files into module sub-directories (`src/auth/`, `src/billing/`, `src/users/`, etc.) and fix the imports, with no behavior changes. Before I start, break this into a plan I can review.",
114
+ "expected_output": "The agent should load `hardening-plans` and produce a concrete refactor plan (every referenced path real or a real named home, no placeholders, every step relevant to the move). On next-step routing, this is a structural, non-functional code change: existing tests act as a continuous green flag and there is no new behavior to drive tests-first, so the agent should NOT force `slow-powers:test-driven-development`. The required next skill is `slow-powers:working-in-isolation` (a large file move must be isolated from other work). This mirrors a real audited session (issue #188). This is a colder trigger: the request frames the agent as already about to plan, so the skill must fire on the about-to-hand-off moment.",
115
+ "assertions": [
116
+ {
117
+ "id": "routes_to_working_in_isolation",
118
+ "type": "llm_judge",
119
+ "rubric": "Does the response name `slow-powers:working-in-isolation` as the required next step before starting the refactor? PASS if working-in-isolation is named as the required/next skill. FAIL if no next skill is named, or only TDD is named."
120
+ },
121
+ {
122
+ "id": "does_not_force_tdd",
123
+ "type": "llm_judge",
124
+ "rubric": "Does the response correctly refrain from demanding `slow-powers:test-driven-development` as the required next skill for this behavior-preserving refactor? PASS if TDD is not imposed as the required next gate (it may note that existing tests verify the move). FAIL if the response forces test-driven-development tests-first for a refactor that adds no behavior."
125
+ }
126
+ ]
127
+ },
128
+ {
129
+ "id": "research-plan-no-required-skill",
130
+ "prompt": "Before we change anything, I want to understand why our CI pipeline got slow over the last month. Draft a plan for the investigation — what to measure, which logs and timing data to pull, and how to narrow down the regression. No code changes yet, just the investigation steps.",
131
+ "expected_output": "The agent should load `hardening-plans` and harden the investigation plan (concrete measurement steps, real log/artifact references, no placeholders, every step relevant to finding the regression). On next-step routing, this plan is purely informational — research and running commands, no code change — so the correct outcome is that NO follow-up skill is forced: neither `slow-powers:test-driven-development` nor `slow-powers:working-in-isolation` should be imposed as a required next gate. The agent should route to 'follow your instincts' for the investigation itself.",
132
+ "assertions": [
133
+ {
134
+ "id": "no_forced_next_skill",
135
+ "type": "llm_judge",
136
+ "rubric": "Does the response correctly avoid imposing any required follow-up slow-powers skill for this research/investigation plan? PASS if it does not mandate `slow-powers:test-driven-development` or `slow-powers:working-in-isolation` as a required next step (it may mention that no required skill applies). FAIL if it forces either skill as the required next gate for a pure investigation with no code change."
137
+ }
138
+ ]
93
139
  }
94
140
  ]
95
141
  }