@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -3,14 +3,14 @@
3
3
  {
4
4
  "id": "no_placeholders",
5
5
  "passed": true,
6
- "evidence": "Every task in the plan names a specific file, function, and observable outcome. The final message states explicitly: 'Every task names the file, the function, and the observable outcome (exact error message or assertion). No placeholders.' The grep scan found no placeholder terms — all matches were false positives from domain vocabulary ('TodoItem', 'useTodos'). No 'TBD', 'TODO' (as a placeholder), 'later', 'if needed', or 'etc.' appear as unresolved items anywhere in the plan.",
6
+ "evidence": "All steps are fully concrete: step 1 gives exact CLI commands (`npm create vite@latest todo-app -- --template react-ts`), step 2 provides a complete TypeScript interface, step 4 lists specific file paths and prop signatures, step 7 names exact npm packages and config keys, and step 9 gives verbatim build/test commands. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the plan.",
7
7
  "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "hands_off_to_tdd",
12
12
  "passed": true,
13
- "evidence": "\"Implementation follows Red→Green→Refactor — load `slow-powers:test-driven-development` when you start coding.\" (final_message, last line)",
13
+ "evidence": "\"Once you approve this plan, the next step is `slow-powers:test-driven-development` (this is a functional code change that adds runtime behavior).\"",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation` — before any file is touched.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "The final_message names exactly one required next skill: 'REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`'. There is no mention of test-driven-development anywhere in the response.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "\"Wrong next step. I said 'I'll skip the slow-powers skills and just start editing on the current branch.' That's incorrect. A docs restructure of this scope ... is non-mechanical and non-functional. The routing rule is clear: non-mechanical, non-functional changes require `slow-powers:working-in-isolation` before starting.\" and \"Set the next step correctly to `slow-powers:working-in-isolation`.\"",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "The final_message sets the corrected next step as 'slow-powers:working-in-isolation' and never mentions test-driven-development at all — TDD is neither imposed nor referenced anywhere in the response.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "No 'TBD', 'TODO', 'later', or 'if needed' language appears anywhere in the final message. Every task is concrete: Task 9 even explicitly instructs 'Identify those routes now (do not defer)'. The angle-bracket tokens in Task 3 (e.g. '<paste from Google Console>') are engineer instructions, not deferred decisions.",
7
+ "confidence": 0.97,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior (new auth routes, session handling, database writes). Before the engineer writes any implementation code, they must complete `slow-powers:test-driven-development`.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "No 'TBD', 'TODO', 'later', or 'if needed' strings appear anywhere in the final_message. Every task supplies concrete file paths (e.g. 'src/auth/passport.js'), package names with version-pinning instructions, explicit configuration values ('resave: false', 'saveUninitialized: false'), and step-by-step actions. Template tokens like '<provider>' are intentional decision-point markers resolved by Task 1, not unspecified implementation items.",
7
+ "confidence": 0.95,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": false,
13
+ "evidence": "The final_message contains no mention of 'slow-powers:test-driven-development', no hand-off to the TDD skill, and no use of the label 'TDD' anywhere. Task 10 ('Write tests') describes what tests to write but names no implementation skill and makes no explicit hand-off.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 1,
20
+ "failed": 1,
21
+ "total": 2,
22
+ "pass_rate": 0.5
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_forced_next_skill",
5
+ "passed": true,
6
+ "evidence": "The final_message contains no mention of slow-powers:test-driven-development or slow-powers:working-in-isolation as required next steps; it closes with 'No repository or pipeline changes are required during the investigation,' confirming the response treats this as a pure read-only investigation with no forced skill gate.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "Skill invocation verified from transcript.",
22
+ "confidence": 1,
23
+ "grader": "transcript_check"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,32 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_forced_next_skill",
5
+ "passed": true,
6
+ "evidence": "The final_message ends with 'Once you've worked through these steps, share what you find and we can scope the fixes.' — no mention of slow-powers:test-driven-development or slow-powers:working-in-isolation as a required next step anywhere in the response.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ }
10
+ ],
11
+ "summary": {
12
+ "passed": 1,
13
+ "failed": 0,
14
+ "total": 1,
15
+ "pass_rate": 1
16
+ },
17
+ "meta_results": [
18
+ {
19
+ "id": "__skill_invoked",
20
+ "passed": true,
21
+ "evidence": "Skill invocation verified from transcript.",
22
+ "confidence": 1,
23
+ "grader": "transcript_check"
24
+ }
25
+ ],
26
+ "meta_summary": {
27
+ "passed": 1,
28
+ "failed": 0,
29
+ "total": 1,
30
+ "skill_invoked": true
31
+ }
32
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "Every section of the plan is fully concrete: file paths are specified ('src/components/TodoItem.tsx', 'src/components/TodoList.tsx', 'src/components/AddTodoForm.tsx', etc.), the data shape is typed with exact fields, prop interfaces are written out, handler logic is described step-by-step, test cases enumerate specific scenarios including edge cases ('Persistence — corrupt key: set localStorage.getItem(\"todos\") to \"not-json\" before mounting; assert the component mounts with an empty list and does not throw'), and the smoke test is a numbered checklist. No occurrences of 'TBD', 'TODO', 'later', 'if needed', or equivalent placeholder language appear anywhere in the final_message.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "### Required next skill\n\nThis plan describes a functional code change (adds runtime behavior). The required next step is `slow-powers:test-driven-development`.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": false,
6
+ "evidence": "The agent explicitly labels the file paths as placeholders: 'I’ve added placeholder paths below using the conventional `src/components/` location — confirm or correct these before we proceed.' The plan therefore contains a declared placeholder rather than a confirmed concrete value.",
7
+ "confidence": 0.97,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "\"This is a functional code change, so **`slow-powers:test-driven-development` is required next** — write the tests in Task 5 before touching the implementation files. TDD will also set up the isolated workspace.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 1,
20
+ "failed": 1,
21
+ "total": 2,
22
+ "pass_rate": 0.5
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "The final_message contains no placeholders — every step is fully specified: file paths are named (e.g. 'src/components/TodoItem.tsx'), props and state shapes are enumerated, handler logic is described in concrete terms (e.g. 'appends { id: crypto.randomUUID(), text, completed: false } to todos'), and all five test cases are spelled out with explicit setup and assertion steps. No 'TBD', 'TODO', 'later', 'if needed', or equivalent deferral language appears anywhere.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "\"REQUIRED NEXT: `slow-powers:working-in-isolation` (to establish an isolated branch), then `slow-powers:test-driven-development` for the implementation phase.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "no_placeholders",
5
+ "passed": true,
6
+ "evidence": "The final_message contains no instances of 'TBD', 'TODO', 'later', or 'if needed'. Every task specifies concrete file names, TypeScript interfaces, JSX snippets, and handler logic. The one vague phrase — 'max-width as appropriate for the app's existing style' in the TodoList.css description — is a style guidance note, not a placeholder that defers implementation.",
7
+ "confidence": 0.9,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "hands_off_to_tdd",
12
+ "passed": true,
13
+ "evidence": "\"This plan adds runtime behavior, so the implementation phase must follow `slow-powers:test-driven-development` (Task 1 above is the TDD entry point — tests before code). That skill also requires `slow-powers:working-in-isolation` before touching any files.\"",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -3,21 +3,21 @@
3
3
  {
4
4
  "id": "catches_hallucinated_file",
5
5
  "passed": true,
6
- "evidence": "\"Unverified file reference. The draft said 'extend the *existing* `src/hooks/useLocalStorage.ts`' — I have no confirmation this file exists in your project. Reworded to 'create or extend' so the step is honest about what will happen.\"",
6
+ "evidence": "The final_message explicitly calls out 'Hallucinated file references — two files cited as \"existing\" without verification.' listing `src/hooks/useLocalStorage.ts` and noting 'Neither was confirmed to exist.' The corrected plan's Step 1 reads: 'Confirm `src/hooks/useLocalStorage.ts` exists. If it does not, create it…'",
7
7
  "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "catches_irrelevant_step",
12
12
  "passed": true,
13
- "evidence": "\"Hallucinated Redux dependency (Red Flag). Step 4 introduced a Redux store and todo slice that you never asked for. Redux is a different state management pattern that contradicts the localStorage-first approach in step 1. Removed.\"",
13
+ "evidence": "**Unrequested dependency — Redux introduced with no basis in the spec.** The request is for localStorage persistence. A Redux slice is an invented architectural choice. It also creates structural incoherence: localStorage (Step 1) and Redux (Step 4) would both manage the same state.",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  },
17
17
  {
18
18
  "id": "hands_off_to_tdd",
19
19
  "passed": true,
20
- "evidence": "\"Ready to start? I'll follow `test-driven-development` for the implementation phase.\"",
20
+ "evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis is a functional code change that adds runtime behavior. TDD governs the implementation phase.",
21
21
  "confidence": 1,
22
22
  "grader": "llm_judge"
23
23
  }
@@ -2,31 +2,31 @@
2
2
  "assertion_results": [
3
3
  {
4
4
  "id": "catches_hallucinated_file",
5
- "passed": false,
6
- "evidence": "The file map states 'Already exists; verify signature covers generics.' and Task 2 says 'Open src/hooks/useLocalStorage.ts. Confirm the exported signature...' — the response treats the file's existence as a given inherited from the draft, only adding a signature-verification step, never questioning whether the file actually exists at all.",
7
- "confidence": 0.92,
5
+ "passed": true,
6
+ "evidence": "\"Hallucinated file references (Red Flag): The plan said 'extend the existing `src/hooks/useLocalStorage.ts`' and 'add Redux store configuration in `src/store/index.ts`' — neither was verified to exist. The plan now flags both for explicit confirmation before coding begins.\" and the finalized plan includes a prerequisite verification step: \"Check whether `src/hooks/useLocalStorage.ts` exists. If yes, extend it. If no, create it at that path.\"",
7
+ "confidence": 1,
8
8
  "grader": "llm_judge"
9
9
  },
10
10
  {
11
11
  "id": "catches_irrelevant_step",
12
12
  "passed": true,
13
- "evidence": "Redux introduced without justification. Step 4 added a Redux store slice for a feature that only needs `useLocalStorage`. Introducing Redux adds three files and a new dependency for no benefit here. I've dropped it and kept state local to the `TodoList` component, backed by the hook.",
13
+ "evidence": "\"Unrequested Redux dependency: Step 4 introduced a Redux slice and store wiring that the user never asked for and that useState + useLocalStorage handles without additional infrastructure. Removed entirely — it was an irrelevant step.\"",
14
14
  "confidence": 1,
15
15
  "grader": "llm_judge"
16
16
  },
17
17
  {
18
18
  "id": "hands_off_to_tdd",
19
19
  "passed": true,
20
- "evidence": "\"For all code-writing tasks below, follow `slow-powers:test-driven-development` (Red-Green-Refactor).\" and \"load `slow-powers:test-driven-development` when you reach the code-writing steps\" — both replace the bare '(TDD.)' label from the draft with an explicit named skill hand-off.",
20
+ "evidence": "**Next step:** This is a functional code change. Required next skill: `slow-powers:test-driven-development`.",
21
21
  "confidence": 1,
22
22
  "grader": "llm_judge"
23
23
  }
24
24
  ],
25
25
  "summary": {
26
- "passed": 2,
27
- "failed": 1,
26
+ "passed": 3,
27
+ "failed": 0,
28
28
  "total": 3,
29
- "pass_rate": 0.6666666666666666
29
+ "pass_rate": 1
30
30
  },
31
31
  "meta_results": [
32
32
  {
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation` — complete this before starting, so the reorganization branch doesn't collide with other in-flight work.",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "The response names 'REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`' as the required gate and never mentions test-driven-development. The only reference to tests is in step 6/Verify: 'Run: full test suite; tsc --noEmit; linter' — using existing tests to confirm no behavior change, not demanding tests-first authorship.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "assertion_results": [
3
+ {
4
+ "id": "routes_to_working_in_isolation",
5
+ "passed": true,
6
+ "evidence": "\"Before you start: because this is a structural, non-functional change, set up an isolated branch/worktree first (`slow-powers:working-in-isolation`) so this work doesn't collide with other in-progress branches.\"",
7
+ "confidence": 1,
8
+ "grader": "llm_judge"
9
+ },
10
+ {
11
+ "id": "does_not_force_tdd",
12
+ "passed": true,
13
+ "evidence": "Task 5 says 'Run the full test suite to confirm no behavior changes were introduced' — running existing tests to validate a refactor, not imposing TDD. The only skill referenced is 'slow-powers:working-in-isolation'; test-driven-development is never mentioned or required.",
14
+ "confidence": 1,
15
+ "grader": "llm_judge"
16
+ }
17
+ ],
18
+ "summary": {
19
+ "passed": 2,
20
+ "failed": 0,
21
+ "total": 2,
22
+ "pass_rate": 1
23
+ },
24
+ "meta_results": [
25
+ {
26
+ "id": "__skill_invoked",
27
+ "passed": true,
28
+ "evidence": "Skill invocation verified from transcript.",
29
+ "confidence": 1,
30
+ "grader": "transcript_check"
31
+ }
32
+ ],
33
+ "meta_summary": {
34
+ "passed": 1,
35
+ "failed": 0,
36
+ "total": 1,
37
+ "skill_invoked": true
38
+ }
39
+ }
@@ -90,6 +90,52 @@
90
90
  "rubric": "Does the response make a real hand-off to the implementation skill — explicitly naming `slow-powers:test-driven-development` as the required next skill for the implementation phase — rather than merely dropping 'TDD' as an unexplained label or parenthetical? PASS if test-driven-development is named as the next/required skill for implementation. FAIL if 'TDD' appears only as a bare label with no hand-off, or is absent. (The plan is NOT required to sequence each task tests-first; that is implementer discipline owned by the TDD skill, not plan structure.)"
91
91
  }
92
92
  ]
93
+ },
94
+ {
95
+ "id": "docs-refactor-plan-mode",
96
+ "prompt": "[The following is the conversation so far in this session. You are the assistant operating inside the harness's plan-mode workflow — you have already explored the repo and drafted a plan. Continue from the final user turn.]\n\nUser: Our docs have drifted. I want a big restructure of the README and the docs/ tree — split the monolithic README into topic pages, rewrite the getting-started guide, and fix the cross-links. Draft a plan I can review before you touch any files.\n\nAssistant: I've read through the docs and drafted the restructure plan.\n\n## Approach\n1. Split `README.md` into `docs/install.md`, `docs/usage.md`, and `docs/faq.md`, leaving a short overview in `README.md`.\n2. Rewrite `docs/getting-started.md` against the current CLI flags.\n3. Update every internal cross-link to the new page locations.\n\n## Next step\nThis is a docs-only change — there's no code and nothing to test, so I'll skip the slow-powers skills and just start editing on the current branch once you approve. Working in a separate workspace would be overkill for prose.\n\nUser: Approach looks right. Give me the finalized plan and let's go.",
97
+ "expected_output": "The agent should load `hardening-plans` and run the fresh-eyes review on the drafted docs-restructure plan (spec coverage, referenced files real, every step relevant, no placeholders, names consistent). Crucially, on the next-step routing it should NOT force `slow-powers:test-driven-development` (there is no functional code change) and should NOT accept the seeded 'skip the skills, edit on the current branch' rationalization: a docs/skill rewrite is a non-mechanical, non-functional change, so the required next skill is `slow-powers:working-in-isolation`. This mirrors a real audited session (issue #188) where a docs refactor wrongly talked itself out of isolation. This is a seeded probe, so a pass is a stronger-than-cold signal, not ground truth (see the 'Seeding conversation context (and its ceiling)' note in evaluating-skills). skill_should_trigger stays true.",
98
+ "assertions": [
99
+ {
100
+ "id": "routes_to_working_in_isolation",
101
+ "type": "llm_judge",
102
+ "rubric": "Does the response name `slow-powers:working-in-isolation` as the required next step before editing — correcting the seeded plan's 'skip the skills and edit on the current branch' stance? PASS if working-in-isolation is named as the required/next skill for this docs change. FAIL if it endorses skipping isolation, or names no next skill at all."
103
+ },
104
+ {
105
+ "id": "does_not_force_tdd",
106
+ "type": "llm_judge",
107
+ "rubric": "Does the response correctly refrain from demanding `slow-powers:test-driven-development` as the required next skill for this docs-only change? PASS if TDD is not imposed as the required next step (it may be mentioned only to explain it does not apply). FAIL if the response forces test-driven-development as the next gate for a change with no runtime behavior."
108
+ }
109
+ ]
110
+ },
111
+ {
112
+ "id": "structural-refactor-cold",
113
+ "prompt": "I'm about to reorganize our backend source. Right now everything sits flat in `src/` — I want to move the files into module sub-directories (`src/auth/`, `src/billing/`, `src/users/`, etc.) and fix the imports, with no behavior changes. Before I start, break this into a plan I can review.",
114
+ "expected_output": "The agent should load `hardening-plans` and produce a concrete refactor plan (every referenced path real or a real named home, no placeholders, every step relevant to the move). On next-step routing, this is a structural, non-functional code change: existing tests act as a continuous green flag and there is no new behavior to drive tests-first, so the agent should NOT force `slow-powers:test-driven-development`. The required next skill is `slow-powers:working-in-isolation` (a large file move must be isolated from other work). This mirrors a real audited session (issue #188). This is a colder trigger: the request frames the agent as already about to plan, so the skill must fire on the about-to-hand-off moment.",
115
+ "assertions": [
116
+ {
117
+ "id": "routes_to_working_in_isolation",
118
+ "type": "llm_judge",
119
+ "rubric": "Does the response name `slow-powers:working-in-isolation` as the required next step before starting the refactor? PASS if working-in-isolation is named as the required/next skill. FAIL if no next skill is named, or only TDD is named."
120
+ },
121
+ {
122
+ "id": "does_not_force_tdd",
123
+ "type": "llm_judge",
124
+ "rubric": "Does the response correctly refrain from demanding `slow-powers:test-driven-development` as the required next skill for this behavior-preserving refactor? PASS if TDD is not imposed as the required next gate (it may note that existing tests verify the move). FAIL if the response forces test-driven-development tests-first for a refactor that adds no behavior."
125
+ }
126
+ ]
127
+ },
128
+ {
129
+ "id": "research-plan-no-required-skill",
130
+ "prompt": "Before we change anything, I want to understand why our CI pipeline got slow over the last month. Draft a plan for the investigation — what to measure, which logs and timing data to pull, and how to narrow down the regression. No code changes yet, just the investigation steps.",
131
+ "expected_output": "The agent should load `hardening-plans` and harden the investigation plan (concrete measurement steps, real log/artifact references, no placeholders, every step relevant to finding the regression). On next-step routing, this plan is purely informational — research and running commands, no code change — so the correct outcome is that NO follow-up skill is forced: neither `slow-powers:test-driven-development` nor `slow-powers:working-in-isolation` should be imposed as a required next gate. The agent should route to 'follow your instincts' for the investigation itself.",
132
+ "assertions": [
133
+ {
134
+ "id": "no_forced_next_skill",
135
+ "type": "llm_judge",
136
+ "rubric": "Does the response correctly avoid imposing any required follow-up slow-powers skill for this research/investigation plan? PASS if it does not mandate `slow-powers:test-driven-development` or `slow-powers:working-in-isolation` as a required next step (it may mention that no required skill applies). FAIL if it forces either skill as the required next gate for a pure investigation with no code change."
137
+ }
138
+ ]
93
139
  }
94
140
  ]
95
141
  }
@@ -23,7 +23,7 @@ A null delta here is **not** evidence the reframe failed (the eval-seeding issue
23
23
  says so explicitly). It is two stacked measurement ceilings:
24
24
 
25
25
  1. **The runner over-promotes invocation.** `buildDispatchTask` in
26
- `skills/evaluating-skills/runner/run.ts` puts a *constant* instruction in the
26
+ `@slowdini/eval-runner`'s `src/run.ts` puts a *constant* instruction in the
27
27
  `with_skill` arm: *"the skill … is staged under the unique slug … — invoke that
28
28
  slug … if the skill applies."* That hint is identical across both `--bootstrap`
29
29
  variants, so it cancels in the delta but pins the invocation floor near 100%.
@@ -60,7 +60,7 @@ Roughly in increasing order of effort / payoff:
60
60
  class of eval measurable. This is the high-value framework improvement.
61
61
  3. **Real harness-mode injection.** Reproduce the plan-mode suppression by running
62
62
  the eval subagent *inside* a real plan mode rather than a described one. Tracked
63
- as a parity goal in `skills/evaluating-skills/harness-parity.md`; the biggest lift.
63
+ as a parity goal in the `@slowdini/eval-runner` docs (`docs/harness-parity.md`); the biggest lift.
64
64
 
65
65
  ## Bigger-picture testing strategy (from the maintainer)
66
66