@slowdini/slow-powers-opencode 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/opencode/plugins/slow-powers.js +68 -4
- package/package.json +1 -1
- package/skills/hardening-plans/SKILL.md +29 -7
- package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
- package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
- package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/evals.json +46 -0
|
@@ -3,8 +3,10 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Injects slow-powers bootstrap context via system prompt transform.
|
|
5
5
|
* Auto-registers skills directory via config hook (no symlinks needed).
|
|
6
|
+
* Intercepts plan file writes in plan mode and triggers hardening-plans skill.
|
|
6
7
|
*/
|
|
7
8
|
|
|
9
|
+
import { createHash } from "node:crypto";
|
|
8
10
|
import fs from "node:fs";
|
|
9
11
|
import path from "node:path";
|
|
10
12
|
import { fileURLToPath } from "node:url";
|
|
@@ -22,10 +24,12 @@ const bootstrapLeadingPhrase = "<EXTREMELY-IMPORTANT>";
|
|
|
22
24
|
// once eliminates redundant fs work on every agent step.
|
|
23
25
|
let _bootstrapCache; // undefined = not yet loaded, null = file missing
|
|
24
26
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
// Deduplication state for plan hardening
|
|
28
|
+
// Map<filePath, contentHash> - tracks processed plan file versions
|
|
29
|
+
const processedPlanHashes = new Map();
|
|
30
|
+
const HARDENED_MARKER = "<!-- hardened-plans -->";
|
|
31
|
+
|
|
32
|
+
export const SlowPowersPlugin = async ({ client, directory: _directory }) => {
|
|
29
33
|
// Helper to load bootstrap content (cached after first call)
|
|
30
34
|
const getBootstrapContent = () => {
|
|
31
35
|
if (_bootstrapCache !== undefined) return _bootstrapCache;
|
|
@@ -40,6 +44,61 @@ export const SlowPowersPlugin = async ({
|
|
|
40
44
|
return _bootstrapCache;
|
|
41
45
|
};
|
|
42
46
|
|
|
47
|
+
const hashContent = (content) =>
|
|
48
|
+
createHash("sha256").update(content).digest("hex");
|
|
49
|
+
|
|
50
|
+
const isPlanHardened = (content) => content.includes(HARDENED_MARKER);
|
|
51
|
+
|
|
52
|
+
const handlePlanFileEdit = async (event) => {
|
|
53
|
+
const filePath = event.properties.file;
|
|
54
|
+
const sessionID = event.properties.sessionID;
|
|
55
|
+
|
|
56
|
+
if (!filePath || !sessionID) return;
|
|
57
|
+
|
|
58
|
+
if (!filePath.match(/\.opencode\/plans\/.*\.md$/)) return;
|
|
59
|
+
|
|
60
|
+
let session;
|
|
61
|
+
try {
|
|
62
|
+
session = await client.session.get({ path: { id: sessionID } });
|
|
63
|
+
} catch {
|
|
64
|
+
return;
|
|
65
|
+
}
|
|
66
|
+
if (session.agent !== "plan") return;
|
|
67
|
+
|
|
68
|
+
let content;
|
|
69
|
+
try {
|
|
70
|
+
content = fs.readFileSync(filePath, "utf8");
|
|
71
|
+
} catch {
|
|
72
|
+
return;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
if (isPlanHardened(content)) return;
|
|
76
|
+
|
|
77
|
+
const contentHash = hashContent(content);
|
|
78
|
+
const previousHash = processedPlanHashes.get(filePath);
|
|
79
|
+
if (previousHash === contentHash) return;
|
|
80
|
+
|
|
81
|
+
processedPlanHashes.set(filePath, contentHash);
|
|
82
|
+
|
|
83
|
+
try {
|
|
84
|
+
await client.session.prompt({
|
|
85
|
+
path: { id: sessionID },
|
|
86
|
+
body: {
|
|
87
|
+
noReply: true,
|
|
88
|
+
parts: [
|
|
89
|
+
{
|
|
90
|
+
type: "text",
|
|
91
|
+
text: `The plan at ${filePath} has been written. Please run the hardening-plans skill on this plan file to review it for hallucinations, missing file references, vague steps, and coverage gaps before presenting it. Update the file in place with the hardened version. Add ${HARDENED_MARKER} marker when done.`,
|
|
92
|
+
},
|
|
93
|
+
],
|
|
94
|
+
},
|
|
95
|
+
});
|
|
96
|
+
} catch (err) {
|
|
97
|
+
processedPlanHashes.delete(filePath);
|
|
98
|
+
console.error("[slow-powers] Failed to trigger hardening-plans:", err);
|
|
99
|
+
}
|
|
100
|
+
};
|
|
101
|
+
|
|
43
102
|
return {
|
|
44
103
|
// Inject skills path into live config so OpenCode discovers slow-powers skills
|
|
45
104
|
// without requiring manual symlinks or config file edits.
|
|
@@ -82,5 +141,10 @@ export const SlowPowersPlugin = async ({
|
|
|
82
141
|
|
|
83
142
|
firstUser.parts.unshift({ type: "text", text: bootstrap });
|
|
84
143
|
},
|
|
144
|
+
|
|
145
|
+
event: async ({ event }) => {
|
|
146
|
+
if (event.type !== "file.edited") return;
|
|
147
|
+
await handlePlanFileEdit(event);
|
|
148
|
+
},
|
|
85
149
|
};
|
|
86
150
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@slowdini/slow-powers-opencode",
|
|
3
|
-
"version": "0.4.0",
|
|
3
|
+
"version": "0.4.1",
|
|
4
4
|
"description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./opencode/plugins/slow-powers.js",
|
|
@@ -40,13 +40,30 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
|
|
|
40
40
|
|
|
41
41
|
---
|
|
42
42
|
|
|
43
|
-
##
|
|
44
|
-
|
|
45
|
-
When the plan is approved,
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
43
|
+
## Choosing the Next Step
|
|
44
|
+
|
|
45
|
+
When the plan is approved, the next move depends on what kind of work the plan
|
|
46
|
+
describes. Route by the **first** branch that matches — then **close your hand-off
|
|
47
|
+
by naming the required next skill verbatim**, with its `slow-powers:` identifier.
|
|
48
|
+
Telling the reader to "work on a branch" or "write the tests first" in your own
|
|
49
|
+
words is *not* the hand-off; the named skill is. State it even on a cold draft
|
|
50
|
+
where nothing pushed back — the named hand-off is part of the deliverable, not a
|
|
51
|
+
rebuttal you only produce when someone argued against it.
|
|
52
|
+
|
|
53
|
+
```mermaid
|
|
54
|
+
flowchart TD
|
|
55
|
+
start([Plan approved]) --> functional{Functional code change?<br/>adds or changes runtime behavior}
|
|
56
|
+
functional -->|yes| tdd[REQUIRED NEXT:<br/>slow-powers:test-driven-development]
|
|
57
|
+
functional -->|no| nonmech{Non-mechanical & non-functional?<br/>refactor, file move, docs/skill rewrite}
|
|
58
|
+
nonmech -->|yes| isolation[REQUIRED NEXT:<br/>slow-powers:working-in-isolation]
|
|
59
|
+
nonmech -->|no| none[No required skill —<br/>follow your instincts]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
* **Functional code change** — adds or changes runtime behavior.
|
|
63
|
+
> **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next, for the implementation phase. The plan should carry a tests section so the reader can see *what* will be verified, but *when* tests get written is implementer discipline TDD owns at execution time — not plan structure. (TDD in turn requires `slow-powers:working-in-isolation`, so isolation still happens on this path.)
|
|
64
|
+
* **Non-mechanical, non-functional change** — a structural code change (refactor, file move), a docs or skill change, or any other substantive update that doesn't alter runtime behavior.
|
|
65
|
+
> **REQUIRED NEXT SKILL:** You must complete `slow-powers:working-in-isolation` next, before you start. TDD has no green to chase here, but the work still collides with other branches if it isn't isolated.
|
|
66
|
+
* **Informational or trivial/mechanical** — the plan is to research, run commands, or make a trivial/mechanical fix (merge-conflict cleanup, test fixups, typos). No required next skill; follow your instincts.
|
|
50
67
|
|
|
51
68
|
---
|
|
52
69
|
|
|
@@ -57,6 +74,8 @@ The plan should carry a tests section so the reader can see *what* will be verif
|
|
|
57
74
|
* The plan contains "TBD", "TODO", "later", "if needed", "appropriate", or "etc."
|
|
58
75
|
* The same thing is named two different ways across tasks.
|
|
59
76
|
* You wrote "similar to Task N" instead of restating the content.
|
|
77
|
+
* TDD doesn't fit the work, so you're about to skip straight to coding with no skill at all — non-functional work still routes to `slow-powers:working-in-isolation`; only the informational/trivial branch frees you.
|
|
78
|
+
* Your plan closes with isolation or testing advice in your own words but never names the required next skill — paraphrasing the practice isn't the hand-off; name `slow-powers:working-in-isolation` (or `slow-powers:test-driven-development`).
|
|
60
79
|
|
|
61
80
|
If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approval comes from a plan that holds up to scrutiny, not from optimism.
|
|
62
81
|
|
|
@@ -70,3 +89,6 @@ If you hit a Red Flag: stop and fix it before the plan leaves your hands. Approv
|
|
|
70
89
|
| "That file is probably where I said it is." | "Probably" isn't verified. Check it before the user does. |
|
|
71
90
|
| "The plan reads fine — I don't need to re-review it." | You wrote it, so you're blind to its gaps. Re-read it as someone who has to execute it. |
|
|
72
91
|
| "Repeating context across similar tasks is wasteful." | The reader may read tasks out of order. Restate the relevant detail. |
|
|
92
|
+
| "It's just docs / a refactor — it doesn't need isolation." | Non-mechanical changes still collide with other work. Route by the flowchart: structural and docs changes get `slow-powers:working-in-isolation`. |
|
|
93
|
+
| "TDD doesn't apply, so no skill applies." | TDD is only the *functional* branch. Non-functional, non-mechanical work still has a required next skill — isolation. |
|
|
94
|
+
| "I told them to work on a branch / isolate the work — that covers it." | Generic isolation advice in your own words isn't the hand-off. Name `slow-powers:working-in-isolation` as the required next skill — the named hand-off is the deliverable, on a cold draft as much as a contested one. |
|
|
@@ -1,20 +1,25 @@
|
|
|
1
1
|
# Baseline — hardening-plans
|
|
2
2
|
|
|
3
3
|
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
-
`
|
|
5
|
-
dispatch files, produced outputs) stays gitignored under `skills-workspace
|
|
4
|
+
`skill-eval promote-baseline --skill hardening-plans --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
|
+
dispatch files, produced outputs) stays gitignored under `skills-workspace/`
|
|
6
|
+
and is reclaimable by `skill-eval teardown` once promoted (this commit's marker).
|
|
6
7
|
|
|
7
8
|
| Field | Value |
|
|
8
9
|
|-------|-------|
|
|
9
10
|
| Mode | revision |
|
|
10
|
-
| Iteration | iteration-
|
|
11
|
+
| Iteration | iteration-2 |
|
|
11
12
|
| Harness | claude-code |
|
|
12
13
|
| Agent model | claude-sonnet-4-6 |
|
|
13
14
|
| Judge model | claude-sonnet-4-6 |
|
|
14
15
|
| Conditions | old_skill, new_skill |
|
|
15
|
-
| Run timestamp | 2026-
|
|
16
|
-
| Label |
|
|
17
|
-
| Promoted from commit |
|
|
16
|
+
| Run timestamp | 2026-06-07T04:07:44.660Z |
|
|
17
|
+
| Label | next-step-named-handoff |
|
|
18
|
+
| Promoted from commit | 7dc77dd |
|
|
19
|
+
|
|
20
|
+
`old_skill` = `next-step-v1` (commit `b62c4cd`, the next-step flowchart **without**
|
|
21
|
+
the named-hand-off requirement). `new_skill` = the working tree at promotion
|
|
22
|
+
(`7dc77dd`, flowchart **with** the named-hand-off requirement).
|
|
18
23
|
|
|
19
24
|
Files:
|
|
20
25
|
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
@@ -1,65 +1,79 @@
|
|
|
1
|
-
# Notes — hardening-plans
|
|
1
|
+
# Notes — hardening-plans next-step baseline (iteration-2)
|
|
2
2
|
|
|
3
3
|
Forward-looking observations from the run that produced this baseline. Read these
|
|
4
4
|
before trusting the headline `benchmark.json` aggregate.
|
|
5
5
|
|
|
6
|
-
##
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
`
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
(
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
6
|
+
## What this baseline measures
|
|
7
|
+
|
|
8
|
+
Revision (Mode B), sonnet agent + sonnet judge, 8 cases, N=1 per case/condition:
|
|
9
|
+
|
|
10
|
+
- **`old_skill` = `next-step-v1`** (commit `b62c4cd`): the next-step *flowchart*
|
|
11
|
+
(functional → TDD, non-mechanical/non-functional → working-in-isolation,
|
|
12
|
+
informational/trivial → no skill) **without** an explicit instruction to emit a
|
|
13
|
+
*named* hand-off.
|
|
14
|
+
- **`new_skill`** (commit `7dc77dd`): same flowchart **plus** "close your hand-off
|
|
15
|
+
by naming the required next skill verbatim, even on a cold draft", the canonical
|
|
16
|
+
"You must complete … next" cross-references, and a red-flag + rationalization
|
|
17
|
+
closing the spirit-vs-letter loophole.
|
|
18
|
+
|
|
19
|
+
This is the **second** iteration of the issue #188 work. Iteration-1 compared the
|
|
20
|
+
flowchart (`b62c4cd`) against the *pre-flowchart* single-TDD-gate (`dev`); see the
|
|
21
|
+
"iteration-1 context" section below for why iteration-2 exists.
|
|
22
|
+
|
|
23
|
+
## Headline: clean sweep, but read the flakiness caveat
|
|
24
|
+
|
|
25
|
+
`new_skill` passed **8/8 cases, stddev 0** (100%) vs `old_skill` **87.5%**
|
|
26
|
+
(delta **+12.5pp** toward new; `benchmark.json` records this as `-0.125` because its delta direction is `old_skill - new_skill`). Both arms invoked the skill 100%; no
|
|
27
|
+
`validity_warnings`. `new_skill` also used slightly fewer tokens on average (mean 78,521 vs 84,608 total tokens).
|
|
28
|
+
|
|
29
|
+
The two `old_skill` misses were the noisy fresh-eyes assertions, not routing:
|
|
30
|
+
`seeded-plan-mode-todo-app-adversarial/no_placeholders` and
|
|
31
|
+
`oauth-task-breakdown-cold/hands_off_to_tdd`. `new_skill` passed both — the
|
|
32
|
+
strengthened "You must complete … next" phrasing plausibly firmed up the
|
|
33
|
+
functional TDD hand-off too — but at N=1 these are within run-to-run noise.
|
|
34
|
+
|
|
35
|
+
## The structural-refactor-cold caveat (the important one)
|
|
36
|
+
|
|
37
|
+
`structural-refactor-cold` is the case the iteration-2 edit targeted, and it is
|
|
38
|
+
**flaky at N=1**. The `routes_to_working_in_isolation` assertion on the
|
|
39
|
+
*identical* `b62c4cd` content flipped across runs:
|
|
40
|
+
|
|
41
|
+
| skill content | run | routes_to_working_in_isolation |
|
|
42
|
+
|---|---|---|
|
|
43
|
+
| `b62c4cd` (no named-hand-off line) | iteration-1 `new_skill` | **FAIL** (gave generic "set up an isolated branch" advice, never named the skill) |
|
|
44
|
+
| `b62c4cd` (no named-hand-off line) | iteration-2 `old_skill` | **PASS** (named the skill on its own) |
|
|
45
|
+
| `7dc77dd` (named-hand-off line) | iteration-2 `new_skill` | **PASS** (explicit "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`") |
|
|
46
|
+
|
|
47
|
+
So this single run does **not** cleanly attribute the cold-structural pass to the
|
|
48
|
+
edit: `old_skill` happened to pass it too. What the run *does* show is that
|
|
49
|
+
`new_skill` is **≥ `old_skill` on every case, swept 8/8 with zero variance, and
|
|
50
|
+
emitted the named hand-off on the cold draft** — with no regressions. Treat the
|
|
51
|
+
+12.5pp as "at least as good, and reliably named" rather than proof the edit beats
|
|
52
|
+
v1 *specifically on the flaky case*. **If you revisit this, replicate
|
|
53
|
+
`structural-refactor-cold` a few times per condition** (the runner has no per-case
|
|
54
|
+
run multiplier — use repeated `--only structural-refactor-cold` iterations) to
|
|
55
|
+
firm up the attribution.
|
|
56
|
+
|
|
57
|
+
## Iteration-1 context (why iteration-2 exists)
|
|
58
|
+
|
|
59
|
+
Iteration-1 (`dev` single-TDD-gate vs `b62c4cd` flowchart) showed the flowchart's
|
|
60
|
+
**clean win on the seeded #188 case**: `docs-refactor-plan-mode` — old talked
|
|
61
|
+
itself out of isolation (the audited #188 bug), new routed to
|
|
62
|
+
`slow-powers:working-in-isolation`. But it also exposed the gap this baseline
|
|
63
|
+
closes: on the **cold** `structural-refactor-cold` draft, the flowchart produced
|
|
64
|
+
isolation advice *in spirit* without *naming* the skill, failing the assertion.
|
|
65
|
+
That gap motivated the named-hand-off edit measured here.
|
|
66
|
+
|
|
67
|
+
## Noisy assertions to distrust at N=1
|
|
68
|
+
|
|
69
|
+
`no_placeholders` and `hands_off_to_tdd` scatter PASS/FAIL across both conditions
|
|
70
|
+
run-to-run; they are fresh-eyes/quality checks, not tests of the routing change.
|
|
71
|
+
Don't read a single-run flip on either as signal. The routing assertions
|
|
72
|
+
(`routes_to_working_in_isolation`, `does_not_force_tdd`, `no_forced_next_skill`)
|
|
73
|
+
are the ones this baseline exists to track.
|
|
60
74
|
|
|
61
75
|
## Provenance / scope
|
|
62
76
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
77
|
+
8-case full suite. Plan-mode injection **off** (the seeded cases carry plan
|
|
78
|
+
framing in prose; the `*-cold` and research cases are deliberately cold). Agent +
|
|
79
|
+
judge both `claude-sonnet-4-6`.
|
|
@@ -1,54 +1,54 @@
|
|
|
1
1
|
{
|
|
2
|
-
"generated": "2026-
|
|
2
|
+
"generated": "2026-06-07T04:14:04.136Z",
|
|
3
3
|
"mode": "revision",
|
|
4
|
-
"baseline": "
|
|
4
|
+
"baseline": "next-step-v1",
|
|
5
5
|
"conditions_compared": ["old_skill", "new_skill"],
|
|
6
6
|
"missing_gradings": 0,
|
|
7
7
|
"validity_warnings": [],
|
|
8
8
|
"run_summary": {
|
|
9
9
|
"old_skill": {
|
|
10
10
|
"pass_rate": {
|
|
11
|
-
"mean": 0.
|
|
12
|
-
"stddev": 0.
|
|
13
|
-
"n":
|
|
11
|
+
"mean": 0.875,
|
|
12
|
+
"stddev": 0.217,
|
|
13
|
+
"n": 8
|
|
14
14
|
},
|
|
15
15
|
"duration_ms": {
|
|
16
|
-
"mean":
|
|
17
|
-
"stddev":
|
|
18
|
-
"n":
|
|
16
|
+
"mean": 68065,
|
|
17
|
+
"stddev": 15551,
|
|
18
|
+
"n": 8
|
|
19
19
|
},
|
|
20
20
|
"total_tokens": {
|
|
21
|
-
"mean":
|
|
22
|
-
"stddev":
|
|
23
|
-
"n":
|
|
21
|
+
"mean": 84608,
|
|
22
|
+
"stddev": 6327,
|
|
23
|
+
"n": 8
|
|
24
24
|
},
|
|
25
|
-
"skill_invocation_n":
|
|
25
|
+
"skill_invocation_n": 8,
|
|
26
26
|
"skill_invocation_rate": 1
|
|
27
27
|
},
|
|
28
28
|
"new_skill": {
|
|
29
29
|
"pass_rate": {
|
|
30
|
-
"mean":
|
|
31
|
-
"stddev": 0
|
|
32
|
-
"n":
|
|
30
|
+
"mean": 1,
|
|
31
|
+
"stddev": 0,
|
|
32
|
+
"n": 8
|
|
33
33
|
},
|
|
34
34
|
"duration_ms": {
|
|
35
|
-
"mean":
|
|
36
|
-
"stddev":
|
|
37
|
-
"n":
|
|
35
|
+
"mean": 68376,
|
|
36
|
+
"stddev": 14889,
|
|
37
|
+
"n": 8
|
|
38
38
|
},
|
|
39
39
|
"total_tokens": {
|
|
40
|
-
"mean":
|
|
41
|
-
"stddev":
|
|
42
|
-
"n":
|
|
40
|
+
"mean": 78521,
|
|
41
|
+
"stddev": 16433,
|
|
42
|
+
"n": 8
|
|
43
43
|
},
|
|
44
|
-
"skill_invocation_n":
|
|
44
|
+
"skill_invocation_n": 8,
|
|
45
45
|
"skill_invocation_rate": 1
|
|
46
46
|
}
|
|
47
47
|
},
|
|
48
48
|
"delta": {
|
|
49
49
|
"direction": "old_skill - new_skill",
|
|
50
|
-
"pass_rate": 0.
|
|
51
|
-
"duration_ms":
|
|
52
|
-
"total_tokens":
|
|
50
|
+
"pass_rate": -0.125,
|
|
51
|
+
"duration_ms": -311,
|
|
52
|
+
"total_tokens": 6087
|
|
53
53
|
}
|
|
54
54
|
}
|
package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json
CHANGED
|
@@ -3,14 +3,14 @@
|
|
|
3
3
|
{
|
|
4
4
|
"id": "no_placeholders",
|
|
5
5
|
"passed": true,
|
|
6
|
-
"evidence": "
|
|
6
|
+
"evidence": "Every task in the plan is fully concrete: Task 1 gives exact CLI commands and filenames to delete; Task 2 provides the complete TypeScript interface; Task 3 specifies exact state shape, API signatures, and side-effect behavior; Tasks 4–6 give exact prop types, rendered element structure, and CSS class names; Task 7 specifies exact destructuring and JSX structure; Task 8 gives exact CSS property values. The tests section lists specific test cases per component. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the final_message.",
|
|
7
7
|
"confidence": 1,
|
|
8
8
|
"grader": "llm_judge"
|
|
9
9
|
},
|
|
10
10
|
{
|
|
11
11
|
"id": "hands_off_to_tdd",
|
|
12
12
|
"passed": true,
|
|
13
|
-
"evidence": "
|
|
13
|
+
"evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior, so TDD governs the implementation phase.",
|
|
14
14
|
"confidence": 1,
|
|
15
15
|
"grader": "llm_judge"
|
|
16
16
|
}
|
package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json
CHANGED
|
@@ -3,14 +3,14 @@
|
|
|
3
3
|
{
|
|
4
4
|
"id": "no_placeholders",
|
|
5
5
|
"passed": true,
|
|
6
|
-
"evidence": "
|
|
6
|
+
"evidence": "All steps are fully concrete: step 1 gives exact CLI commands (`npm create vite@latest todo-app -- --template react-ts`), step 2 provides a complete TypeScript interface, step 4 lists specific file paths and prop signatures, step 7 names exact npm packages and config keys, and step 9 gives verbatim build/test commands. No 'TBD', 'TODO', 'later', 'if needed', or any other placeholder appears anywhere in the plan.",
|
|
7
7
|
"confidence": 1,
|
|
8
8
|
"grader": "llm_judge"
|
|
9
9
|
},
|
|
10
10
|
{
|
|
11
11
|
"id": "hands_off_to_tdd",
|
|
12
12
|
"passed": true,
|
|
13
|
-
"evidence": "\"
|
|
13
|
+
"evidence": "\"Once you approve this plan, the next step is `slow-powers:test-driven-development` (this is a functional code change that adds runtime behavior).\"",
|
|
14
14
|
"confidence": 1,
|
|
15
15
|
"grader": "llm_judge"
|
|
16
16
|
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "routes_to_working_in_isolation",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation` — before any file is touched.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "does_not_force_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The final_message names exactly one required next skill: 'REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`'. There is no mention of test-driven-development anywhere in the response.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "routes_to_working_in_isolation",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "\"Wrong next step. I said 'I'll skip the slow-powers skills and just start editing on the current branch.' That's incorrect. A docs restructure of this scope ... is non-mechanical and non-functional. The routing rule is clear: non-mechanical, non-functional changes require `slow-powers:working-in-isolation` before starting.\" and \"Set the next step correctly to `slow-powers:working-in-isolation`.\"",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "does_not_force_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The final_message sets the corrected next step as 'slow-powers:working-in-isolation' and never mentions test-driven-development at all — TDD is neither imposed nor referenced anywhere in the response.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_placeholders",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "No 'TBD', 'TODO', 'later', or 'if needed' language appears anywhere in the final message. Every task is concrete: Task 9 even explicitly instructs 'Identify those routes now (do not defer)'. The angle-bracket tokens in Task 3 (e.g. '<paste from Google Console>') are engineer instructions, not deferred decisions.",
|
|
7
|
+
"confidence": 0.97,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "hands_off_to_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis plan adds runtime behavior (new auth routes, session handling, database writes). Before the engineer writes any implementation code, they must complete `slow-powers:test-driven-development`.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_placeholders",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "No 'TBD', 'TODO', 'later', or 'if needed' strings appear anywhere in the final_message. Every task supplies concrete file paths (e.g. 'src/auth/passport.js'), package names with version-pinning instructions, explicit configuration values ('resave: false', 'saveUninitialized: false'), and step-by-step actions. Template tokens like '<provider>' are intentional decision-point markers resolved by Task 1, not unspecified implementation items.",
|
|
7
|
+
"confidence": 0.95,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "hands_off_to_tdd",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "The final_message contains no mention of 'slow-powers:test-driven-development', no hand-off to the TDD skill, and no use of the label 'TDD' anywhere. Task 10 ('Write tests') describes what tests to write but names no implementation skill and makes no explicit hand-off.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 1,
|
|
20
|
+
"failed": 1,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 0.5
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_forced_next_skill",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The final_message contains no mention of slow-powers:test-driven-development or slow-powers:working-in-isolation as required next steps; it closes with 'No repository or pipeline changes are required during the investigation,' confirming the response treats this as a pure read-only investigation with no forced skill gate.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 1,
|
|
13
|
+
"failed": 0,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 1
|
|
16
|
+
},
|
|
17
|
+
"meta_results": [
|
|
18
|
+
{
|
|
19
|
+
"id": "__skill_invoked",
|
|
20
|
+
"passed": true,
|
|
21
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
22
|
+
"confidence": 1,
|
|
23
|
+
"grader": "transcript_check"
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"meta_summary": {
|
|
27
|
+
"passed": 1,
|
|
28
|
+
"failed": 0,
|
|
29
|
+
"total": 1,
|
|
30
|
+
"skill_invoked": true
|
|
31
|
+
}
|
|
32
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_forced_next_skill",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The final_message ends with 'Once you've worked through these steps, share what you find and we can scope the fixes.' — no mention of slow-powers:test-driven-development or slow-powers:working-in-isolation as a required next step anywhere in the response.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
}
|
|
10
|
+
],
|
|
11
|
+
"summary": {
|
|
12
|
+
"passed": 1,
|
|
13
|
+
"failed": 0,
|
|
14
|
+
"total": 1,
|
|
15
|
+
"pass_rate": 1
|
|
16
|
+
},
|
|
17
|
+
"meta_results": [
|
|
18
|
+
{
|
|
19
|
+
"id": "__skill_invoked",
|
|
20
|
+
"passed": true,
|
|
21
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
22
|
+
"confidence": 1,
|
|
23
|
+
"grader": "transcript_check"
|
|
24
|
+
}
|
|
25
|
+
],
|
|
26
|
+
"meta_summary": {
|
|
27
|
+
"passed": 1,
|
|
28
|
+
"failed": 0,
|
|
29
|
+
"total": 1,
|
|
30
|
+
"skill_invoked": true
|
|
31
|
+
}
|
|
32
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_placeholders",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "Every section of the plan is fully concrete: file paths are specified ('src/components/TodoItem.tsx', 'src/components/TodoList.tsx', 'src/components/AddTodoForm.tsx', etc.), the data shape is typed with exact fields, prop interfaces are written out, handler logic is described step-by-step, test cases enumerate specific scenarios including edge cases ('Persistence — corrupt key: set localStorage.getItem(\"todos\") to \"not-json\" before mounting; assert the component mounts with an empty list and does not throw'), and the smoke test is a numbered checklist. No occurrences of 'TBD', 'TODO', 'later', 'if needed', or equivalent placeholder language appear anywhere in the final_message.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "hands_off_to_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "### Required next skill\n\nThis plan describes a functional code change (adds runtime behavior). The required next step is `slow-powers:test-driven-development`.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_placeholders",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "The agent explicitly labels the file paths as placeholders: 'I’ve added placeholder paths below using the conventional `src/components/` location — confirm or correct these before we proceed.' The plan therefore contains a declared placeholder rather than a confirmed concrete value.",
|
|
7
|
+
"confidence": 0.97,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "hands_off_to_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "\"This is a functional code change, so **`slow-powers:test-driven-development` is required next** — write the tests in Task 5 before touching the implementation files. TDD will also set up the isolated workspace.\"",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 1,
|
|
20
|
+
"failed": 1,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 0.5
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_placeholders",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The final_message contains no placeholders — every step is fully specified: file paths are named (e.g. 'src/components/TodoItem.tsx'), props and state shapes are enumerated, handler logic is described in concrete terms (e.g. 'appends { id: crypto.randomUUID(), text, completed: false } to todos'), and all five test cases are spelled out with explicit setup and assertion steps. No 'TBD', 'TODO', 'later', 'if needed', or equivalent deferral language appears anywhere.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "hands_off_to_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "\"REQUIRED NEXT: `slow-powers:working-in-isolation` (to establish an isolated branch), then `slow-powers:test-driven-development` for the implementation phase.\"",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "no_placeholders",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "The final_message contains no instances of 'TBD', 'TODO', 'later', or 'if needed'. Every task specifies concrete file names, TypeScript interfaces, JSX snippets, and handler logic. The one vague phrase — 'max-width as appropriate for the app's existing style' in the TodoList.css description — is a style guidance note, not a placeholder that defers implementation.",
|
|
7
|
+
"confidence": 0.9,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "hands_off_to_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "\"This plan adds runtime behavior, so the implementation phase must follow `slow-powers:test-driven-development` (Task 1 above is the TDD entry point — tests before code). That skill also requires `slow-powers:working-in-isolation` before touching any files.\"",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json
CHANGED
|
@@ -3,21 +3,21 @@
|
|
|
3
3
|
{
|
|
4
4
|
"id": "catches_hallucinated_file",
|
|
5
5
|
"passed": true,
|
|
6
|
-
"evidence": "
|
|
6
|
+
"evidence": "The final_message explicitly calls out 'Hallucinated file references — two files cited as \"existing\" without verification.' listing `src/hooks/useLocalStorage.ts` and noting 'Neither was confirmed to exist.' The corrected plan's Step 1 reads: 'Confirm `src/hooks/useLocalStorage.ts` exists. If it does not, create it…'",
|
|
7
7
|
"confidence": 1,
|
|
8
8
|
"grader": "llm_judge"
|
|
9
9
|
},
|
|
10
10
|
{
|
|
11
11
|
"id": "catches_irrelevant_step",
|
|
12
12
|
"passed": true,
|
|
13
|
-
"evidence": "
|
|
13
|
+
"evidence": "**Unrequested dependency — Redux introduced with no basis in the spec.** The request is for localStorage persistence. A Redux slice is an invented architectural choice. It also creates structural incoherence: localStorage (Step 1) and Redux (Step 4) would both manage the same state.",
|
|
14
14
|
"confidence": 1,
|
|
15
15
|
"grader": "llm_judge"
|
|
16
16
|
},
|
|
17
17
|
{
|
|
18
18
|
"id": "hands_off_to_tdd",
|
|
19
19
|
"passed": true,
|
|
20
|
-
"evidence": "
|
|
20
|
+
"evidence": "**REQUIRED NEXT SKILL:** `slow-powers:test-driven-development`\n\nThis is a functional code change that adds runtime behavior. TDD governs the implementation phase.",
|
|
21
21
|
"confidence": 1,
|
|
22
22
|
"grader": "llm_judge"
|
|
23
23
|
}
|
package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json
CHANGED
|
@@ -2,31 +2,31 @@
|
|
|
2
2
|
"assertion_results": [
|
|
3
3
|
{
|
|
4
4
|
"id": "catches_hallucinated_file",
|
|
5
|
-
"passed":
|
|
6
|
-
"evidence": "
|
|
7
|
-
"confidence":
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "\"Hallucinated file references (Red Flag): The plan said 'extend the existing `src/hooks/useLocalStorage.ts`' and 'add Redux store configuration in `src/store/index.ts`' — neither was verified to exist. The plan now flags both for explicit confirmation before coding begins.\" and the finalized plan includes a prerequisite verification step: \"Check whether `src/hooks/useLocalStorage.ts` exists. If yes, extend it. If no, create it at that path.\"",
|
|
7
|
+
"confidence": 1,
|
|
8
8
|
"grader": "llm_judge"
|
|
9
9
|
},
|
|
10
10
|
{
|
|
11
11
|
"id": "catches_irrelevant_step",
|
|
12
12
|
"passed": true,
|
|
13
|
-
"evidence": "Redux
|
|
13
|
+
"evidence": "\"Unrequested Redux dependency: Step 4 introduced a Redux slice and store wiring that the user never asked for and that useState + useLocalStorage handles without additional infrastructure. Removed entirely — it was an irrelevant step.\"",
|
|
14
14
|
"confidence": 1,
|
|
15
15
|
"grader": "llm_judge"
|
|
16
16
|
},
|
|
17
17
|
{
|
|
18
18
|
"id": "hands_off_to_tdd",
|
|
19
19
|
"passed": true,
|
|
20
|
-
"evidence": "
|
|
20
|
+
"evidence": "**Next step:** This is a functional code change. Required next skill: `slow-powers:test-driven-development`.",
|
|
21
21
|
"confidence": 1,
|
|
22
22
|
"grader": "llm_judge"
|
|
23
23
|
}
|
|
24
24
|
],
|
|
25
25
|
"summary": {
|
|
26
|
-
"passed":
|
|
27
|
-
"failed":
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
28
|
"total": 3,
|
|
29
|
-
"pass_rate":
|
|
29
|
+
"pass_rate": 1
|
|
30
30
|
},
|
|
31
31
|
"meta_results": [
|
|
32
32
|
{
|
package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "routes_to_working_in_isolation",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "REQUIRED NEXT SKILL: `slow-powers:working-in-isolation` — complete this before starting, so the reorganization branch doesn't collide with other in-flight work.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "does_not_force_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The response names 'REQUIRED NEXT SKILL: `slow-powers:working-in-isolation`' as the required gate and never mentions test-driven-development. The only reference to tests is in step 6/Verify: 'Run: full test suite; tsc --noEmit; linter' — using existing tests to confirm no behavior change, not demanding tests-first authorship.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "routes_to_working_in_isolation",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "\"Before you start: because this is a structural, non-functional change, set up an isolated branch/worktree first (`slow-powers:working-in-isolation`) so this work doesn't collide with other in-progress branches.\"",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "does_not_force_tdd",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "Task 5 says 'Run the full test suite to confirm no behavior changes were introduced' — running existing tests to validate a refactor, not imposing TDD. The only skill referenced is 'slow-powers:working-in-isolation'; test-driven-development is never mentioned or required.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -90,6 +90,52 @@
|
|
|
90
90
|
"rubric": "Does the response make a real hand-off to the implementation skill — explicitly naming `slow-powers:test-driven-development` as the required next skill for the implementation phase — rather than merely dropping 'TDD' as an unexplained label or parenthetical? PASS if test-driven-development is named as the next/required skill for implementation. FAIL if 'TDD' appears only as a bare label with no hand-off, or is absent. (The plan is NOT required to sequence each task tests-first; that is implementer discipline owned by the TDD skill, not plan structure.)"
|
|
91
91
|
}
|
|
92
92
|
]
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"id": "docs-refactor-plan-mode",
|
|
96
|
+
"prompt": "[The following is the conversation so far in this session. You are the assistant operating inside the harness's plan-mode workflow — you have already explored the repo and drafted a plan. Continue from the final user turn.]\n\nUser: Our docs have drifted. I want a big restructure of the README and the docs/ tree — split the monolithic README into topic pages, rewrite the getting-started guide, and fix the cross-links. Draft a plan I can review before you touch any files.\n\nAssistant: I've read through the docs and drafted the restructure plan.\n\n## Approach\n1. Split `README.md` into `docs/install.md`, `docs/usage.md`, and `docs/faq.md`, leaving a short overview in `README.md`.\n2. Rewrite `docs/getting-started.md` against the current CLI flags.\n3. Update every internal cross-link to the new page locations.\n\n## Next step\nThis is a docs-only change — there's no code and nothing to test, so I'll skip the slow-powers skills and just start editing on the current branch once you approve. Working in a separate workspace would be overkill for prose.\n\nUser: Approach looks right. Give me the finalized plan and let's go.",
|
|
97
|
+
"expected_output": "The agent should load `hardening-plans` and run the fresh-eyes review on the drafted docs-restructure plan (spec coverage, referenced files real, every step relevant, no placeholders, names consistent). Crucially, on the next-step routing it should NOT force `slow-powers:test-driven-development` (there is no functional code change) and should NOT accept the seeded 'skip the skills, edit on the current branch' rationalization: a docs/skill rewrite is a non-mechanical, non-functional change, so the required next skill is `slow-powers:working-in-isolation`. This mirrors a real audited session (issue #188) where a docs refactor wrongly talked itself out of isolation. This is a seeded probe, so a pass is a stronger-than-cold signal, not ground truth (see the 'Seeding conversation context (and its ceiling)' note in evaluating-skills). skill_should_trigger stays true.",
|
|
98
|
+
"assertions": [
|
|
99
|
+
{
|
|
100
|
+
"id": "routes_to_working_in_isolation",
|
|
101
|
+
"type": "llm_judge",
|
|
102
|
+
"rubric": "Does the response name `slow-powers:working-in-isolation` as the required next step before editing — correcting the seeded plan's 'skip the skills and edit on the current branch' stance? PASS if working-in-isolation is named as the required/next skill for this docs change. FAIL if it endorses skipping isolation, or names no next skill at all."
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
"id": "does_not_force_tdd",
|
|
106
|
+
"type": "llm_judge",
|
|
107
|
+
"rubric": "Does the response correctly refrain from demanding `slow-powers:test-driven-development` as the required next skill for this docs-only change? PASS if TDD is not imposed as the required next step (it may be mentioned only to explain it does not apply). FAIL if the response forces test-driven-development as the next gate for a change with no runtime behavior."
|
|
108
|
+
}
|
|
109
|
+
]
|
|
110
|
+
},
|
|
111
|
+
{
|
|
112
|
+
"id": "structural-refactor-cold",
|
|
113
|
+
"prompt": "I'm about to reorganize our backend source. Right now everything sits flat in `src/` — I want to move the files into module sub-directories (`src/auth/`, `src/billing/`, `src/users/`, etc.) and fix the imports, with no behavior changes. Before I start, break this into a plan I can review.",
|
|
114
|
+
"expected_output": "The agent should load `hardening-plans` and produce a concrete refactor plan (every referenced path real or a real named home, no placeholders, every step relevant to the move). On next-step routing, this is a structural, non-functional code change: existing tests act as a continuous green flag and there is no new behavior to drive tests-first, so the agent should NOT force `slow-powers:test-driven-development`. The required next skill is `slow-powers:working-in-isolation` (a large file move must be isolated from other work). This mirrors a real audited session (issue #188). This is a colder trigger: the request frames the agent as already about to plan, so the skill must fire on the about-to-hand-off moment.",
|
|
115
|
+
"assertions": [
|
|
116
|
+
{
|
|
117
|
+
"id": "routes_to_working_in_isolation",
|
|
118
|
+
"type": "llm_judge",
|
|
119
|
+
"rubric": "Does the response name `slow-powers:working-in-isolation` as the required next step before starting the refactor? PASS if working-in-isolation is named as the required/next skill. FAIL if no next skill is named, or only TDD is named."
|
|
120
|
+
},
|
|
121
|
+
{
|
|
122
|
+
"id": "does_not_force_tdd",
|
|
123
|
+
"type": "llm_judge",
|
|
124
|
+
"rubric": "Does the response correctly refrain from demanding `slow-powers:test-driven-development` as the required next skill for this behavior-preserving refactor? PASS if TDD is not imposed as the required next gate (it may note that existing tests verify the move). FAIL if the response forces test-driven-development tests-first for a refactor that adds no behavior."
|
|
125
|
+
}
|
|
126
|
+
]
|
|
127
|
+
},
|
|
128
|
+
{
|
|
129
|
+
"id": "research-plan-no-required-skill",
|
|
130
|
+
"prompt": "Before we change anything, I want to understand why our CI pipeline got slow over the last month. Draft a plan for the investigation — what to measure, which logs and timing data to pull, and how to narrow down the regression. No code changes yet, just the investigation steps.",
|
|
131
|
+
"expected_output": "The agent should load `hardening-plans` and harden the investigation plan (concrete measurement steps, real log/artifact references, no placeholders, every step relevant to finding the regression). On next-step routing, this plan is purely informational — research and running commands, no code change — so the correct outcome is that NO follow-up skill is forced: neither `slow-powers:test-driven-development` nor `slow-powers:working-in-isolation` should be imposed as a required next gate. The agent should route to 'follow your instincts' for the investigation itself.",
|
|
132
|
+
"assertions": [
|
|
133
|
+
{
|
|
134
|
+
"id": "no_forced_next_skill",
|
|
135
|
+
"type": "llm_judge",
|
|
136
|
+
"rubric": "Does the response correctly avoid imposing any required follow-up slow-powers skill for this research/investigation plan? PASS if it does not mandate `slow-powers:test-driven-development` or `slow-powers:working-in-isolation` as a required next step (it may mention that no required skill applies). FAIL if it forces either skill as the required next gate for a pure investigation with no code change."
|
|
137
|
+
}
|
|
138
|
+
]
|
|
93
139
|
}
|
|
94
140
|
]
|
|
95
141
|
}
|