@slowdini/slow-powers-opencode 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -26,7 +26,7 @@ Slow-powers guides agents through an evidence-backed, no-guess debugging approac
26
26
 
27
27
  Skills for writing skills! Slow-powers skills are all written and evaluated following the same guidelines and processes that Slow-powers itself ships. Back up your own skills with real stats, and understand their cost in time and tokens.
28
28
 
29
- Skill evaluations are powered by [@slowdini/eval-runner](https://github.com/slowdini/eval-runner)
29
+ Skill evaluations are powered by [eval-magic](https://github.com/slowdini/eval-magic)
30
30
 
31
31
  ## Installation
32
32
 
@@ -23,11 +23,18 @@ const bootstrapLeadingPhrase = "<EXTREMELY-IMPORTANT>";
23
23
  // once eliminates redundant fs work on every agent step.
24
24
  let _bootstrapCache; // undefined = not yet loaded, null = file missing
25
25
 
26
- // Tracks plan files we've already sent the hardening prompt for.
27
- // Once we ask the agent to harden a plan, we never ask again for that file.
28
- const hardeningPromptSentFor = new Set();
29
-
30
26
  export const SlowPowersPlugin = async ({ client, directory: _directory }) => {
27
+ // Tracks plan files we've already sent the hardening prompt for, keyed by
28
+ // `${sessionID}:${filePath}` so different sessions with the same plan path
29
+ // still get prompted. Scoped to the plugin instance (one per opencode process).
30
+ const hardeningPromptSentFor = new Set();
31
+
32
+ const log = (level, message) => {
33
+ client.app
34
+ .log({ body: { service: "slow-powers", level, message } })
35
+ .catch(() => {});
36
+ };
37
+
31
38
  // Helper to load bootstrap content (cached after first call)
32
39
  const getBootstrapContent = () => {
33
40
  if (_bootstrapCache !== undefined) return _bootstrapCache;
@@ -46,23 +53,30 @@ export const SlowPowersPlugin = async ({ client, directory: _directory }) => {
46
53
  const filePath = event.properties.file;
47
54
  const sessionID = event.properties.sessionID;
48
55
 
49
- if (!filePath || !sessionID) return;
50
-
51
- if (!filePath.match(/\.opencode\/plans\/.*\.md$/)) return;
56
+ if (!filePath || !sessionID) {
57
+ log("debug", `[hardening] skipped: missing filePath or sessionID`);
58
+ return;
59
+ }
52
60
 
53
- let session;
54
- try {
55
- session = await client.session.get({ path: { id: sessionID } });
56
- } catch {
61
+ if (!filePath.match(/\.opencode\/plans\/.*\.md$/)) {
62
+ log("debug", `[hardening] skipped: ${filePath} not in .opencode/plans/`);
57
63
  return;
58
64
  }
59
- if (session.agent !== "plan") return;
60
65
 
61
- // Only prompt once per plan file. After we've asked the agent to harden
62
- // it, we trust them to do so or not; re-prompting causes loops.
63
- if (hardeningPromptSentFor.has(filePath)) return;
66
+ const promptKey = `${sessionID}:${filePath}`;
67
+
68
+ // Only prompt once per plan file per session. After we've asked the agent
69
+ // to harden it, we trust them to do so or not; re-prompting causes loops.
70
+ if (hardeningPromptSentFor.has(promptKey)) {
71
+ log("debug", `[hardening] skipped: already prompted for ${promptKey}`);
72
+ return;
73
+ }
64
74
 
65
- hardeningPromptSentFor.add(filePath);
75
+ hardeningPromptSentFor.add(promptKey);
76
+ log(
77
+ "info",
78
+ `[hardening] prompting agent to harden ${filePath} in session ${sessionID}`,
79
+ );
66
80
 
67
81
  try {
68
82
  await client.session.prompt({
@@ -78,8 +92,8 @@ export const SlowPowersPlugin = async ({ client, directory: _directory }) => {
78
92
  },
79
93
  });
80
94
  } catch (err) {
81
- hardeningPromptSentFor.delete(filePath);
82
- console.error("[slow-powers] Failed to trigger hardening-plans:", err);
95
+ hardeningPromptSentFor.delete(promptKey);
96
+ log("error", `[hardening] failed to trigger hardening-plans: ${err}`);
83
97
  }
84
98
  };
85
99
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@slowdini/slow-powers-opencode",
3
- "version": "0.4.2",
3
+ "version": "0.4.4",
4
4
  "description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
5
5
  "type": "module",
6
6
  "main": "./opencode/plugins/slow-powers.js",
@@ -36,19 +36,19 @@
36
36
  },
37
37
  "scripts": {
38
38
  "test": "bun test --path-ignore-patterns='skills-workspace/**'",
39
- "evals": "bunx @slowdini/eval-runner run --skill-dir ./skills --bootstrap ./bootstrap.md",
40
- "evals:snapshot": "bunx @slowdini/eval-runner snapshot --skill-dir ./skills",
41
- "evals:validate": "bunx @slowdini/eval-runner validate --skill-dir ./skills",
42
- "evals:ingest": "bunx @slowdini/eval-runner ingest --skill-dir ./skills",
43
- "evals:finalize": "bunx @slowdini/eval-runner finalize --skill-dir ./skills",
44
- "evals:record-runs": "bunx @slowdini/eval-runner record-runs --skill-dir ./skills",
45
- "evals:fill-transcripts": "bunx @slowdini/eval-runner fill-transcripts --skill-dir ./skills",
46
- "evals:detect-stray-writes": "bunx @slowdini/eval-runner detect-stray-writes --skill-dir ./skills",
47
- "evals:teardown-guard": "bunx @slowdini/eval-runner teardown-guard --skill-dir ./skills",
48
- "evals:teardown": "bunx @slowdini/eval-runner teardown --skill-dir ./skills",
49
- "evals:grade": "bunx @slowdini/eval-runner grade --skill-dir ./skills",
50
- "evals:aggregate": "bunx @slowdini/eval-runner aggregate --skill-dir ./skills",
51
- "evals:promote-baseline": "bunx @slowdini/eval-runner promote-baseline --skill-dir ./skills",
39
+ "evals": "skill-eval run --skill-dir ./skills --bootstrap ./bootstrap.md",
40
+ "evals:snapshot": "skill-eval snapshot --skill-dir ./skills",
41
+ "evals:validate": "skill-eval validate --skill-dir ./skills",
42
+ "evals:ingest": "skill-eval ingest --skill-dir ./skills",
43
+ "evals:finalize": "skill-eval finalize --skill-dir ./skills",
44
+ "evals:record-runs": "skill-eval record-runs --skill-dir ./skills",
45
+ "evals:fill-transcripts": "skill-eval fill-transcripts --skill-dir ./skills",
46
+ "evals:detect-stray-writes": "skill-eval detect-stray-writes --skill-dir ./skills",
47
+ "evals:teardown-guard": "skill-eval teardown-guard --skill-dir ./skills",
48
+ "evals:teardown": "skill-eval teardown --skill-dir ./skills",
49
+ "evals:grade": "skill-eval grade --skill-dir ./skills",
50
+ "evals:aggregate": "skill-eval aggregate --skill-dir ./skills",
51
+ "evals:promote-baseline": "skill-eval promote-baseline --skill-dir ./skills",
52
52
  "check": "biome check --write .",
53
53
  "check:ci": "biome check --error-on-warnings .",
54
54
  "typecheck": "tsc --noEmit",
@@ -5,7 +5,7 @@ description: Use when testing whether a new skill improves agent behavior, or wh
5
5
 
6
6
  # Evaluating Skills
7
7
 
8
- Skill development has two phases: **drafting** (`slow-powers:writing-skills`) and **evaluation** (this skill). This skill owns the *craft* of evaluation — deciding whether a change needs measuring, designing test cases, devising pressure-testing scenarios, writing assertions, and reading results. The *mechanics* of actually running an eval — building the workspace, staging skills, dispatching subagents, grading, aggregating — are owned by a dedicated tool, **[`@slowdini/eval-runner`](https://www.npmjs.com/package/@slowdini/eval-runner)**, run via `bunx @slowdini/eval-runner`. See [Running the eval](#running-the-eval) for the hand-off.
8
+ Skill development has two phases: **drafting** (`slow-powers:writing-skills`) and **evaluation** (this skill). This skill owns the *craft* of evaluation — deciding whether a change needs measuring, designing test cases, devising pressure-testing scenarios, writing assertions, and reading results. The *mechanics* of actually running an eval — building the workspace, staging skills, dispatching subagents, grading, aggregating — are owned by a dedicated tool, **[eval-magic](https://github.com/slowdini/eval-magic)**, which ships as a dependency-less prebuilt binary you invoke as `skill-eval`. See [Running the eval](#running-the-eval) for the hand-off.
9
9
 
10
10
  ## Overview
11
11
 
@@ -95,7 +95,7 @@ A test case has these parts:
95
95
  - **files** (optional): fixture files the prompt references
96
96
  - **skill_should_trigger** (optional, default `true`): set `false` for a *negative* eval where correct behavior is the skill **not** firing (e.g. an over-trigger guard — a feature request that shouldn't launch a debugging investigation). Negative evals are excluded from the skill-invocation rate, so a correct non-invocation isn't mistaken for the skill failing to fire.
97
97
 
98
- Cases live in `<skill>/evals/evals.json`. For the file shape, see the bare scaffold the runner ships (`templates/evals.json.example` in `@slowdini/eval-runner`); for worked, maintained examples, read the live suites in this repo — e.g. `skills/verifying-development-work/evals/evals.json` and `skills/hardening-plans/evals/evals.json`.
98
+ Cases live in `<skill>/evals/evals.json`. For the file shape, see the author-template example in the eval-magic README and validate against the bundled schema with `skill-eval validate`; for worked, maintained examples, read the live suites in this repo — e.g. `skills/verifying-development-work/evals/evals.json` and `skills/hardening-plans/evals/evals.json`.
99
99
 
100
100
  Tips for writing good prompts:
101
101
 
@@ -142,7 +142,7 @@ Keep the seeded turns short and concrete; the point is to establish momentum, no
142
142
 
143
143
  **The ceiling — state it plainly.** A seed is *text the subagent reads*, not a state it operates under. It cannot place the agent in a harness-injected mode — a real plan mode, an enforced multi-phase workflow, genuine context-window pressure — it can only *describe* one. So when the wild failure you're chasing was *caused* by such a mode (the documented case: an agent in plan mode that invoked **zero** skills because the mode's own procedure made loading them feel redundant), a text seed cannot fully reproduce it — the causal layer is exactly the one a prompt string can't inject. A seeded **pass is therefore necessary but not sufficient** — it under-estimates real-session difficulty — and a seed that *fails* to reproduce a known wild failure is usually hitting this ceiling, not testing a bad seed. Treat seeded results as a stronger-than-cold signal, not as ground truth, and don't let downstream work over-trust them.
144
144
 
145
- **Narrowing the gap — `--plan-mode`.** For the documented plan-mode case, the runner offers the highest-fidelity in-runner approximation: its `--plan-mode` flag injects the harness's *verbatim* plan-mode procedure into every dispatch as an operating-context layer the subagent is told it is operating under, rather than a paraphrase the agent merely reads in the seed prose. This narrows the gap (verbatim procedure > paraphrase) but does **not** close it: it is still text the agent reads, not an injected mode, so the necessary-not-sufficient ceiling above stands unchanged. Use it as the strongest in-runner signal and pair it with a paraphrase-seed arm. See the runner's docs for the flag and the per-harness profiles it depends on.
145
+ **Narrowing the gap — `--plan-mode`.** For the documented plan-mode case, the runner offers the highest-fidelity in-runner approximation: its `--plan-mode` flag injects the harness's *verbatim* plan-mode procedure into every dispatch as an operating-context layer the subagent is told it is operating under, rather than a paraphrase the agent merely reads in the seed prose. This narrows the gap (verbatim procedure > paraphrase) but does **not** close it: it is still text the agent reads, not an injected mode, so the necessary-not-sufficient ceiling above stands unchanged. Use it as the strongest in-runner signal and pair it with a paraphrase-seed arm. See `skill-eval run --help` for the flag and the per-harness profiles it depends on.
146
146
 
147
147
  ## Writing assertions
148
148
 
@@ -186,19 +186,19 @@ Once a run is graded and aggregated, the headline is the **delta**: what the ski
186
186
 
187
187
  ## Running the eval
188
188
 
189
- The mechanics of executing a run live in **[`@slowdini/eval-runner`](https://www.npmjs.com/package/@slowdini/eval-runner)** — `bunx @slowdini/eval-runner`.
189
+ The mechanics of executing a run live in **[eval-magic](https://github.com/slowdini/eval-magic)** — the `skill-eval` binary. eval-magic's README is the complete operating guide, and every flag is documented in the tool's own help.
190
190
 
191
191
  | Need | Where |
192
192
  |------|-------|
193
- | Quickstart, install, the two modes end-to-end | the package README |
194
- | Every subcommand and flag; the `--skill-dir` model; workspace layout | `docs/cli.md` |
195
- | Full run mechanics: dispatch loop, transcript access, grading, aggregating, baselines | `docs/methodology.md` |
196
- | Claude Code operator walkthrough (isolating from installed plugins, the guard, judging) | `docs/harness-claude-code.md` |
193
+ | Quickstart, install, the two modes end-to-end | the eval-magic README |
194
+ | Every subcommand and flag; the `--skill-dir` model; workspace layout | `skill-eval --help` and `skill-eval <subcommand> --help` |
195
+ | Full run mechanics: dispatch loop, transcript access, grading, aggregating, baselines | the eval-magic README |
196
+ | Claude Code & Codex harness specifics — isolating from installed plugins, the guard, judging | the README's Harnesses section |
197
197
  | What a harness needs to reach Claude-Code-tier support | `docs/harness-parity.md` |
198
198
 
199
199
  ## See also
200
200
 
201
201
  - `slow-powers:writing-skills` — drafting a skill (Phase 1)
202
202
  - `pressure-scenarios.md` — pressure-scenario taxonomy for authoring prompts that stress discipline-enforcing skills
203
- - `@slowdini/eval-runner` the tool that runs the evals this skill teaches you to author
203
+ - eval-magic (the `skill-eval` tool) runs the evals this skill teaches you to author
204
204
  - agentskills.io/skill-creation/evaluating-skills — the methodology this skill is derived from