@slowdini/slow-powers-opencode 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +37 -65
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -13
  5. package/skills/evaluating-skills/SKILL.md +91 -337
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/verifying-development-work/SKILL.md +17 -6
  17. package/skills/verifying-development-work/code-review.md +68 -0
  18. package/skills/verifying-development-work/comment-review.md +85 -0
  19. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  20. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  21. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  22. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  23. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  26. package/skills/verifying-development-work/evals/evals.json +34 -2
  27. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  28. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  29. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  30. package/skills/evaluating-skills/harness-details/claude.md +0 -158
  31. package/skills/evaluating-skills/runner/README.md +0 -154
  32. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  33. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  34. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
  35. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
  36. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
  37. package/skills/evaluating-skills/runner/aggregate.ts +0 -248
  38. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  39. package/skills/evaluating-skills/runner/context.ts +0 -90
  40. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
  41. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
  42. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  43. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  44. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  45. package/skills/evaluating-skills/runner/grade.ts +0 -603
  46. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  47. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  48. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  49. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
  50. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  51. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  52. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  53. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  54. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
  55. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
  56. package/skills/evaluating-skills/runner/run.test.ts +0 -1180
  57. package/skills/evaluating-skills/runner/run.ts +0 -1029
  58. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
  59. package/skills/evaluating-skills/runner/types.ts +0 -112
  60. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  61. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  62. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  63. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  64. package/skills/evaluating-skills/runner/validate.ts +0 -21
  65. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  66. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  67. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  68. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
  69. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
  70. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  71. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  72. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
  73. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  74. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  75. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  76. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  77. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  78. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  79. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  80. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  81. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  82. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -1,192 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join } from "node:path";
4
- import { detectRunContext } from "./context";
5
- import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
6
- import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
7
- import { validateAgainstSchema } from "./validate-schema";
8
-
9
- function die(msg: string): never {
10
- console.error(`error: ${msg}`);
11
- process.exit(1);
12
- }
13
-
14
- export type StrayFinding = {
15
- tool: string;
16
- path?: string;
17
- command?: string;
18
- ordinal: number;
19
- reason: string;
20
- };
21
-
22
- export type RunFindings = {
23
- violations: StrayFinding[];
24
- warnings: StrayFinding[];
25
- };
26
-
27
- /**
28
- * Classify a run's tool invocations against its allowed outputs dir.
29
- *
30
- * - `violations`: file-write tools (Write/Edit/MultiEdit/NotebookEdit) whose
31
- * target path resolves outside `outputsDir`. High confidence — a run that
32
- * edits the real repo is a tainted data point.
33
- * - `warnings`: Bash commands matching a mutating pattern that don't reference
34
- * `outputsDir`. Heuristic — review before trusting.
35
- *
36
- * Relative paths resolve against `repoRoot` (the subagent's working dir);
37
- * Claude Code's write tools use absolute paths, so this is a best-effort
38
- * fallback only.
39
- */
40
- export function detectStrayWrites(
41
- invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
42
- outputsDir: string,
43
- repoRoot: string,
44
- ): RunFindings {
45
- const violations: StrayFinding[] = [];
46
- const warnings: StrayFinding[] = [];
47
-
48
- for (const inv of invocations) {
49
- if (WRITE_TOOLS.has(inv.name)) {
50
- const p = pathArg(inv.args);
51
- if (p && !isUnder(p, outputsDir, repoRoot)) {
52
- violations.push({
53
- tool: inv.name,
54
- path: p,
55
- ordinal: inv.ordinal,
56
- reason: "writes outside the run's outputs dir",
57
- });
58
- }
59
- continue;
60
- }
61
-
62
- if (inv.name === "Bash") {
63
- const args = inv.args as { command?: unknown } | undefined;
64
- const command = typeof args?.command === "string" ? args.command : "";
65
- const reason = classifyBash(command, [outputsDir]);
66
- if (reason)
67
- warnings.push({ tool: "Bash", command, ordinal: inv.ordinal, reason });
68
- }
69
- }
70
-
71
- return { violations, warnings };
72
- }
73
-
74
- if (import.meta.main) {
75
- const argv = Bun.argv.slice(2);
76
- const flag = (name: string): string | undefined => {
77
- const i = argv.indexOf(`--${name}`);
78
- return i === -1 ? undefined : argv[i + 1];
79
- };
80
- const iteration = flag("iteration");
81
- if (!iteration) die("missing --iteration");
82
- const ctx = detectRunContext(argv);
83
-
84
- const iterationDir = join(
85
- ctx.workspaceRoot,
86
- ctx.skillName,
87
- `iteration-${iteration}`,
88
- );
89
- if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);
90
-
91
- const conditionsPath = join(iterationDir, "conditions.json");
92
- if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
93
- const conditions: ConditionsRecord = JSON.parse(
94
- readFileSync(conditionsPath, "utf8"),
95
- );
96
- const conditionNames = conditions.conditions.map((c) => c.name);
97
-
98
- // dispatch.json carries the authoritative outputs_dir per task; fall back to
99
- // the conventional <condDir>/outputs when it's absent (hand-authored runs).
100
- const dispatchPath = join(iterationDir, "dispatch.json");
101
- const outputsByKey = new Map<string, string>();
102
- if (existsSync(dispatchPath)) {
103
- try {
104
- const dispatch = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
105
- tasks?: Array<{
106
- eval_id: string;
107
- condition: string;
108
- outputs_dir?: string;
109
- }>;
110
- };
111
- for (const t of dispatch.tasks ?? []) {
112
- if (t.outputs_dir)
113
- outputsByKey.set(`${t.eval_id}:${t.condition}`, t.outputs_dir);
114
- }
115
- } catch {
116
- // fall through to convention
117
- }
118
- }
119
-
120
- const repoRoot = process.cwd();
121
- const evalDirs = readdirSync(iterationDir).filter((d) =>
122
- d.startsWith("eval-"),
123
- );
124
-
125
- type RunReport = {
126
- eval_id: string;
127
- condition: string;
128
- violations: StrayFinding[];
129
- warnings: StrayFinding[];
130
- };
131
- const runs: RunReport[] = [];
132
- let totalViolations = 0;
133
- let totalWarnings = 0;
134
-
135
- for (const evalDir of evalDirs) {
136
- const evalId = evalDir.replace(/^eval-/, "");
137
- for (const cond of conditionNames) {
138
- const condDir = join(iterationDir, evalDir, cond);
139
- const runPath = join(condDir, "run.json");
140
- if (!existsSync(runPath)) continue;
141
- const run = validateAgainstSchema<RunRecord>(
142
- "run-record",
143
- JSON.parse(readFileSync(runPath, "utf8")),
144
- runPath,
145
- );
146
- const invocations = Array.isArray(run.tool_invocations)
147
- ? run.tool_invocations
148
- : [];
149
- const outputsDir =
150
- outputsByKey.get(`${evalId}:${cond}`) ?? join(condDir, "outputs");
151
- const findings = detectStrayWrites(invocations, outputsDir, repoRoot);
152
- if (findings.violations.length || findings.warnings.length) {
153
- runs.push({
154
- eval_id: evalId,
155
- condition: cond,
156
- violations: findings.violations,
157
- warnings: findings.warnings,
158
- });
159
- }
160
- totalViolations += findings.violations.length;
161
- totalWarnings += findings.warnings.length;
162
- }
163
- }
164
-
165
- const report = {
166
- generated: new Date().toISOString(),
167
- iteration: Number(iteration),
168
- totals: { violations: totalViolations, warnings: totalWarnings },
169
- runs,
170
- };
171
- const outPath = join(iterationDir, "stray-writes.json");
172
- validateAgainstSchema("stray-writes", report, outPath);
173
- writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);
174
- console.log(`Wrote ${outPath}`);
175
-
176
- for (const r of runs) {
177
- for (const v of r.violations)
178
- console.warn(
179
- `✗ ${r.eval_id}/${r.condition}: ${v.tool} wrote outside outputs dir → ${v.path} (ordinal ${v.ordinal})`,
180
- );
181
- for (const w of r.warnings)
182
- console.warn(
183
- `⚠ ${r.eval_id}/${r.condition}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
184
- );
185
- }
186
- if (totalViolations === 0 && totalWarnings === 0)
187
- console.log("✓ No out-of-bounds writes detected.");
188
- else
189
- console.warn(
190
- `\n${totalViolations} violation(s), ${totalWarnings} warning(s). Runs with violations edited files outside their sandbox — treat those data points as tainted.`,
191
- );
192
- }
@@ -1,73 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import { mkdirSync, rmSync, writeFileSync } from "node:fs";
3
- import { tmpdir } from "node:os";
4
- import { join } from "node:path";
5
- import { resolveAgentDescription } from "./fill-transcripts";
6
-
7
- const ROOT = join(tmpdir(), `fill-transcripts-test-${process.pid}`);
8
-
9
- beforeAll(() => mkdirSync(ROOT, { recursive: true }));
10
- afterAll(() => rmSync(ROOT, { recursive: true, force: true }));
11
-
12
- function writeDispatch(iterationDir: string, tasks: unknown[]) {
13
- mkdirSync(iterationDir, { recursive: true });
14
- writeFileSync(
15
- join(iterationDir, "dispatch.json"),
16
- JSON.stringify({ run_nonce: "abc123", tasks }, null, 2),
17
- );
18
- }
19
-
20
- describe("resolveAgentDescription", () => {
21
- test("returns the namespaced agent_description from dispatch.json", () => {
22
- const dir = join(ROOT, "iter-canonical");
23
- writeDispatch(dir, [
24
- {
25
- eval_id: "crash",
26
- condition: "with_skill",
27
- agent_description: "crash:with_skill:i3-abc123",
28
- },
29
- {
30
- eval_id: "crash",
31
- condition: "without_skill",
32
- agent_description: "crash:without_skill:i3-abc123",
33
- },
34
- ]);
35
- expect(resolveAgentDescription(dir, "crash", "with_skill")).toBe(
36
- "crash:with_skill:i3-abc123",
37
- );
38
- expect(resolveAgentDescription(dir, "crash", "without_skill")).toBe(
39
- "crash:without_skill:i3-abc123",
40
- );
41
- });
42
-
43
- test("falls back to legacy reconstruction when dispatch.json is absent", () => {
44
- const dir = join(ROOT, "iter-no-dispatch");
45
- mkdirSync(dir, { recursive: true });
46
- expect(resolveAgentDescription(dir, "crash", "with_skill")).toBe(
47
- "crash:with_skill",
48
- );
49
- });
50
-
51
- test("falls back when the task is missing from dispatch.json", () => {
52
- const dir = join(ROOT, "iter-partial");
53
- writeDispatch(dir, [
54
- {
55
- eval_id: "other",
56
- condition: "with_skill",
57
- agent_description: "other:with_skill:i1-x",
58
- },
59
- ]);
60
- expect(resolveAgentDescription(dir, "crash", "with_skill")).toBe(
61
- "crash:with_skill",
62
- );
63
- });
64
-
65
- test("falls back when dispatch.json is malformed", () => {
66
- const dir = join(ROOT, "iter-malformed");
67
- mkdirSync(dir, { recursive: true });
68
- writeFileSync(join(dir, "dispatch.json"), "{ not valid json");
69
- expect(resolveAgentDescription(dir, "crash", "with_skill")).toBe(
70
- "crash:with_skill",
71
- );
72
- });
73
- });
@@ -1,154 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join } from "node:path";
4
- import * as claudeAdapter from "./adapters/claude-code-transcript";
5
- import { detectRunContext } from "./context";
6
- import type { ConditionsRecord, RunRecord } from "./types";
7
- import { validateAgainstSchema } from "./validate-schema";
8
-
9
- function die(msg: string): never {
10
- console.error(`error: ${msg}`);
11
- process.exit(1);
12
- }
13
-
14
- type DispatchTaskRef = {
15
- eval_id: string;
16
- condition: string;
17
- agent_description?: string;
18
- };
19
-
20
- /**
21
- * The canonical dispatch description for an (eval, condition) run.
22
- *
23
- * The runner writes a unique `agent_description` per task into `dispatch.json`
24
- * (namespaced with the iteration + run nonce). Reading it back — rather than
25
- * reconstructing `<eval_id>:<condition>` — is what binds each run to the exact
26
- * agent that produced it, even when one parent session's shared subagents dir
27
- * holds colliding descriptions from other iterations. Falls back to the legacy
28
- * reconstruction when dispatch.json is absent (hand-authored/operator runs).
29
- */
30
- export function resolveAgentDescription(
31
- iterationDir: string,
32
- evalId: string,
33
- condition: string,
34
- ): string {
35
- const dispatchPath = join(iterationDir, "dispatch.json");
36
- if (existsSync(dispatchPath)) {
37
- try {
38
- const dispatch = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
39
- tasks?: DispatchTaskRef[];
40
- };
41
- const task = dispatch.tasks?.find(
42
- (t) => t.eval_id === evalId && t.condition === condition,
43
- );
44
- if (task?.agent_description) return task.agent_description;
45
- } catch {
46
- // fall through to legacy reconstruction
47
- }
48
- }
49
- return `${evalId}:${condition}`;
50
- }
51
-
52
- function parseArgs(argv: string[]) {
53
- const flag = (name: string): string | undefined => {
54
- const i = argv.indexOf(`--${name}`);
55
- if (i === -1) return undefined;
56
- return argv[i + 1];
57
- };
58
- const has = (name: string) => argv.includes(`--${name}`);
59
- const iteration = flag("iteration");
60
- const subagentsDir = flag("subagents-dir");
61
- const overwrite = has("overwrite");
62
- if (!iteration) die("missing --iteration");
63
- if (!subagentsDir)
64
- die(
65
- "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
66
- );
67
- return { iteration, subagentsDir, overwrite };
68
- }
69
-
70
- if (import.meta.main) {
71
- const fillArgv = Bun.argv.slice(2);
72
- const { iteration, subagentsDir, overwrite } = parseArgs(fillArgv);
73
- const fillCtx = detectRunContext(fillArgv);
74
- const skill = fillCtx.skillName;
75
-
76
- if (!existsSync(subagentsDir))
77
- die(`subagents-dir not found: ${subagentsDir}`);
78
-
79
- const adapter = claudeAdapter;
80
- console.log("Using harness transcript adapter: claude-code");
81
-
82
- const iterationDir = join(
83
- fillCtx.workspaceRoot,
84
- skill,
85
- `iteration-${iteration}`,
86
- );
87
- if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);
88
-
89
- const conditionsPath = join(iterationDir, "conditions.json");
90
- if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
91
- const conditions: ConditionsRecord = JSON.parse(
92
- readFileSync(conditionsPath, "utf8"),
93
- );
94
- const conditionNames = conditions.conditions.map((c) => c.name);
95
-
96
- const evalDirs = readdirSync(iterationDir).filter((d) =>
97
- d.startsWith("eval-"),
98
- );
99
-
100
- let filled = 0;
101
- let skipped = 0;
102
- let missing = 0;
103
-
104
- for (const evalDir of evalDirs) {
105
- const evalId = evalDir.replace(/^eval-/, "");
106
- for (const cond of conditionNames) {
107
- const condDir = join(iterationDir, evalDir, cond);
108
- const runPath = join(condDir, "run.json");
109
- if (!existsSync(runPath)) continue;
110
-
111
- const run = validateAgainstSchema<RunRecord>(
112
- "run-record",
113
- JSON.parse(readFileSync(runPath, "utf8")),
114
- runPath,
115
- );
116
- const existing = Array.isArray(run.tool_invocations)
117
- ? run.tool_invocations
118
- : [];
119
- if (existing.length > 0 && !overwrite) {
120
- console.log(
121
- `skip ${evalId}/${cond}: already has ${existing.length} tool_invocations (use --overwrite to replace)`,
122
- );
123
- skipped++;
124
- continue;
125
- }
126
-
127
- const description = resolveAgentDescription(iterationDir, evalId, cond);
128
- const subagent = adapter.findByDescription(subagentsDir, description);
129
- if (!subagent) {
130
- console.warn(
131
- `miss ${evalId}/${cond}: no subagent transcript with description='${description}'`,
132
- );
133
- missing++;
134
- continue;
135
- }
136
-
137
- const invocations = adapter.parseTranscript(subagent.jsonlPath);
138
- run.tool_invocations = invocations;
139
- writeFileSync(runPath, `${JSON.stringify(run, null, 2)}\n`);
140
- console.log(
141
- `fill ${evalId}/${cond}: wrote ${invocations.length} tool_invocations from ${subagent.jsonlPath}`,
142
- );
143
- filled++;
144
- }
145
- }
146
-
147
- console.log(
148
- `\nFilled: ${filled}, skipped (already populated): ${skipped}, missing transcript: ${missing}`,
149
- );
150
- if (missing > 0)
151
- console.warn(
152
- "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json (or dispatch.json is absent and the legacy `eval-id:condition` reconstruction found no match). transcript_check assertions for those runs will be graded unverifiable.",
153
- );
154
- }