@slowdini/slow-powers-opencode 0.1.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -13
- package/package.json +5 -1
- package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +22 -20
- package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/pressure-scenarios.md +1 -1
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/hardening-plans/SKILL.md +1 -1
- package/skills/systematic-debugging/SKILL.md +4 -0
- package/skills/test-driven-development/SKILL.md +2 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +99 -0
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
- package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
- package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +178 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
- package/skills/working-in-isolation/SKILL.md +2 -2
- package/skills/writing-skills/SKILL.md +2 -3
- package/skills/finishing-a-development-branch/SKILL.md +0 -96
- package/skills/finishing-a-development-branch/evals/evals.json +0 -41
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
- package/skills/verification-before-completion/SKILL.md +0 -65
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
- package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
- package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/evals.json +0 -77
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import {
|
|
5
|
+
findByDescription,
|
|
6
|
+
parseTranscriptFull,
|
|
7
|
+
} from "./adapters/claude-code-transcript";
|
|
8
|
+
import { detectRunContext } from "./context";
|
|
9
|
+
import type { RunRecord, TimingRecord } from "./types";
|
|
10
|
+
import { validateAgainstSchema } from "./validate-schema";
|
|
11
|
+
|
|
12
|
+
/**
 * Reports a fatal CLI error and terminates the process.
 *
 * Writes `error: <msg>` via `console.error`, then exits with status 1. The
 * `never` return type lets callers use it as a narrowing guard
 * (`if (!x) die("…")`).
 *
 * @param msg Human-readable description of what went wrong.
 */
function die(msg: string): never {
  const line = "error: " + msg;
  console.error(line);
  return process.exit(1);
}
|
|
16
|
+
|
|
17
|
+
/**
 * One task entry of `dispatch.json`, as record-runs consumes it (see
 * DispatchTask in run.ts — `dispatch_prompt` is stripped from the serialized
 * file).
 */
interface DispatchTask {
  /** Eval identifier; copied into `run.json` and used in the log slot label. */
  eval_id: string;
  /** Condition label; copied into `run.json` and used in the log slot label. */
  condition: string;
  /** Skill path for this condition, or `null`; copied into `run.json`. */
  skill_path: string | null;
  /** Becomes the `prompt` field of `run.json`. */
  user_prompt: string;
  /** Becomes the `files` field of `run.json`. */
  fixtures: string[];
  /** Directory in which `final-message.md` is looked up. */
  outputs_dir: string;
  /** Destination path of the assembled `run.json`. */
  run_record_path: string;
  /** Destination path of the backfilled `timing.json`. */
  timing_path: string;
  /** Description used to locate the matching subagent transcript. */
  agent_description: string;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Per-invocation tally returned by `recordRuns`.
 *
 * `missingTranscript` is counted independently of the other three: a slot
 * without a transcript may still be recorded or skipped.
 */
export type RecordRunsResult = {
  /** Slots for which a `run.json` was written. */
  recorded: number;
  /** Slots left alone because `run.json` already existed and overwrite was off. */
  skippedExisting: number;
  /** Slots skipped because no final message could be obtained. */
  skippedNoFinalMessage: number;
  /** Slots whose subagent transcript could not be found by description. */
  missingTranscript: number;
};
|
|
37
|
+
|
|
38
|
+
/**
 * Loads the task list from an iteration's `dispatch.json`.
 *
 * A missing or `null` `tasks` key yields an empty list.
 *
 * @throws Error when the file is absent, is not valid JSON, is not a JSON
 *   object, or carries a `tasks` value that is not an array. Every message
 *   names the offending path.
 */
function loadDispatchTasks(dispatchPath: string): DispatchTask[] {
  if (!existsSync(dispatchPath)) {
    throw new Error(
      `${dispatchPath} not found — record-runs assembles records from dispatch.json and only supports runner-built iterations. For hand-authored runs, write run.json + timing.json manually (see schema/run-record.schema.json).`,
    );
  }
  const raw = readFileSync(dispatchPath, "utf8");
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    // Re-throw with the path: a bare SyntaxError does not say which file is bad.
    throw new Error(
      `${dispatchPath} is not valid JSON: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error(`${dispatchPath} must contain a JSON object`);
  }
  const tasks = (parsed as { tasks?: unknown }).tasks;
  if (tasks === undefined || tasks === null) return [];
  if (!Array.isArray(tasks)) {
    throw new Error(`${dispatchPath}: "tasks" must be an array`);
  }
  return tasks as DispatchTask[];
}

/**
 * Picks the `final_message` for a slot: the trimmed contents of
 * `<outputsDir>/final-message.md` when that file has non-blank content,
 * otherwise the transcript's last assistant text (with a warning), otherwise
 * `null`.
 */
function resolveFinalMessage(
  slot: string,
  outputsDir: string,
  transcriptText: string | null | undefined,
): string | null {
  const finalMessagePath = join(outputsDir, "final-message.md");
  const fileExists = existsSync(finalMessagePath);
  if (fileExists) {
    const fromFile = readFileSync(finalMessagePath, "utf8").trim();
    // A whitespace-only file would produce a blank record; treat it as absent.
    if (fromFile !== "") return fromFile;
  }
  if (transcriptText) {
    console.warn(
      `warn ${slot}: ${finalMessagePath} ${fileExists ? "is empty" : "missing"} — using the transcript's last assistant text as final_message`,
    );
    return transcriptText;
  }
  return null;
}

/**
 * Reports whether an existing `timing.json` was itself backfilled from a
 * transcript (`source === "transcript"`). Unreadable or unparseable files
 * count as "not transcript-derived" so they are never clobbered.
 */
function isTranscriptDerivedTiming(timingPath: string): boolean {
  try {
    const parsed: unknown = JSON.parse(readFileSync(timingPath, "utf8"));
    return (
      typeof parsed === "object" &&
      parsed !== null &&
      (parsed as { source?: unknown }).source === "transcript"
    );
  } catch {
    return false;
  }
}

/**
 * Assembles a schema-valid `run.json` (and backfills `timing.json`) for every
 * task in the iteration's `dispatch.json`, from sources already on disk:
 *
 * - carry-over fields (`prompt` ← `user_prompt`, `files` ← `fixtures`,
 *   `eval_id`/`condition`/`skill_path`) from `dispatch.json`;
 * - `final_message` from `<outputs_dir>/final-message.md` (the dispatch prompt
 *   instructs the subagent to write it), falling back to the transcript's last
 *   assistant text when that file is missing or blank;
 * - `tool_invocations`, tokens, and duration from the persisted Claude Code
 *   transcript (Claude-Code-tier, like fill-transcripts — transcript-less
 *   harnesses keep hand-authoring these records).
 *
 * Existing records win: an agent/operator-written `run.json` is skipped
 * without `overwrite`. `timing.json` is backfill-only — with `overwrite` a
 * previously transcript-derived `timing.json` is refreshed, but
 * completion-event numbers captured at dispatch time are never replaced by
 * transcript-derived ones, which include cache accounting and are not
 * comparable 1:1.
 *
 * A slot with no obtainable final message is skipped entirely, including its
 * timing backfill.
 *
 * @param opts.iterationDir Directory containing `dispatch.json`.
 * @param opts.subagentsDir Directory searched for subagent transcripts.
 * @param opts.overwrite Replace existing `run.json` files (default `false`).
 * @returns Counts of recorded and skipped slots and of missing transcripts.
 * @throws Error when `dispatch.json` is missing or malformed; errors raised by
 *   schema validation or file writes also propagate.
 */
export function recordRuns(opts: {
  iterationDir: string;
  subagentsDir: string;
  overwrite?: boolean;
}): RecordRunsResult {
  const { iterationDir, subagentsDir, overwrite = false } = opts;

  const tasks = loadDispatchTasks(join(iterationDir, "dispatch.json"));

  const result: RecordRunsResult = {
    recorded: 0,
    skippedExisting: 0,
    skippedNoFinalMessage: 0,
    missingTranscript: 0,
  };

  for (const task of tasks) {
    const slot = `${task.eval_id}/${task.condition}`;

    const subagent = findByDescription(subagentsDir, task.agent_description);
    const summary = subagent ? parseTranscriptFull(subagent.jsonlPath) : null;
    if (!subagent) {
      console.warn(
        `miss ${slot}: no subagent transcript with description='${task.agent_description}'`,
      );
      result.missingTranscript++;
    }

    // run.json — skip if the agent/operator already wrote one.
    if (existsSync(task.run_record_path) && !overwrite) {
      console.log(
        `skip ${slot}: run.json already exists (use --overwrite to replace)`,
      );
      result.skippedExisting++;
    } else {
      const finalMessage = resolveFinalMessage(
        slot,
        task.outputs_dir,
        summary?.final_text,
      );
      if (finalMessage === null) {
        console.warn(
          `skip ${slot}: no final-message.md and no transcript text — was this task dispatched? Not writing a blank record.`,
        );
        result.skippedNoFinalMessage++;
        // Nothing usable for this slot: skip the timing backfill as well.
        continue;
      }

      const record: RunRecord = {
        eval_id: task.eval_id,
        condition: task.condition,
        skill_path: task.skill_path,
        prompt: task.user_prompt,
        files: task.fixtures,
        final_message: finalMessage,
        tool_invocations: summary?.tool_invocations ?? [],
        // Timing lives in timing.json; run.json never carries it.
        total_tokens: null,
        duration_ms: null,
      };
      validateAgainstSchema<RunRecord>(
        "run-record",
        record,
        task.run_record_path,
      );
      writeFileSync(
        task.run_record_path,
        `${JSON.stringify(record, null, 2)}\n`,
      );
      console.log(
        `record ${slot}: wrote run.json with ${record.tool_invocations.length} tool_invocations`,
      );
      result.recorded++;
    }

    // timing.json — backfill only; completion-event numbers always win.
    if (!summary) continue;
    if (existsSync(task.timing_path)) {
      if (!overwrite) continue;
      if (!isTranscriptDerivedTiming(task.timing_path)) {
        console.log(
          `keep ${slot}: existing timing.json is not transcript-derived — leaving it untouched despite --overwrite`,
        );
        continue;
      }
    }
    const timing: TimingRecord = {
      total_tokens: summary.total_tokens,
      duration_ms: summary.duration_ms,
      source: "transcript",
    };
    writeFileSync(task.timing_path, `${JSON.stringify(timing, null, 2)}\n`);
  }

  return result;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Extracts the record-runs CLI options from `argv`.
 *
 * Recognises `--iteration <value>`, `--subagents-dir <path>` and the boolean
 * `--overwrite`. A value-taking flag whose next token is absent or is itself
 * another `--flag` is treated as missing, so `--iteration --overwrite` reports
 * "missing --iteration" instead of using `--overwrite` as the iteration.
 *
 * Exits the process via `die` when a required flag is missing.
 *
 * @param argv Command-line arguments with the runtime/script prefix removed.
 * @returns The parsed iteration label, subagents directory, and overwrite flag.
 */
function parseArgs(argv: string[]): {
  iteration: string;
  subagentsDir: string;
  overwrite: boolean;
} {
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    if (i === -1) return undefined;
    const value = argv[i + 1];
    // A following token that is itself a flag means the value was omitted.
    if (value === undefined || value.startsWith("--")) return undefined;
    return value;
  };
  const iteration = flag("iteration");
  const subagentsDir = flag("subagents-dir");
  const overwrite = argv.includes("--overwrite");
  if (!iteration) die("missing --iteration");
  if (!subagentsDir)
    die(
      "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  return { iteration, subagentsDir, overwrite };
}
|
|
176
|
+
|
|
177
|
+
// CLI entry point — guarded by `import.meta.main`, so importing this module
// (e.g. from tests) does not trigger it.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const { iteration, subagentsDir, overwrite } = parseArgs(cliArgs);
  const ctx = detectRunContext(cliArgs);

  if (!existsSync(subagentsDir)) {
    die(`subagents-dir not found: ${subagentsDir}`);
  }

  // Iterations live at <workspaceRoot>/<skillName>/iteration-<N>.
  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(`not found: ${iterationDir}`);
  }

  // Any failure inside recordRuns becomes a one-line fatal error.
  const tally = ((): RecordRunsResult => {
    try {
      return recordRuns({ iterationDir, subagentsDir, overwrite });
    } catch (err) {
      return die(err instanceof Error ? err.message : String(err));
    }
  })();

  const {
    recorded,
    skippedExisting,
    skippedNoFinalMessage,
    missingTranscript,
  } = tally;
  console.log(
    `\nRecorded: ${recorded}, skipped (existing run.json): ${skippedExisting}, skipped (no final message): ${skippedNoFinalMessage}, missing transcript: ${missingTranscript}`,
  );
  if (missingTranscript > 0) {
    console.warn(
      "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json. Those slots got empty tool_invocations (transcript_check assertions will grade unverifiable) and no transcript-derived timing.",
    );
  }

  // Both follow-up commands take the same skill/iteration selector.
  const selector = `--skill ${ctx.skillName} --iteration ${iteration}`;
  console.log(
    `\nNext: bun run evals:detect-stray-writes -- ${selector}\nThen: bun run evals:grade -- ${selector}`,
  );
}
|