@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import {
  copyFileSync,
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  rmSync,
  writeFileSync,
} from "node:fs";
import { join } from "node:path";
import { detectRunContext } from "./context";
import type { ConditionsRecord } from "./types";
|
|
13
|
+
|
|
14
|
+
/**
 * Reports a fatal problem and stops the program.
 *
 * Writes `error: <msg>` through `console.error`, then ends the process with
 * exit status 1. Typed as `never`, so the compiler treats code after a call as
 * unreachable.
 *
 * @param msg - Human-readable description of what went wrong.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
|
|
18
|
+
|
|
19
|
+
/**
 * Makes sure `path` exists as a directory, creating any missing parents.
 *
 * `mkdirSync` with `recursive: true` is already a no-op for an existing
 * directory, so no separate existence check is made (a check-then-create
 * sequence would also race with concurrent writers). If `path` exists but is
 * not a directory, the underlying `mkdirSync` error is propagated instead of
 * being silently accepted.
 *
 * @param path - Directory to create.
 * @throws The `mkdirSync` error when the directory cannot be created.
 */
function ensureDir(path: string): void {
  mkdirSync(path, { recursive: true });
}
|
|
22
|
+
|
|
23
|
+
/**
 * Resolves the abbreviated commit hash of `HEAD` for provenance purposes.
 *
 * Runs `git rev-parse --short HEAD` synchronously in `cwd` with stderr
 * discarded. This is best-effort: a non-zero exit status or any exception
 * raised while spawning yields the placeholder `"unknown"` rather than an
 * error.
 *
 * @param cwd - Working directory in which git is invoked.
 * @returns The trimmed stdout of the git command, or `"unknown"`.
 */
function gitHead(cwd: string): string {
  const fallback = "unknown";
  try {
    const proc = Bun.spawnSync(["git", "rev-parse", "--short", "HEAD"], {
      cwd,
      stdout: "pipe",
      stderr: "ignore",
    });
    if (proc.exitCode !== 0) return fallback;
    return proc.stdout.toString().trim();
  } catch {
    // Not a git repo / git unavailable: provenance is still useful without it.
    return fallback;
  }
}
|
|
36
|
+
|
|
37
|
+
/** Inputs accepted by `promoteBaseline`. */
export type PromoteOptions = {
  /** Root of the eval workspace; iterations are read from `<workspaceRoot>/<skillName>/iteration-<iteration>`. */
  workspaceRoot: string;
  /** Skill name: selects the workspace subdirectory and titles `BASELINE.md`. */
  skillName: string;
  /** Skill directory that receives the output under `evals/baseline/`. */
  skillSubdir: string;
  /** Iteration identifier, i.e. the `<N>` in `iteration-<N>`. */
  iteration: string;
  /** Harness name, recorded verbatim in the provenance table. */
  harness: string;
  /** Optional free-form label for the provenance table; `null` is rendered as "(none)". */
  label: string | null;
  /**
   * Models as declared by the operator, kept for provenance only. The runner
   * never dispatches the agent/judge itself, so it cannot observe these; it
   * records what it is told. `null` is rendered as "unspecified".
   */
  agentModel: string | null;
  judgeModel: string | null;
  /** Directory in which git is run to resolve the commit recorded as "Promoted from commit". */
  gitCwd: string;
};
|
|
53
|
+
|
|
54
|
+
/**
 * Promotes one graded workspace iteration to the skill's committed baseline.
 *
 * Copies the durable, reference-worthy subset of
 * `<workspaceRoot>/<skillName>/iteration-<iteration>` into the skill's
 * version-controlled `<skillSubdir>/evals/baseline/` directory: the aggregate
 * `benchmark.json`, every per-run `grading.json` (stored as
 * `grading/<eval-id>__<condition>.json`), and a generated `BASELINE.md`
 * provenance file. Ephemeral scaffolding (dispatch files, timing, full run
 * records, produced outputs, transcripts) is intentionally left behind in the
 * gitignored workspace.
 *
 * Grading JSON files left over from an earlier promotion are deleted first so
 * that `grading/` always mirrors exactly the iteration being promoted. Other
 * files in the baseline directory are not touched.
 *
 * Failure modes:
 * - missing iteration directory or `benchmark.json`: the process exits via
 *   `die`;
 * - `conditions.json` absent, or lacking some fields: the affected provenance
 *   values are written as "unknown";
 * - `conditions.json` present but not valid JSON: `JSON.parse` throws before
 *   anything is written.
 *
 * @param opts - Locations of the workspace and skill, plus provenance values.
 * @returns The baseline directory written to and the number of grading files
 *   copied into it.
 */
export function promoteBaseline(opts: PromoteOptions): {
  baselineDir: string;
  gradingsCopied: number;
} {
  const iterationDir = join(
    opts.workspaceRoot,
    opts.skillName,
    `iteration-${opts.iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(
      `not found: ${iterationDir} (build/grade iteration-${opts.iteration} first)`,
    );
  }

  const benchmarkSrc = join(iterationDir, "benchmark.json");
  if (!existsSync(benchmarkSrc)) {
    die(
      `missing benchmark.json in iteration-${opts.iteration} — run 'evals:aggregate' before promoting`,
    );
  }

  const conditionsSrc = join(iterationDir, "conditions.json");
  const conditions: ConditionsRecord | null = existsSync(conditionsSrc)
    ? JSON.parse(readFileSync(conditionsSrc, "utf8"))
    : null;

  // Resolve every provenance value before the baseline is modified, and
  // tolerate a conditions.json without a `conditions` array, so an
  // unexpectedly shaped file cannot abort a half-written promotion.
  const mode = conditions?.mode ?? "unknown";
  const timestamp = conditions?.timestamp ?? "unknown";
  const declaredConditions = conditions?.conditions;
  const conditionNames = Array.isArray(declaredConditions)
    ? declaredConditions.map((c) => c.name)
    : [];
  const head = gitHead(opts.gitCwd);

  const baselineDir = join(opts.skillSubdir, "evals", "baseline");
  const gradingDir = join(baselineDir, "grading");
  ensureDir(gradingDir);

  // Drop gradings from a previous promotion; otherwise files for evals or
  // conditions that no longer exist would linger next to the new benchmark.
  for (const stale of readdirSync(gradingDir, { withFileTypes: true })) {
    if (stale.isFile() && stale.name.endsWith(".json")) {
      rmSync(join(gradingDir, stale.name));
    }
  }

  copyFileSync(benchmarkSrc, join(baselineDir, "benchmark.json"));

  // Workspace layout: iteration-<N>/eval-<id>/<condition>/grading.json
  let gradingsCopied = 0;
  for (const entry of readdirSync(iterationDir, { withFileTypes: true })) {
    if (!entry.isDirectory() || !entry.name.startsWith("eval-")) continue;
    const evalId = entry.name.slice("eval-".length);
    const evalDir = join(iterationDir, entry.name);
    for (const cond of readdirSync(evalDir, { withFileTypes: true })) {
      if (!cond.isDirectory()) continue;
      const gradingSrc = join(evalDir, cond.name, "grading.json");
      // Condition directories without a grading.json are skipped.
      if (!existsSync(gradingSrc)) continue;
      copyFileSync(
        gradingSrc,
        join(gradingDir, `${evalId}__${cond.name}.json`),
      );
      gradingsCopied++;
    }
  }

  const provenance = [
    `# Baseline — ${opts.skillName}`,
    "",
    "Committed reference output from a canonical eval run. Regenerate with",
    "`bun run evals:promote-baseline -- --skill " +
      `${opts.skillName} --iteration <N>` +
      "` after aggregating. The ephemeral workspace (run records, timing,",
    "dispatch files, produced outputs) stays gitignored under `skills-workspace/`.",
    "",
    "| Field | Value |",
    "|-------|-------|",
    `| Mode | ${mode} |`,
    `| Iteration | iteration-${opts.iteration} |`,
    `| Harness | ${opts.harness} |`,
    `| Agent model | ${opts.agentModel ?? "unspecified"} |`,
    `| Judge model | ${opts.judgeModel ?? "unspecified"} |`,
    `| Conditions | ${conditionNames.join(", ") || "unknown"} |`,
    `| Run timestamp | ${timestamp} |`,
    `| Label | ${opts.label ?? "(none)"} |`,
    `| Promoted from commit | ${head} |`,
    "",
    "Files:",
    "- `benchmark.json` — aggregate pass-rate / duration / token deltas.",
    "- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.",
    "",
  ].join("\n");
  writeFileSync(join(baselineDir, "BASELINE.md"), `${provenance}\n`);

  return { baselineDir, gradingsCopied };
}
|
|
146
|
+
|
|
147
|
+
/**
 * Looks up the value supplied for a `--flag <value>` pair in `argv`.
 *
 * @param argv - Command-line tokens to search.
 * @param flag - Flag name including its leading dashes, e.g. `--label`.
 * @returns The token following the first occurrence of `flag`, or `undefined`
 *   when the flag is absent, is the last token, or is directly followed by
 *   another `--option` (so a flag given without a value never swallows the
 *   next flag as its value).
 */
function flagValue(
  argv: readonly string[],
  flag: string,
): string | undefined {
  const idx = argv.indexOf(flag);
  if (idx === -1) return undefined;
  const value = argv[idx + 1];
  return value === undefined || value.startsWith("--") ? undefined : value;
}

// CLI entry point. `import.meta.main` is Bun's "this file was run directly"
// check, so importing the module for `promoteBaseline` does not trigger it.
// Required: --iteration <N>. Optional provenance flags: --label,
// --agent-model, --judge-model. The remaining context (workspace root, skill
// name/directory, harness) is taken from whatever `detectRunContext` returns.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);
  let ctx: ReturnType<typeof detectRunContext>;
  try {
    ctx = detectRunContext(argv);
  } catch (err) {
    // Report context-detection failures as a one-line CLI error.
    die(err instanceof Error ? err.message : String(err));
  }

  const iteration = flagValue(argv, "--iteration");
  if (!iteration) die("missing --iteration <N>");

  const { baselineDir, gradingsCopied } = promoteBaseline({
    workspaceRoot: ctx.workspaceRoot,
    skillName: ctx.skillName,
    skillSubdir: ctx.skillSubdir,
    iteration,
    harness: ctx.harness,
    label: flagValue(argv, "--label") ?? null,
    agentModel: flagValue(argv, "--agent-model") ?? null,
    judgeModel: flagValue(argv, "--judge-model") ?? null,
    // The skill directory doubles as the git working directory for provenance.
    gitCwd: ctx.skillSubdir,
  });

  console.log(
    `Promoted baseline for ${ctx.skillName} → ${baselineDir} (benchmark.json + ${gradingsCopied} grading file${gradingsCopied === 1 ? "" : "s"} + BASELINE.md)`,
  );
}
|