@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { detectRunContext } from "./context";
|
|
5
|
+
import type { ConditionsRecord, GradingResult, TimingRecord } from "./types";
|
|
6
|
+
|
|
7
|
+
/**
 * Report a fatal error and stop the script.
 *
 * @param msg - Failure description; printed to stderr with an `error: ` prefix.
 * @returns Never returns: the process exits with status 1.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  process.exit(1);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Extract the flags this script reads directly from the CLI arguments.
 *
 * Only `--iteration <id>` is consumed here; every other argument is ignored.
 *
 * @param argv - CLI arguments without the runtime/script prefix.
 * @returns `{ iteration }` holding the raw string that followed `--iteration`.
 *   Calls `die` (which does not return) when the flag is absent or has no
 *   usable value.
 */
function parseArgs(argv: string[]) {
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    if (i === -1) return undefined;
    const value = argv[i + 1];
    // Without this check `--iteration --skill foo` would silently treat
    // "--skill" as the iteration id and point at a non-existent directory.
    if (value === undefined || value.startsWith("--"))
      die(`flag --${name} requires a value`);
    return value;
  };
  const iteration = flag("iteration");
  if (!iteration) die("missing --iteration");
  return { iteration };
}
|
|
22
|
+
|
|
23
|
+
// A flat list of numeric samples consumed by the statistics helpers below.
type Series = number[];
|
|
24
|
+
|
|
25
|
+
/**
 * Arithmetic mean of a series.
 *
 * @param values - Samples to average.
 * @returns The average, or 0 when the series is empty.
 */
function mean(values: Series): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Standard deviation of a series, dividing the summed squared deviations by
 * the number of samples (the population form).
 *
 * @param values - Samples to measure.
 * @param m - Mean of `values`; computed with `mean` when omitted.
 * @returns The deviation, or 0 when there are fewer than two samples.
 */
function stddev(values: Series, m = mean(values)): number {
  const count = values.length;
  if (count < 2) {
    return 0;
  }
  let squaredSum = 0;
  values.forEach((x) => {
    squaredSum += (x - m) ** 2;
  });
  return Math.sqrt(squaredSum / count);
}
|
|
35
|
+
|
|
36
|
+
/**
 * Round a number to a fixed count of decimal places.
 *
 * @param n - Value to round.
 * @param dp - Number of decimal places to keep.
 * @returns `n` scaled by 10^dp, rounded with `Math.round`, and scaled back.
 */
function round(n: number, dp: number): number {
  const scale = 10 ** dp;
  const scaled = Math.round(n * scale);
  return scaled / scale;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Summarise a series as rounded mean, rounded standard deviation and count.
 *
 * @param values - Samples to summarise.
 * @param dp - Decimal places applied to both `mean` and `stddev`.
 * @returns `{ mean, stddev, n }` where `n` is the number of samples.
 */
function stats(values: Series, dp: number) {
  const average = mean(values);
  const spread = stddev(values, average);
  return {
    mean: round(average, dp),
    stddev: round(spread, dp),
    n: values.length,
  };
}
|
|
49
|
+
|
|
50
|
+
// --- Locate the iteration directory and load what it says was compared. ---
const aggArgv = Bun.argv.slice(2);
const { iteration } = parseArgs(aggArgv);
const aggCtx = detectRunContext(aggArgv);

// Layout: <workspaceRoot>/<skillName>/iteration-<iteration>/
const iterationDir = join(
  aggCtx.workspaceRoot,
  aggCtx.skillName,
  `iteration-${iteration}`,
);
if (!existsSync(iterationDir)) {
  die(`not found: ${iterationDir}`);
}

const conditionsPath = join(iterationDir, "conditions.json");
if (!existsSync(conditionsPath)) {
  die(`missing: ${conditionsPath}`);
}
const conditions: ConditionsRecord = JSON.parse(
  readFileSync(conditionsPath, "utf8"),
);

// The delta computed later is a pairwise difference, so exactly two
// conditions are required.
const conditionNames = conditions.conditions.map(({ name }) => name);
if (conditionNames.length !== 2) {
  die(`expected exactly 2 conditions, got ${conditionNames.length}`);
}

const evalDirs = readdirSync(iterationDir).filter((entry) =>
  entry.startsWith("eval-"),
);
if (evalDirs.length === 0) {
  die("no eval directories found");
}
|
|
71
|
+
|
|
72
|
+
/** Raw per-run samples collected for one condition before summarising. */
type Bucket = {
  /** `summary.pass_rate` of every grading found for the condition. */
  passRates: Series;
  /** `duration_ms` of every timing record that carried a numeric value. */
  durations: Series;
  /** `total_tokens` of every timing record that carried a numeric value. */
  tokens: Series;
  /** `meta_summary.skill_invoked` of every grading that reported it. */
  skillInvoked: boolean[];
  /** True when the condition declares a non-empty `skill_path`. */
  hadSkillLoaded: boolean;
};

// One empty bucket per declared condition, keyed by condition name.
// (The previous `conditionSkillPaths` map was written but never read, so it
// has been dropped; `hadSkillLoaded` already captures what the script needs.)
const byCondition: Record<string, Bucket> = {};
for (const { name, skill_path } of conditions.conditions) {
  byCondition[name] = {
    passRates: [],
    durations: [],
    tokens: [],
    skillInvoked: [],
    hadSkillLoaded: Boolean(skill_path),
  };
}
|
|
91
|
+
|
|
92
|
+
// Walk every eval × condition cell and pour its grading/timing numbers into
// the matching bucket. A cell without grading.json is counted and skipped.
let missingGradings = 0;
for (const evalDir of evalDirs) {
  for (const cond of conditionNames) {
    const runDir = join(iterationDir, evalDir, cond);
    const gradingFile = join(runDir, "grading.json");
    if (!existsSync(gradingFile)) {
      console.warn(`warn: missing grading for ${evalDir}/${cond}`);
      missingGradings += 1;
      continue;
    }

    const bucket = byCondition[cond];
    const grading: GradingResult = JSON.parse(
      readFileSync(gradingFile, "utf8"),
    );
    bucket.passRates.push(grading.summary.pass_rate);

    // skill_invoked is optional; only recorded values count toward the rate.
    const invoked = grading.meta_summary?.skill_invoked;
    if (invoked != null) {
      bucket.skillInvoked.push(invoked);
    }

    // Timing is optional per run, and each field is only used when numeric.
    const timingFile = join(runDir, "timing.json");
    if (!existsSync(timingFile)) {
      continue;
    }
    const timing: TimingRecord = JSON.parse(readFileSync(timingFile, "utf8"));
    const { total_tokens: totalTokens, duration_ms: durationMs } = timing;
    if (typeof totalTokens === "number") {
      bucket.tokens.push(totalTokens);
    }
    if (typeof durationMs === "number") {
      bucket.durations.push(durationMs);
    }
  }
}
|
|
118
|
+
|
|
119
|
+
/** Per-condition figures written under `run_summary` in benchmark.json. */
type ConditionSummary = {
  pass_rate: ReturnType<typeof stats>;
  duration_ms: ReturnType<typeof stats>;
  total_tokens: ReturnType<typeof stats>;
  /** Share of checked runs that invoked the skill; null when none were checked. */
  skill_invocation_rate?: number | null;
  /** Number of runs that reported `skill_invoked`. */
  skill_invocation_n?: number;
};

// Collapse each bucket into rounded statistics. Invocation figures are only
// attached to conditions that had a skill loaded.
const runSummary: Record<string, ConditionSummary> = {};
for (const cond of conditionNames) {
  const { passRates, durations, tokens, skillInvoked, hadSkillLoaded } =
    byCondition[cond];
  const summary: ConditionSummary = {
    pass_rate: stats(passRates, 3),
    duration_ms: stats(durations, 0),
    total_tokens: stats(tokens, 0),
  };
  if (hadSkillLoaded) {
    const checked = skillInvoked.length;
    summary.skill_invocation_n = checked;
    if (checked === 0) {
      summary.skill_invocation_rate = null;
    } else {
      const hits = skillInvoked.filter(Boolean).length;
      summary.skill_invocation_rate = round(hits / checked, 3);
    }
  }
  runSummary[cond] = summary;
}
|
|
148
|
+
|
|
149
|
+
// Difference of the rounded means, first condition minus second, in the
// order the conditions appear in conditions.json.
const [a, b] = conditionNames;
const delta = (() => {
  const first = runSummary[a];
  const second = runSummary[b];
  return {
    direction: `${a} - ${b}`,
    pass_rate: round(first.pass_rate.mean - second.pass_rate.mean, 3),
    duration_ms: round(first.duration_ms.mean - second.duration_ms.mean, 0),
    total_tokens: round(first.total_tokens.mean - second.total_tokens.mean, 0),
  };
})();
|
|
165
|
+
|
|
166
|
+
// A condition that had the skill loaded but did not invoke it on every
// checked run is flagged: its numbers may not reflect the skill.
const validityWarnings: string[] = [];
for (const cond of conditionNames) {
  const { skill_invocation_rate: rate, skill_invocation_n: checked } =
    runSummary[cond];
  if (rate == null || !(rate < 1)) {
    continue;
  }
  validityWarnings.push(
    `condition '${cond}' had skill loaded but invocation rate ${(rate * 100).toFixed(0)}% (${checked} runs checked) — substantive results may not reflect skill effectiveness.`,
  );
}
|
|
175
|
+
|
|
176
|
+
// Stray-write findings (from `evals:detect-stray-writes`, if it ran) taint a
// run the same way a missed skill invocation does: a subagent that edited the
// real repo or installed packages is no longer a clean data point.
const strayPath = join(iterationDir, "stray-writes.json");
if (existsSync(strayPath)) {
  try {
    const stray = JSON.parse(readFileSync(strayPath, "utf8")) as {
      runs?: Array<{
        eval_id: string;
        condition: string;
        violations?: unknown[];
      }>;
    };
    for (const r of stray.runs ?? []) {
      const n = r.violations?.length ?? 0;
      if (n > 0)
        validityWarnings.push(
          `${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
        );
    }
  } catch (err) {
    // A malformed report must not fail aggregation, but it must not pass
    // unnoticed either: otherwise the benchmark looks clean when the taint
    // data was simply unreadable.
    const reason = err instanceof Error ? err.message : String(err);
    console.warn(`warn: ignoring malformed ${strayPath}: ${reason}`);
  }
}
|
|
200
|
+
|
|
201
|
+
// Assemble the report, write it next to the iteration's inputs, then echo
// the caveats (missing gradings, validity warnings) to the console.
const benchmark = {
  generated: new Date().toISOString(),
  mode: conditions.mode,
  baseline: conditions.baseline,
  conditions_compared: [a, b],
  missing_gradings: missingGradings,
  validity_warnings: validityWarnings,
  run_summary: runSummary,
  delta,
};

const outPath = join(iterationDir, "benchmark.json");
writeFileSync(outPath, `${JSON.stringify(benchmark, null, 2)}\n`);
console.log(`Wrote ${outPath}`);

if (missingGradings > 0) {
  console.warn(
    `note: ${missingGradings} grading.json file(s) were missing — benchmark is incomplete.`,
  );
}

validityWarnings.forEach((warning) => {
  console.warn(`⚠ ${warning}`);
});

// The all-clear lines are printed only when nothing at all was flagged.
if (validityWarnings.length === 0) {
  conditionNames
    .filter((cond) => runSummary[cond].skill_invocation_rate === 1)
    .forEach((cond) => {
      console.log(
        `✓ ${cond}: skill invocation rate 100% (${runSummary[cond].skill_invocation_n} runs) — substantive results are valid.`,
      );
    });
}
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
+
import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { join, resolve } from "node:path";
|
|
5
|
+
import { detectRunContext } from "./context";
|
|
6
|
+
|
|
7
|
+
// Scratch directory for this test file, under the OS temp dir and keyed by
// the current process id.
const FIXTURE_ROOT = join(tmpdir(), `slow-powers-context-test-${process.pid}`);

/**
 * Path of a named fixture directory beneath FIXTURE_ROOT.
 *
 * @param name - Fixture folder name.
 * @returns `FIXTURE_ROOT/<name>`; the directory itself is not created here.
 */
function fixturePath(name: string): string {
  const location = join(FIXTURE_ROOT, name);
  return location;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Create `<root>/skill-dir` containing one sub-directory per skill, each with
 * a minimal SKILL.md (front matter naming the skill, then a one-line body).
 *
 * @param root - Directory under which `skill-dir` is created (recursively).
 * @param skills - Names of the skill sub-directories to create.
 * @returns Path of the created `skill-dir`.
 */
function makeSkillDir(root: string, skills: string[]): string {
  const dir = join(root, "skill-dir");
  mkdirSync(dir, { recursive: true });
  skills.forEach((name) => {
    const skillFolder = join(dir, name);
    mkdirSync(skillFolder, { recursive: true });
    const manifest = join(skillFolder, "SKILL.md");
    writeFileSync(
      manifest,
      `---\nname: ${name}\ndescription: ${name} skill\n---\n\nbody\n`,
    );
  });
  return dir;
}
|
|
26
|
+
|
|
27
|
+
// Create the shared scratch directory once before the suite runs...
beforeAll(function createFixtureRoot() {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

// ...and remove it, with everything the tests wrote, once the suite is done.
afterAll(function removeFixtureRoot() {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});
|
|
34
|
+
|
|
35
|
+
describe("detectRunContext", () => {
  // The two required flags, followed by any extra CLI arguments.
  const baseArgs = (
    skillDir: string,
    skill: string,
    ...extra: string[]
  ): string[] => ["--skill-dir", skillDir, "--skill", skill, ...extra];

  test("dies when --skill-dir is missing", () => {
    expect(() => detectRunContext(["--skill", "foo"])).toThrow(/--skill-dir/);
  });

  test("dies when --skill is missing", () => {
    const skillDir = makeSkillDir(fixturePath("missing-skill"), ["foo"]);
    // The lookahead stops a `--skill-dir` error from satisfying this check;
    // a bare /--skill/ would pass even if the wrong validation had fired.
    expect(() => detectRunContext(["--skill-dir", skillDir])).toThrow(
      /--skill(?!-dir)/,
    );
  });

  test("dies when --skill-dir is not a directory", () => {
    expect(() =>
      detectRunContext(baseArgs("/nonexistent/does-not-exist-12345", "foo")),
    ).toThrow(/--skill-dir/);
  });

  test("dies when skill subdir does not exist", () => {
    const skillDir = makeSkillDir(fixturePath("missing-subdir"), ["foo"]);
    expect(() => detectRunContext(baseArgs(skillDir, "bar"))).toThrow(
      /skill not found/,
    );
  });

  test("dies when --bootstrap path is passed but file does not exist", () => {
    const skillDir = makeSkillDir(fixturePath("bad-bootstrap"), ["foo"]);
    expect(() =>
      detectRunContext(
        baseArgs(
          skillDir,
          "foo",
          "--bootstrap",
          "/nonexistent/no-bootstrap-12345.md",
        ),
      ),
    ).toThrow(/--bootstrap/);
  });

  test("returns RunContext with absolute paths when --skill-dir and --skill are valid", () => {
    const skillDir = makeSkillDir(fixturePath("happy-path"), ["mr-review"]);
    const ctx = detectRunContext(baseArgs(skillDir, "mr-review"));
    expect(ctx.skillDir).toBe(resolve(skillDir));
    expect(ctx.skillName).toBe("mr-review");
    expect(ctx.skillSubdir).toBe(resolve(skillDir, "mr-review"));
    expect(ctx.siblingSkillNames).toEqual([]);
    expect(ctx.bootstrapPath).toBeNull();
    expect(ctx.harness).toBe("claude-code");
  });

  test("enumerates siblings excluding the skill-under-test", () => {
    const skillDir = makeSkillDir(fixturePath("siblings"), [
      "alpha",
      "beta",
      "gamma",
    ]);
    const ctx = detectRunContext(baseArgs(skillDir, "beta"));
    // Directory listing order is not asserted; sort a copy so the returned
    // context is not mutated by the assertion.
    expect([...ctx.siblingSkillNames].sort()).toEqual(["alpha", "gamma"]);
  });

  test("ignores entries in --skill-dir that do not have a SKILL.md", () => {
    const skillDir = makeSkillDir(fixturePath("not-skills"), ["real"]);
    mkdirSync(join(skillDir, "node_modules"), { recursive: true });
    mkdirSync(join(skillDir, "no-skill-md-here"), { recursive: true });
    writeFileSync(join(skillDir, "loose-file.txt"), "hello");
    const ctx = detectRunContext(baseArgs(skillDir, "real"));
    expect(ctx.siblingSkillNames).toEqual([]);
  });

  test("workspaceRoot defaults to <CWD>/skills-workspace when --workspace-dir is omitted", () => {
    const skillDir = makeSkillDir(fixturePath("workspace-default"), ["foo"]);
    const ctx = detectRunContext(baseArgs(skillDir, "foo"));
    expect(ctx.workspaceRoot).toBe(resolve(process.cwd(), "skills-workspace"));
  });

  test("workspaceRoot honors --workspace-dir override (resolved absolute)", () => {
    const root = fixturePath("workspace-override");
    const skillDir = makeSkillDir(root, ["foo"]);
    const customWs = join(root, "custom-ws");
    mkdirSync(customWs, { recursive: true });
    const ctx = detectRunContext(
      baseArgs(skillDir, "foo", "--workspace-dir", customWs),
    );
    expect(ctx.workspaceRoot).toBe(resolve(customWs));
  });

  test("stageRoot defaults to CWD", () => {
    const skillDir = makeSkillDir(fixturePath("stage-default"), ["foo"]);
    const ctx = detectRunContext(baseArgs(skillDir, "foo"));
    expect(ctx.stageRoot).toBe(resolve(process.cwd()));
  });

  test("--bootstrap path is resolved absolute when file exists", () => {
    const root = fixturePath("bootstrap-ok");
    const skillDir = makeSkillDir(root, ["foo"]);
    const bootstrapPath = join(root, "my-bootstrap.md");
    writeFileSync(bootstrapPath, "BOOT");
    const ctx = detectRunContext(
      baseArgs(skillDir, "foo", "--bootstrap", bootstrapPath),
    );
    expect(ctx.bootstrapPath).toBe(resolve(bootstrapPath));
  });

  test("unknown --harness value is rejected", () => {
    const skillDir = makeSkillDir(fixturePath("harness-bad"), ["foo"]);
    expect(() =>
      detectRunContext(baseArgs(skillDir, "foo", "--harness", "vscode")),
    ).toThrow(/harness/);
  });
});
|
|
177
|
+
|
|
178
|
+
// Sanity check (currently skipped): the `existsSync` imported from node:fs
// is a callable function.
test.skip("smoke: existsSync points at node:fs", () => {
  const kind = typeof existsSync;
  expect(kind).toBe("function");
});
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { existsSync, readdirSync, statSync } from "node:fs";
|
|
2
|
+
import { resolve } from "node:path";
|
|
3
|
+
|
|
4
|
+
// Identifier of the agent harness a run targets; "claude-code" is the only accepted value.
export type Harness = "claude-code";
|
|
5
|
+
|
|
6
|
+
/** Resolved settings for one runner invocation, derived from CLI flags. */
export type RunContext = {
  /** Absolute path of the directory given via `--skill-dir`. */
  skillDir: string;
  /** Value of `--skill`: the name of the skill under test. */
  skillName: string;
  /** Absolute path `<skillDir>/<skillName>`; it contains a SKILL.md. */
  skillSubdir: string;
  /** Other directories in `skillDir` that contain a SKILL.md. */
  siblingSkillNames: string[];
  /** `--workspace-dir` resolved, or `<cwd>/skills-workspace` by default. */
  workspaceRoot: string;
  /** The current working directory, resolved to an absolute path. */
  stageRoot: string;
  /** Absolute path of the `--bootstrap` file, or null when not passed. */
  bootstrapPath: string | null;
  /** Harness selected via `--harness` (defaults to "claude-code"). */
  harness: Harness;
};
|
|
16
|
+
|
|
17
|
+
/**
 * Abort by throwing, so callers (and tests) can catch the failure.
 *
 * @param msg - Message carried by the thrown Error.
 * @throws Error always.
 */
function die(msg: string): never {
  const failure = new Error(msg);
  throw failure;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Look up the value that follows `--<name>` in an argument list.
 *
 * @param argv - CLI arguments to search.
 * @param name - Flag name without the leading dashes.
 * @returns The argument after the first `--<name>`, or undefined when the
 *   flag does not appear.
 * @throws Error (via `die`) when the flag is last in the list or is followed
 *   by another `--` flag.
 */
function flag(argv: string[], name: string): string | undefined {
  const position = argv.indexOf(`--${name}`);
  if (position < 0) {
    return undefined;
  }
  const value = argv[position + 1];
  if (value !== undefined && !value.startsWith("--")) {
    return value;
  }
  die(`flag --${name} requires a value`);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Build the RunContext for a runner invocation from its CLI arguments.
 *
 * Flags read: `--skill-dir <path>` and `--skill <name>` (both required),
 * `--bootstrap <file>`, `--workspace-dir <path>` and `--harness <name>`
 * (all optional). Every returned path is absolute.
 *
 * @param argv - CLI arguments (without the runtime/script prefix).
 * @returns The resolved context; see RunContext for the individual fields.
 * @throws Error when a required flag is missing, a flag has no value,
 *   `--skill-dir` is not a directory, `<skill-dir>/<skill>/SKILL.md` does not
 *   exist, `--bootstrap` does not point at an existing file, or `--harness`
 *   is not "claude-code".
 */
export function detectRunContext(argv: string[]): RunContext {
  const skillDirRaw = flag(argv, "skill-dir");
  if (!skillDirRaw) die("missing required flag --skill-dir <path>");
  const skillDir = resolve(skillDirRaw);
  if (!existsSync(skillDir) || !statSync(skillDir).isDirectory()) {
    die(`--skill-dir is not a directory: ${skillDir}`);
  }

  const skillName = flag(argv, "skill");
  if (!skillName) die("missing required flag --skill <name>");

  // A skill is identified by the presence of SKILL.md in its sub-directory.
  const skillSubdir = resolve(skillDir, skillName);
  const skillMd = resolve(skillSubdir, "SKILL.md");
  if (!existsSync(skillMd)) {
    die(`skill not found: ${skillMd}`);
  }

  const bootstrapRaw = flag(argv, "bootstrap");
  let bootstrapPath: string | null = null;
  if (bootstrapRaw) {
    const resolved = resolve(bootstrapRaw);
    if (!existsSync(resolved)) {
      die(`--bootstrap file not found: ${resolved}`);
    }
    // Reject a directory here rather than letting a later read of the
    // bootstrap file fail far from the flag that caused it.
    if (statSync(resolved).isDirectory()) {
      die(`--bootstrap is a directory, expected a file: ${resolved}`);
    }
    bootstrapPath = resolved;
  }

  const workspaceRaw = flag(argv, "workspace-dir");
  const workspaceRoot = workspaceRaw
    ? resolve(workspaceRaw)
    : resolve(process.cwd(), "skills-workspace");

  const stageRoot = resolve(process.cwd());

  const harnessRaw = flag(argv, "harness") ?? "claude-code";
  if (harnessRaw !== "claude-code") {
    die(`unknown --harness: ${harnessRaw}. Supported: claude-code`);
  }
  const harness = harnessRaw as Harness;

  // Siblings: every other directory in skillDir that carries a SKILL.md.
  const siblingSkillNames: string[] = [];
  for (const entry of readdirSync(skillDir)) {
    if (entry === skillName) continue;
    const sub = resolve(skillDir, entry);
    // existsSync follows symlinks, so a dangling link is skipped here instead
    // of making statSync throw ENOENT and aborting the whole detection.
    if (!existsSync(sub) || !statSync(sub).isDirectory()) continue;
    if (!existsSync(resolve(sub, "SKILL.md"))) continue;
    siblingSkillNames.push(entry);
  }

  return {
    skillDir,
    skillName,
    skillSubdir,
    siblingSkillNames,
    workspaceRoot,
    stageRoot,
    bootstrapPath,
    harness,
  };
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { detectStrayWrites } from "./detect-stray-writes";
|
|
4
|
+
|
|
5
|
+
// Fixed roots handed to every detectStrayWrites call in this file.
const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
const REPO = "/work/repo";

describe("detectStrayWrites", () => {
  // Run the detector on `calls` against the shared OUTPUTS / REPO roots.
  const scan = (calls: Parameters<typeof detectStrayWrites>[0]) =>
    detectStrayWrites(calls, OUTPUTS, REPO);

  test("a Write inside the outputs dir is clean", () => {
    const target = join(OUTPUTS, "answer.md");
    const { violations, warnings } = scan([
      { name: "Write", args: { file_path: target }, ordinal: 0 },
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });

  test("a Write outside the outputs dir is a violation", () => {
    const target = join(REPO, "runner/run.ts");
    const { violations } = scan([
      { name: "Write", args: { file_path: target }, ordinal: 2 },
    ]);
    expect(violations).toHaveLength(1);
    expect(violations[0]).toMatchObject({
      tool: "Write",
      path: target,
      ordinal: 2,
    });
  });

  // NOTE(review): the title mentions MultiEdit but no MultiEdit call is
  // exercised below — confirm whether one should be added.
  test("an Edit/MultiEdit/NotebookEdit outside outputs is a violation", () => {
    const { violations } = scan([
      { name: "Edit", args: { file_path: "/etc/hosts" }, ordinal: 0 },
      {
        name: "NotebookEdit",
        args: { notebook_path: "/tmp/x.ipynb" },
        ordinal: 1,
      },
    ]);
    const tools = violations.map((v) => v.tool).sort();
    expect(tools).toEqual(["Edit", "NotebookEdit"]);
  });

  test("an install command is a warning", () => {
    const { warnings } = scan([
      { name: "Bash", args: { command: "npm install left-pad" }, ordinal: 0 },
    ]);
    expect(warnings).toHaveLength(1);
    const [first] = warnings;
    expect(first.tool).toBe("Bash");
    expect(first.reason).toMatch(/install/i);
  });

  test("a mutating Bash command scoped to the outputs dir is not flagged", () => {
    const command = `echo hi > ${join(OUTPUTS, "log.txt")}`;
    const { warnings } = scan([
      { name: "Bash", args: { command }, ordinal: 0 },
    ]);
    expect(warnings).toHaveLength(0);
  });

  test("read-only tools are never flagged", () => {
    const { violations, warnings } = scan([
      { name: "Read", args: { file_path: "/anywhere" }, ordinal: 0 },
      { name: "Grep", args: { pattern: "x" }, ordinal: 1 },
      { name: "Bash", args: { command: "ls -la /" }, ordinal: 2 },
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });
});
|