@slowdini/slow-powers-opencode 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bootstrap.md +19 -20
- package/package.json +1 -1
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +2 -2
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +6 -4
- package/skills/evaluating-skills/evals/evals.json +1 -1
- package/skills/evaluating-skills/harness-details/claude.md +24 -1
- package/skills/evaluating-skills/runner/README.md +16 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
- package/skills/evaluating-skills/runner/aggregate.ts +20 -0
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
- package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
- package/skills/evaluating-skills/runner/run.test.ts +488 -24
- package/skills/evaluating-skills/runner/run.ts +281 -66
- package/skills/evaluating-skills/runner/types.ts +8 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
- package/skills/finishing-a-development-branch/SKILL.md +1 -1
- package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
- package/skills/hardening-plans/evals/evals.json +0 -19
- package/skills/systematic-debugging/condition-based-waiting.md +10 -11
- package/skills/systematic-debugging/root-cause-tracing.md +31 -33
- package/skills/working-in-isolation/SKILL.md +58 -0
- package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
- package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
- package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/evals.json +87 -0
- package/skills/writing-skills/SKILL.md +179 -195
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
- package/skills/using-git-worktrees/SKILL.md +0 -70
- package/skills/using-git-worktrees/evals/evals.json +0 -40
- package/skills/writing-skills/graphviz-conventions.dot +0 -172
- package/skills/writing-skills/scripts/render-graphs.js +0 -181
|
@@ -13,11 +13,14 @@ import {
|
|
|
13
13
|
buildDispatchTask,
|
|
14
14
|
cleanupStagedSkills,
|
|
15
15
|
redactSkillFromBootstrap,
|
|
16
|
+
registerStagedSkillForCleanup,
|
|
16
17
|
STAGED_SIBLING_MANIFEST,
|
|
17
18
|
STAGED_SKILL_PREFIX,
|
|
19
|
+
selectEvals,
|
|
18
20
|
stageSiblingSkills,
|
|
19
21
|
stageSkillForCC,
|
|
20
22
|
} from "./run";
|
|
23
|
+
import type { Eval } from "./types";
|
|
21
24
|
|
|
22
25
|
const FIXTURE_ROOT = join(tmpdir(), `slow-powers-run-test-${process.pid}`);
|
|
23
26
|
|
|
@@ -29,6 +32,49 @@ afterAll(() => {
|
|
|
29
32
|
rmSync(FIXTURE_ROOT, { recursive: true, force: true });
|
|
30
33
|
});
|
|
31
34
|
|
|
35
|
+
describe("selectEvals", () => {
|
|
36
|
+
const mkEvals = (...ids: string[]): Eval[] =>
|
|
37
|
+
ids.map((id) => ({ id, prompt: `p-${id}`, expected_output: `o-${id}` }));
|
|
38
|
+
|
|
39
|
+
test("returns the full list unchanged when neither flag is set", () => {
|
|
40
|
+
const evals = mkEvals("a", "b", "c");
|
|
41
|
+
expect(selectEvals(evals, {})).toEqual(evals);
|
|
42
|
+
});
|
|
43
|
+
|
|
44
|
+
test("--only keeps just the named ids, preserving config order", () => {
|
|
45
|
+
const evals = mkEvals("a", "b", "c");
|
|
46
|
+
const got = selectEvals(evals, { only: ["c", "a"] });
|
|
47
|
+
expect(got.map((e) => e.id)).toEqual(["a", "c"]);
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
test("--skip drops the named ids", () => {
|
|
51
|
+
const evals = mkEvals("a", "b", "c");
|
|
52
|
+
const got = selectEvals(evals, { skip: ["b"] });
|
|
53
|
+
expect(got.map((e) => e.id)).toEqual(["a", "c"]);
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
test("throws on an unknown id, listing the unknown and the available ids", () => {
|
|
57
|
+
const evals = mkEvals("a", "b");
|
|
58
|
+
expect(() => selectEvals(evals, { only: ["a", "nope"] })).toThrow(
|
|
59
|
+
/unknown eval id\(s\): nope\. Available ids: a, b/,
|
|
60
|
+
);
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
test("throws when both --only and --skip are given", () => {
|
|
64
|
+
const evals = mkEvals("a", "b");
|
|
65
|
+
expect(() => selectEvals(evals, { only: ["a"], skip: ["b"] })).toThrow(
|
|
66
|
+
/only one of --only \/ --skip/,
|
|
67
|
+
);
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
test("throws when a flag resolves to an empty id list", () => {
|
|
71
|
+
const evals = mkEvals("a", "b");
|
|
72
|
+
expect(() => selectEvals(evals, { only: [] })).toThrow(
|
|
73
|
+
/at least one eval id/,
|
|
74
|
+
);
|
|
75
|
+
});
|
|
76
|
+
});
|
|
77
|
+
|
|
32
78
|
describe("stageSkillForCC", () => {
|
|
33
79
|
test("writes SKILL.md to <repoRoot>/.claude/skills/<slug>/SKILL.md and returns the slug", () => {
|
|
34
80
|
const repoRoot = join(FIXTURE_ROOT, "stage-basic");
|
|
@@ -74,6 +120,92 @@ describe("stageSkillForCC", () => {
|
|
|
74
120
|
const stagedPath = join(repoRoot, ".claude", "skills", slug, "SKILL.md");
|
|
75
121
|
expect(readFileSync(stagedPath, "utf8")).toBe("second");
|
|
76
122
|
});
|
|
123
|
+
|
|
124
|
+
test("stageNameOverride stages under the verbatim name instead of the eval slug", () => {
|
|
125
|
+
const repoRoot = join(FIXTURE_ROOT, "stage-override");
|
|
126
|
+
mkdirSync(repoRoot, { recursive: true });
|
|
127
|
+
const content =
|
|
128
|
+
"---\nname: example\ndescription: example skill\n---\n\nbody\n";
|
|
129
|
+
|
|
130
|
+
const slug = stageSkillForCC({
|
|
131
|
+
content,
|
|
132
|
+
iteration: 2,
|
|
133
|
+
condition: "with_skill",
|
|
134
|
+
skillName: "verification-before-completion",
|
|
135
|
+
repoRoot,
|
|
136
|
+
stageNameOverride: "verification-before-completion",
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
expect(slug).toBe("verification-before-completion");
|
|
140
|
+
const stagedPath = join(repoRoot, ".claude", "skills", slug, "SKILL.md");
|
|
141
|
+
expect(existsSync(stagedPath)).toBe(true);
|
|
142
|
+
expect(readFileSync(stagedPath, "utf8")).toBe(content);
|
|
143
|
+
});
|
|
144
|
+
});
|
|
145
|
+
|
|
146
|
+
describe("registerStagedSkillForCleanup", () => {
|
|
147
|
+
test("appends the custom dir to the manifest so cleanup removes it", () => {
|
|
148
|
+
const root = join(FIXTURE_ROOT, "register-cleanup");
|
|
149
|
+
const skillsDir = join(root, ".claude", "skills");
|
|
150
|
+
mkdirSync(skillsDir, { recursive: true });
|
|
151
|
+
// A sibling manifest already exists (written by stageSiblingSkills).
|
|
152
|
+
writeFileSync(
|
|
153
|
+
join(skillsDir, STAGED_SIBLING_MANIFEST),
|
|
154
|
+
`${JSON.stringify(
|
|
155
|
+
{
|
|
156
|
+
created_at: "x",
|
|
157
|
+
staged_under_test: "verification-before-completion",
|
|
158
|
+
created_entries: [{ name: "sibling-a", preexisting: false }],
|
|
159
|
+
},
|
|
160
|
+
null,
|
|
161
|
+
2,
|
|
162
|
+
)}\n`,
|
|
163
|
+
);
|
|
164
|
+
const customDir = join(skillsDir, "verification-before-completion");
|
|
165
|
+
mkdirSync(customDir, { recursive: true });
|
|
166
|
+
writeFileSync(join(customDir, "SKILL.md"), "staged");
|
|
167
|
+
|
|
168
|
+
registerStagedSkillForCleanup(root, "verification-before-completion");
|
|
169
|
+
|
|
170
|
+
const manifest = JSON.parse(
|
|
171
|
+
readFileSync(join(skillsDir, STAGED_SIBLING_MANIFEST), "utf8"),
|
|
172
|
+
) as { created_entries: Array<{ name: string }> };
|
|
173
|
+
expect(manifest.created_entries.map((e) => e.name).sort()).toEqual([
|
|
174
|
+
"sibling-a",
|
|
175
|
+
"verification-before-completion",
|
|
176
|
+
]);
|
|
177
|
+
|
|
178
|
+
cleanupStagedSkills(root);
|
|
179
|
+
expect(existsSync(customDir)).toBe(false);
|
|
180
|
+
});
|
|
181
|
+
|
|
182
|
+
test("is idempotent — registering the same name twice does not duplicate it", () => {
|
|
183
|
+
const root = join(FIXTURE_ROOT, "register-idempotent");
|
|
184
|
+
const skillsDir = join(root, ".claude", "skills");
|
|
185
|
+
mkdirSync(skillsDir, { recursive: true });
|
|
186
|
+
writeFileSync(
|
|
187
|
+
join(skillsDir, STAGED_SIBLING_MANIFEST),
|
|
188
|
+
`${JSON.stringify(
|
|
189
|
+
{
|
|
190
|
+
created_at: "x",
|
|
191
|
+
staged_under_test: "foo",
|
|
192
|
+
created_entries: [],
|
|
193
|
+
},
|
|
194
|
+
null,
|
|
195
|
+
2,
|
|
196
|
+
)}\n`,
|
|
197
|
+
);
|
|
198
|
+
|
|
199
|
+
registerStagedSkillForCleanup(root, "foo-staged");
|
|
200
|
+
registerStagedSkillForCleanup(root, "foo-staged");
|
|
201
|
+
|
|
202
|
+
const manifest = JSON.parse(
|
|
203
|
+
readFileSync(join(skillsDir, STAGED_SIBLING_MANIFEST), "utf8"),
|
|
204
|
+
) as { created_entries: Array<{ name: string }> };
|
|
205
|
+
expect(
|
|
206
|
+
manifest.created_entries.filter((e) => e.name === "foo-staged").length,
|
|
207
|
+
).toBe(1);
|
|
208
|
+
});
|
|
77
209
|
});
|
|
78
210
|
|
|
79
211
|
describe("cleanupStagedSkills", () => {
|
|
@@ -302,7 +434,7 @@ describe("buildDispatchTask bootstrap injection", () => {
|
|
|
302
434
|
expect(task.dispatch_prompt).not.toContain("<session-start-context>");
|
|
303
435
|
});
|
|
304
436
|
|
|
305
|
-
test("emits <session-start-context>
|
|
437
|
+
test("emits a harness-native available-skills block (no <session-start-context>) when bootstrapContent is null", () => {
|
|
306
438
|
const task = buildDispatchTask({
|
|
307
439
|
...baseOpts,
|
|
308
440
|
bootstrapContent: null,
|
|
@@ -310,15 +442,20 @@ describe("buildDispatchTask bootstrap injection", () => {
|
|
|
310
442
|
{ name: "foo", path: "/x/foo/SKILL.md", description: "the foo skill" },
|
|
311
443
|
],
|
|
312
444
|
});
|
|
313
|
-
|
|
314
|
-
expect(task.dispatch_prompt).toContain("
|
|
315
|
-
expect(task.dispatch_prompt).toContain(
|
|
316
|
-
|
|
445
|
+
// Without a bootstrap, there is no SessionStart block — only the skills list.
|
|
446
|
+
expect(task.dispatch_prompt).not.toContain("<session-start-context>");
|
|
447
|
+
expect(task.dispatch_prompt).toContain(
|
|
448
|
+
"The following skills are available for use with the Skill tool:",
|
|
449
|
+
);
|
|
450
|
+
expect(task.dispatch_prompt).toContain("- foo: the foo skill");
|
|
451
|
+
// The eval-flavored wording and custom format are gone.
|
|
452
|
+
expect(task.dispatch_prompt).not.toContain("staged and discoverable");
|
|
453
|
+
expect(task.dispatch_prompt).not.toContain("*Trigger:*");
|
|
317
454
|
// No product framing should appear without a bootstrap file.
|
|
318
455
|
expect(task.dispatch_prompt).not.toContain("loaded at session start");
|
|
319
456
|
});
|
|
320
457
|
|
|
321
|
-
test("
|
|
458
|
+
test("renders the available-skills block as its own section, outside <session-start-context>, after the verbatim bootstrap", () => {
|
|
322
459
|
const task = buildDispatchTask({
|
|
323
460
|
...baseOpts,
|
|
324
461
|
bootstrapContent: "BOOT-LOADED",
|
|
@@ -326,10 +463,18 @@ describe("buildDispatchTask bootstrap injection", () => {
|
|
|
326
463
|
{ name: "foo", path: "/x/foo/SKILL.md", description: "the foo skill" },
|
|
327
464
|
],
|
|
328
465
|
});
|
|
329
|
-
const
|
|
330
|
-
|
|
466
|
+
const prompt = task.dispatch_prompt;
|
|
467
|
+
// The skills list is a separate block, not bundled inside the SessionStart
|
|
468
|
+
// context (which carries bootstrap content only).
|
|
469
|
+
const sscEnd = prompt.indexOf("</session-start-context>");
|
|
470
|
+
const listIdx = prompt.indexOf(
|
|
471
|
+
"The following skills are available for use with the Skill tool:",
|
|
472
|
+
);
|
|
473
|
+
const bootIdx = prompt.indexOf("BOOT-LOADED");
|
|
474
|
+
expect(sscEnd).toBeGreaterThan(-1);
|
|
331
475
|
expect(bootIdx).toBeGreaterThan(-1);
|
|
332
|
-
expect(
|
|
476
|
+
expect(bootIdx).toBeLessThan(sscEnd);
|
|
477
|
+
expect(listIdx).toBeGreaterThan(sscEnd);
|
|
333
478
|
});
|
|
334
479
|
|
|
335
480
|
test("sets dispatch_prompt_path to dispatch-prompt.txt under the condition dir", () => {
|
|
@@ -388,25 +533,41 @@ describe("buildDispatchTask bootstrap injection", () => {
|
|
|
388
533
|
expect(withSkill.dispatch_prompt).toContain("test-driven-development");
|
|
389
534
|
});
|
|
390
535
|
|
|
391
|
-
test("
|
|
536
|
+
test("names the staged slug for disambiguation without instructing invocation", () => {
|
|
392
537
|
const task = buildDispatchTask({
|
|
393
538
|
...baseOpts,
|
|
394
539
|
bootstrapContent: "BOOT-LOADED",
|
|
395
540
|
});
|
|
541
|
+
// The slug is still surfaced so a deliberate invocation targets the staged
|
|
542
|
+
// version and the meta-check can find it — but we no longer assert a plugin
|
|
543
|
+
// is "loaded" or tell the agent to prefer the slug over the bare name, which
|
|
544
|
+
// invited it to hunt for a global copy (issue #144 global-plugin leakage).
|
|
396
545
|
expect(task.dispatch_prompt).toContain(
|
|
397
546
|
"slow-powers-eval-1-with_skill__foo",
|
|
398
547
|
);
|
|
548
|
+
// ...but the over-promoting invoke imperative (issue #119) is gone, so
|
|
549
|
+
// invocation reflects the skill's own triggering rather than an order.
|
|
550
|
+
expect(task.dispatch_prompt).not.toContain("invoke that slug");
|
|
551
|
+
expect(task.dispatch_prompt).not.toContain("if the skill applies");
|
|
552
|
+
expect(task.dispatch_prompt).not.toContain("under evaluation");
|
|
553
|
+
// ...and the leakage-inviting framing is gone (issue #144): no claim that a
|
|
554
|
+
// plugin is loaded, no "use the slug rather than the bare name" contrast.
|
|
555
|
+
expect(task.dispatch_prompt).not.toContain("plugin loaded");
|
|
556
|
+
expect(task.dispatch_prompt).not.toContain("rather than the bare name");
|
|
399
557
|
});
|
|
400
558
|
|
|
401
|
-
test("without-skill condition under realistic env
|
|
559
|
+
test("without-skill condition under realistic env carries no eval-announcing skill commentary", () => {
|
|
402
560
|
const task = buildDispatchTask({
|
|
403
561
|
...baseOpts,
|
|
404
562
|
skillPath: null,
|
|
405
563
|
stagedSkillSlug: null,
|
|
406
564
|
bootstrapContent: "BOOT-LOADED",
|
|
407
565
|
});
|
|
566
|
+
// The arm stays silent about the absent skill: the available-skills block
|
|
567
|
+
// already omits it, so nothing announces that this is an eval control arm.
|
|
408
568
|
expect(task.dispatch_prompt).not.toContain("No skill is loaded");
|
|
409
|
-
expect(task.dispatch_prompt.toLowerCase()).toContain("not available");
|
|
569
|
+
expect(task.dispatch_prompt.toLowerCase()).not.toContain("not available");
|
|
570
|
+
expect(task.dispatch_prompt).not.toContain("under evaluation");
|
|
410
571
|
});
|
|
411
572
|
|
|
412
573
|
test("without-skill condition without bootstrap (e.g. --no-stage) keeps the legacy 'No skill is loaded' wording", () => {
|
|
@@ -420,10 +581,87 @@ describe("buildDispatchTask bootstrap injection", () => {
|
|
|
420
581
|
});
|
|
421
582
|
});
|
|
422
583
|
|
|
584
|
+
describe("buildDispatchTask plan-mode injection", () => {
|
|
585
|
+
const baseOpts = {
|
|
586
|
+
evalId: "e1",
|
|
587
|
+
condition: "with_skill",
|
|
588
|
+
skillPath: null,
|
|
589
|
+
stagedSkillSlug: "slow-powers-eval-1-with_skill__foo" as string | null,
|
|
590
|
+
userPrompt: "BUILD-THE-TODO-APP",
|
|
591
|
+
fixtures: [] as string[],
|
|
592
|
+
outputsDir: "/tmp/out",
|
|
593
|
+
condDir: "/tmp/cond",
|
|
594
|
+
skillName: "foo",
|
|
595
|
+
bootstrapContent: null as string | null,
|
|
596
|
+
availableSkills: [
|
|
597
|
+
{ name: "foo", path: "/x/foo/SKILL.md", description: "the foo skill" },
|
|
598
|
+
] as { name: string; path: string; description: string }[],
|
|
599
|
+
};
|
|
600
|
+
|
|
601
|
+
test("omits the plan-mode block when planModeContent is null/absent", () => {
|
|
602
|
+
const task = buildDispatchTask({ ...baseOpts });
|
|
603
|
+
expect(task.dispatch_prompt).not.toContain("<system-reminder>");
|
|
604
|
+
const withNull = buildDispatchTask({ ...baseOpts, planModeContent: null });
|
|
605
|
+
expect(withNull.dispatch_prompt).not.toContain("<system-reminder>");
|
|
606
|
+
});
|
|
607
|
+
|
|
608
|
+
test("injects the rendered plan-mode block when planModeContent is provided", () => {
|
|
609
|
+
const task = buildDispatchTask({
|
|
610
|
+
...baseOpts,
|
|
611
|
+
planModeContent: "Plan mode is active. PLAN-RAIL-MARKER.",
|
|
612
|
+
});
|
|
613
|
+
expect(task.dispatch_prompt).toContain("<system-reminder>");
|
|
614
|
+
expect(task.dispatch_prompt).toContain("PLAN-RAIL-MARKER.");
|
|
615
|
+
expect(task.dispatch_prompt).toContain("</system-reminder>");
|
|
616
|
+
});
|
|
617
|
+
|
|
618
|
+
test("places the plan-mode block after the available-skills block and before the user request", () => {
|
|
619
|
+
const prompt = buildDispatchTask({
|
|
620
|
+
...baseOpts,
|
|
621
|
+
planModeContent: "PLAN-RAIL-MARKER",
|
|
622
|
+
}).dispatch_prompt;
|
|
623
|
+
const skillsIdx = prompt.indexOf(
|
|
624
|
+
"The following skills are available for use with the Skill tool:",
|
|
625
|
+
);
|
|
626
|
+
const planIdx = prompt.indexOf("<system-reminder>");
|
|
627
|
+
const promptIdx = prompt.indexOf("BUILD-THE-TODO-APP");
|
|
628
|
+
expect(skillsIdx).toBeGreaterThan(-1);
|
|
629
|
+
expect(planIdx).toBeGreaterThan(skillsIdx);
|
|
630
|
+
expect(promptIdx).toBeGreaterThan(planIdx);
|
|
631
|
+
});
|
|
632
|
+
|
|
633
|
+
test("injects an identical plan-mode block in the with- and without-skill arms", () => {
|
|
634
|
+
const planModeContent = "Plan mode is active. PLAN-RAIL-MARKER.";
|
|
635
|
+
const rendered =
|
|
636
|
+
"<system-reminder>\nPlan mode is active. PLAN-RAIL-MARKER.\n</system-reminder>";
|
|
637
|
+
const withSkill = buildDispatchTask({
|
|
638
|
+
...baseOpts,
|
|
639
|
+
condition: "with_skill",
|
|
640
|
+
stagedSkillSlug: "slow-powers-eval-1-with_skill__foo",
|
|
641
|
+
planModeContent,
|
|
642
|
+
});
|
|
643
|
+
const withoutSkill = buildDispatchTask({
|
|
644
|
+
...baseOpts,
|
|
645
|
+
condition: "without_skill",
|
|
646
|
+
skillPath: null,
|
|
647
|
+
stagedSkillSlug: null,
|
|
648
|
+
availableSkills: [],
|
|
649
|
+
planModeContent,
|
|
650
|
+
});
|
|
651
|
+
expect(withSkill.dispatch_prompt).toContain(rendered);
|
|
652
|
+
expect(withoutSkill.dispatch_prompt).toContain(rendered);
|
|
653
|
+
});
|
|
654
|
+
});
|
|
655
|
+
|
|
423
656
|
describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
|
|
424
657
|
const RUN_TS = join(import.meta.dir, "run.ts");
|
|
425
658
|
|
|
426
|
-
function setup(
|
|
659
|
+
function setup(
|
|
660
|
+
name: string,
|
|
661
|
+
evals: Eval[] = [
|
|
662
|
+
{ id: "e1", prompt: "review this MR", expected_output: "a review" },
|
|
663
|
+
],
|
|
664
|
+
): { skillDir: string; cwd: string } {
|
|
427
665
|
const root = join(FIXTURE_ROOT, name);
|
|
428
666
|
const skillDir = join(root, "skill-dir");
|
|
429
667
|
const skillSub = join(skillDir, "mr-review");
|
|
@@ -434,12 +672,7 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
|
|
|
434
672
|
);
|
|
435
673
|
writeFileSync(
|
|
436
674
|
join(skillSub, "evals", "evals.json"),
|
|
437
|
-
JSON.stringify({
|
|
438
|
-
skill_name: "mr-review",
|
|
439
|
-
evals: [
|
|
440
|
-
{ id: "e1", prompt: "review this MR", expected_output: "a review" },
|
|
441
|
-
],
|
|
442
|
-
}),
|
|
675
|
+
JSON.stringify({ skill_name: "mr-review", evals }),
|
|
443
676
|
);
|
|
444
677
|
const cwd = join(root, "work");
|
|
445
678
|
mkdirSync(cwd, { recursive: true });
|
|
@@ -486,6 +719,168 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
|
|
|
486
719
|
expect(entries).toEqual(["slow-powers-eval-1-with_skill__mr-review"]);
|
|
487
720
|
});
|
|
488
721
|
|
|
722
|
+
test("--plan-mode injects the resolved profile into every dispatch and records plan_mode in dispatch.json", () => {
|
|
723
|
+
const { skillDir, cwd } = setup("usermode-plan-mode");
|
|
724
|
+
const res = runCli(
|
|
725
|
+
[
|
|
726
|
+
"--skill-dir",
|
|
727
|
+
skillDir,
|
|
728
|
+
"--skill",
|
|
729
|
+
"mr-review",
|
|
730
|
+
"--mode",
|
|
731
|
+
"new-skill",
|
|
732
|
+
"--plan-mode",
|
|
733
|
+
"--dry-run",
|
|
734
|
+
],
|
|
735
|
+
cwd,
|
|
736
|
+
);
|
|
737
|
+
expect(res.exitCode).toBe(0);
|
|
738
|
+
|
|
739
|
+
const iterationDir = join(
|
|
740
|
+
cwd,
|
|
741
|
+
"skills-workspace",
|
|
742
|
+
"mr-review",
|
|
743
|
+
"iteration-1",
|
|
744
|
+
);
|
|
745
|
+
const dispatch = JSON.parse(
|
|
746
|
+
readFileSync(join(iterationDir, "dispatch.json"), "utf8"),
|
|
747
|
+
) as {
|
|
748
|
+
plan_mode: boolean;
|
|
749
|
+
tasks: Array<{ condition: string; dispatch_prompt_path: string }>;
|
|
750
|
+
};
|
|
751
|
+
expect(dispatch.plan_mode).toBe(true);
|
|
752
|
+
|
|
753
|
+
// Both arms carry the same harness-injected plan-mode operating context.
|
|
754
|
+
for (const t of dispatch.tasks) {
|
|
755
|
+
const prompt = readFileSync(t.dispatch_prompt_path, "utf8");
|
|
756
|
+
expect(prompt).toContain("<system-reminder>");
|
|
757
|
+
expect(prompt).toContain("Plan mode is active");
|
|
758
|
+
expect(prompt).toContain("ExitPlanMode");
|
|
759
|
+
}
|
|
760
|
+
});
|
|
761
|
+
|
|
762
|
+
test("without --plan-mode, dispatch.json records plan_mode:false and no plan-mode block is injected", () => {
|
|
763
|
+
const { skillDir, cwd } = setup("usermode-no-plan-mode");
|
|
764
|
+
const res = runCli(
|
|
765
|
+
[
|
|
766
|
+
"--skill-dir",
|
|
767
|
+
skillDir,
|
|
768
|
+
"--skill",
|
|
769
|
+
"mr-review",
|
|
770
|
+
"--mode",
|
|
771
|
+
"new-skill",
|
|
772
|
+
"--dry-run",
|
|
773
|
+
],
|
|
774
|
+
cwd,
|
|
775
|
+
);
|
|
776
|
+
expect(res.exitCode).toBe(0);
|
|
777
|
+
|
|
778
|
+
const iterationDir = join(
|
|
779
|
+
cwd,
|
|
780
|
+
"skills-workspace",
|
|
781
|
+
"mr-review",
|
|
782
|
+
"iteration-1",
|
|
783
|
+
);
|
|
784
|
+
const dispatch = JSON.parse(
|
|
785
|
+
readFileSync(join(iterationDir, "dispatch.json"), "utf8"),
|
|
786
|
+
) as {
|
|
787
|
+
plan_mode: boolean;
|
|
788
|
+
tasks: Array<{ dispatch_prompt_path: string }>;
|
|
789
|
+
};
|
|
790
|
+
expect(dispatch.plan_mode).toBe(false);
|
|
791
|
+
for (const t of dispatch.tasks) {
|
|
792
|
+
const prompt = readFileSync(t.dispatch_prompt_path, "utf8");
|
|
793
|
+
expect(prompt).not.toContain("<system-reminder>");
|
|
794
|
+
}
|
|
795
|
+
});
|
|
796
|
+
|
|
797
|
+
test("--stage-name stages the SUT under the verbatim name, threads it everywhere, and registers it for cleanup", () => {
|
|
798
|
+
const { skillDir, cwd } = setup("usermode-stage-name");
|
|
799
|
+
const res = runCli(
|
|
800
|
+
[
|
|
801
|
+
"--skill-dir",
|
|
802
|
+
skillDir,
|
|
803
|
+
"--skill",
|
|
804
|
+
"mr-review",
|
|
805
|
+
"--mode",
|
|
806
|
+
"new-skill",
|
|
807
|
+
"--stage-name",
|
|
808
|
+
"mr-review",
|
|
809
|
+
"--dry-run",
|
|
810
|
+
],
|
|
811
|
+
cwd,
|
|
812
|
+
);
|
|
813
|
+
expect(res.exitCode).toBe(0);
|
|
814
|
+
|
|
815
|
+
// Staged dir is the natural name, not the conspicuous eval slug.
|
|
816
|
+
const stagedSkillsDir = join(cwd, ".claude", "skills");
|
|
817
|
+
const entries = readdirSync(stagedSkillsDir).filter(
|
|
818
|
+
(e) => e !== STAGED_SIBLING_MANIFEST,
|
|
819
|
+
);
|
|
820
|
+
expect(entries).toEqual(["mr-review"]);
|
|
821
|
+
|
|
822
|
+
const iterationDir = join(
|
|
823
|
+
cwd,
|
|
824
|
+
"skills-workspace",
|
|
825
|
+
"mr-review",
|
|
826
|
+
"iteration-1",
|
|
827
|
+
);
|
|
828
|
+
|
|
829
|
+
// conditions.json carries the natural slug — the grader meta-check reads it.
|
|
830
|
+
const conditions = JSON.parse(
|
|
831
|
+
readFileSync(join(iterationDir, "conditions.json"), "utf8"),
|
|
832
|
+
) as {
|
|
833
|
+
conditions: Array<{ name: string; staged_skill_slug: string | null }>;
|
|
834
|
+
};
|
|
835
|
+
const withSkill = conditions.conditions.find(
|
|
836
|
+
(c) => c.name === "with_skill",
|
|
837
|
+
);
|
|
838
|
+
expect(withSkill?.staged_skill_slug).toBe("mr-review");
|
|
839
|
+
|
|
840
|
+
// The custom dir is registered for cleanup (prefix scan won't catch it).
|
|
841
|
+
const manifest = JSON.parse(
|
|
842
|
+
readFileSync(join(stagedSkillsDir, STAGED_SIBLING_MANIFEST), "utf8"),
|
|
843
|
+
) as { created_entries: Array<{ name: string }> };
|
|
844
|
+
expect(manifest.created_entries.map((e) => e.name)).toContain("mr-review");
|
|
845
|
+
|
|
846
|
+
// The dispatch prompt disambiguates to the natural identifier, not the slug.
|
|
847
|
+
const dispatch = JSON.parse(
|
|
848
|
+
readFileSync(join(iterationDir, "dispatch.json"), "utf8"),
|
|
849
|
+
) as {
|
|
850
|
+
tasks: Array<{ condition: string; dispatch_prompt_path: string }>;
|
|
851
|
+
};
|
|
852
|
+
const task = dispatch.tasks.find((t) => t.condition === "with_skill");
|
|
853
|
+
const prompt = readFileSync(task?.dispatch_prompt_path ?? "", "utf8");
|
|
854
|
+
expect(prompt).toContain("registered under the identifier `mr-review`");
|
|
855
|
+
expect(prompt).not.toContain("slow-powers-eval-");
|
|
856
|
+
});
|
|
857
|
+
|
|
858
|
+
test("--stage-name refuses to clobber a pre-existing same-named dir", () => {
|
|
859
|
+
const { skillDir, cwd } = setup("usermode-stage-name-clobber");
|
|
860
|
+
const preexisting = join(cwd, ".claude", "skills", "my-real-skill");
|
|
861
|
+
mkdirSync(preexisting, { recursive: true });
|
|
862
|
+
writeFileSync(join(preexisting, "SKILL.md"), "USER OWNED");
|
|
863
|
+
|
|
864
|
+
const res = runCli(
|
|
865
|
+
[
|
|
866
|
+
"--skill-dir",
|
|
867
|
+
skillDir,
|
|
868
|
+
"--skill",
|
|
869
|
+
"mr-review",
|
|
870
|
+
"--mode",
|
|
871
|
+
"new-skill",
|
|
872
|
+
"--stage-name",
|
|
873
|
+
"my-real-skill",
|
|
874
|
+
"--dry-run",
|
|
875
|
+
],
|
|
876
|
+
cwd,
|
|
877
|
+
);
|
|
878
|
+
expect(res.exitCode).not.toBe(0);
|
|
879
|
+
expect(readFileSync(join(preexisting, "SKILL.md"), "utf8")).toBe(
|
|
880
|
+
"USER OWNED",
|
|
881
|
+
);
|
|
882
|
+
});
|
|
883
|
+
|
|
489
884
|
test("dispatch prompt lists only the skill-under-test, no other skills, and no product framing without --bootstrap", () => {
|
|
490
885
|
const { skillDir, cwd } = setup("usermode-prompt");
|
|
491
886
|
const res = runCli(
|
|
@@ -526,8 +921,10 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
|
|
|
526
921
|
// The full prompt is no longer inlined in dispatch.json — it lives in a file.
|
|
527
922
|
expect(withSkill?.dispatch_prompt).toBeUndefined();
|
|
528
923
|
const prompt = readFileSync(withSkill?.dispatch_prompt_path ?? "", "utf8");
|
|
529
|
-
expect(prompt).toContain(
|
|
530
|
-
|
|
924
|
+
expect(prompt).toContain(
|
|
925
|
+
"The following skills are available for use with the Skill tool:",
|
|
926
|
+
);
|
|
927
|
+
expect(prompt).toContain("- mr-review:");
|
|
531
928
|
expect(prompt).not.toContain("test-driven-development");
|
|
532
929
|
expect(prompt).not.toContain("writing-skills");
|
|
533
930
|
// No product framing (EXTREMELY-IMPORTANT etc.) without a --bootstrap file.
|
|
@@ -670,7 +1067,7 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
|
|
|
670
1067
|
expect(conditions.run_nonce).toBe(dispatch.run_nonce);
|
|
671
1068
|
});
|
|
672
1069
|
|
|
673
|
-
test("--bootstrap content is prepended verbatim before the
|
|
1070
|
+
test("--bootstrap content is prepended verbatim before the available-skills block", () => {
|
|
674
1071
|
const { skillDir, cwd } = setup("usermode-bootstrap");
|
|
675
1072
|
const bootstrapPath = join(cwd, "my-bootstrap.md");
|
|
676
1073
|
writeFileSync(bootstrapPath, "MY CUSTOM EVAL FRAMING");
|
|
@@ -709,8 +1106,75 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
|
|
|
709
1106
|
? readFileSync(withSkill.dispatch_prompt_path, "utf8")
|
|
710
1107
|
: "";
|
|
711
1108
|
const bootIdx = prompt.indexOf("MY CUSTOM EVAL FRAMING");
|
|
712
|
-
const
|
|
1109
|
+
const listIdx = prompt.indexOf(
|
|
1110
|
+
"The following skills are available for use with the Skill tool:",
|
|
1111
|
+
);
|
|
713
1112
|
expect(bootIdx).toBeGreaterThan(-1);
|
|
714
|
-
expect(
|
|
1113
|
+
expect(listIdx).toBeGreaterThan(bootIdx);
|
|
1114
|
+
});
|
|
1115
|
+
|
|
1116
|
+
test("--only restricts dispatches to the named eval ids", () => {
|
|
1117
|
+
const { skillDir, cwd } = setup("usermode-only", [
|
|
1118
|
+
{ id: "e1", prompt: "review MR 1", expected_output: "a review" },
|
|
1119
|
+
{ id: "e2", prompt: "review MR 2", expected_output: "a review" },
|
|
1120
|
+
]);
|
|
1121
|
+
const res = runCli(
|
|
1122
|
+
[
|
|
1123
|
+
"--skill-dir",
|
|
1124
|
+
skillDir,
|
|
1125
|
+
"--skill",
|
|
1126
|
+
"mr-review",
|
|
1127
|
+
"--mode",
|
|
1128
|
+
"new-skill",
|
|
1129
|
+
"--only",
|
|
1130
|
+
"e1",
|
|
1131
|
+
"--dry-run",
|
|
1132
|
+
],
|
|
1133
|
+
cwd,
|
|
1134
|
+
);
|
|
1135
|
+
expect(res.exitCode).toBe(0);
|
|
1136
|
+
|
|
1137
|
+
const dispatch = JSON.parse(
|
|
1138
|
+
readFileSync(
|
|
1139
|
+
join(
|
|
1140
|
+
cwd,
|
|
1141
|
+
"skills-workspace",
|
|
1142
|
+
"mr-review",
|
|
1143
|
+
"iteration-1",
|
|
1144
|
+
"dispatch.json",
|
|
1145
|
+
),
|
|
1146
|
+
"utf8",
|
|
1147
|
+
),
|
|
1148
|
+
) as { tasks: Array<{ eval_id: string }> };
|
|
1149
|
+
|
|
1150
|
+
expect(dispatch.tasks.map((t) => t.eval_id).sort()).toEqual(["e1", "e1"]);
|
|
1151
|
+
// The "N evals × 2 conditions" line reflects the filtered set.
|
|
1152
|
+
expect(new TextDecoder().decode(res.stdout)).toContain(
|
|
1153
|
+
"1 evals × 2 conditions",
|
|
1154
|
+
);
|
|
1155
|
+
});
|
|
1156
|
+
|
|
1157
|
+
test("--only with an unknown id exits non-zero and names the unknown id", () => {
|
|
1158
|
+
const { skillDir, cwd } = setup("usermode-only-unknown", [
|
|
1159
|
+
{ id: "e1", prompt: "review MR 1", expected_output: "a review" },
|
|
1160
|
+
]);
|
|
1161
|
+
const res = runCli(
|
|
1162
|
+
[
|
|
1163
|
+
"--skill-dir",
|
|
1164
|
+
skillDir,
|
|
1165
|
+
"--skill",
|
|
1166
|
+
"mr-review",
|
|
1167
|
+
"--mode",
|
|
1168
|
+
"new-skill",
|
|
1169
|
+
"--only",
|
|
1170
|
+
"nope",
|
|
1171
|
+
"--dry-run",
|
|
1172
|
+
],
|
|
1173
|
+
cwd,
|
|
1174
|
+
);
|
|
1175
|
+
expect(res.exitCode).not.toBe(0);
|
|
1176
|
+
expect(new TextDecoder().decode(res.stderr)).toContain(
|
|
1177
|
+
"unknown eval id(s): nope",
|
|
1178
|
+
);
|
|
715
1179
|
});
|
|
716
1180
|
});
|