@slowdini/slow-powers-opencode 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/bootstrap.md +19 -20
- package/package.json +1 -1
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +8 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +2 -2
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +6 -4
- package/skills/evaluating-skills/evals/evals.json +1 -1
- package/skills/evaluating-skills/harness-details/claude.md +24 -1
- package/skills/evaluating-skills/runner/README.md +16 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +56 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +43 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +76 -0
- package/skills/evaluating-skills/runner/aggregate.ts +20 -0
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +228 -0
- package/skills/evaluating-skills/runner/plugin-shadow.ts +201 -0
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +11 -0
- package/skills/evaluating-skills/runner/run.test.ts +488 -24
- package/skills/evaluating-skills/runner/run.ts +281 -66
- package/skills/evaluating-skills/runner/types.ts +8 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +3 -7
- package/skills/finishing-a-development-branch/SKILL.md +1 -1
- package/skills/hardening-plans/evals/baseline/NOTES.md +7 -0
- package/skills/hardening-plans/evals/evals.json +0 -19
- package/skills/systematic-debugging/condition-based-waiting.md +10 -11
- package/skills/systematic-debugging/root-cause-tracing.md +31 -33
- package/skills/working-in-isolation/SKILL.md +58 -0
- package/skills/working-in-isolation/evals/baseline/BASELINE.md +22 -0
- package/skills/working-in-isolation/evals/baseline/NOTES.md +67 -0
- package/skills/working-in-isolation/evals/baseline/benchmark.json +51 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__with_skill.json +46 -0
- package/skills/working-in-isolation/evals/baseline/grading/base-branch-checkout__without_skill.json +31 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/dirty-tree-worktree__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/feature-branch-in-place__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__with_skill.json +39 -0
- package/skills/working-in-isolation/evals/baseline/grading/seeded-on-main-momentum__without_skill.json +24 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__with_skill.json +32 -0
- package/skills/working-in-isolation/evals/baseline/grading/typo-no-worktree__without_skill.json +17 -0
- package/skills/working-in-isolation/evals/evals.json +87 -0
- package/skills/writing-skills/SKILL.md +179 -195
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +0 -24
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +0 -24
- package/skills/using-git-worktrees/SKILL.md +0 -70
- package/skills/using-git-worktrees/evals/evals.json +0 -40
- package/skills/writing-skills/graphviz-conventions.dot +0 -172
- package/skills/writing-skills/scripts/render-graphs.js +0 -181
|
@@ -13,9 +13,23 @@ import {
|
|
|
13
13
|
} from "node:fs";
|
|
14
14
|
import { tmpdir } from "node:os";
|
|
15
15
|
import { basename, dirname, join } from "node:path";
|
|
16
|
-
import {
|
|
16
|
+
import {
|
|
17
|
+
renderAvailableSkillsBlock,
|
|
18
|
+
renderPlanModeContext,
|
|
19
|
+
} from "./adapters/claude-code-session";
|
|
20
|
+
import { detectRunContext, type Harness, type RunContext } from "./context";
|
|
17
21
|
import { installGuard, teardownGuard } from "./guard/install";
|
|
18
|
-
import
|
|
22
|
+
import {
|
|
23
|
+
detectPluginShadows,
|
|
24
|
+
formatShadowBanner,
|
|
25
|
+
resolveConfigDir,
|
|
26
|
+
} from "./plugin-shadow";
|
|
27
|
+
import type {
|
|
28
|
+
AvailableSkill,
|
|
29
|
+
ConditionsRecord,
|
|
30
|
+
Eval,
|
|
31
|
+
EvalsConfig,
|
|
32
|
+
} from "./types";
|
|
19
33
|
import { validateEvalsConfig } from "./validate";
|
|
20
34
|
|
|
21
35
|
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
|
|
@@ -27,14 +41,51 @@ export function stageSkillForCC(opts: {
|
|
|
27
41
|
condition: string;
|
|
28
42
|
skillName: string;
|
|
29
43
|
repoRoot: string;
|
|
44
|
+
/**
|
|
45
|
+
* When set, stage under this verbatim identifier instead of the conspicuous
|
|
46
|
+
* `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
|
|
47
|
+
* against the eval-flagged one (issue #144 Step 2). A custom name is not
|
|
48
|
+
* caught by `cleanupStagedSkills`'s prefix scan, so the caller must also call
|
|
49
|
+
* `registerStagedSkillForCleanup` to have it removed on the next run.
|
|
50
|
+
*/
|
|
51
|
+
stageNameOverride?: string;
|
|
30
52
|
}): string {
|
|
31
|
-
const slug =
|
|
53
|
+
const slug =
|
|
54
|
+
opts.stageNameOverride ??
|
|
55
|
+
`${STAGED_SKILL_PREFIX}${opts.iteration}-${opts.condition}__${opts.skillName}`;
|
|
32
56
|
const skillDir = join(opts.repoRoot, ".claude", "skills", slug);
|
|
33
57
|
mkdirSync(skillDir, { recursive: true });
|
|
34
58
|
writeFileSync(join(skillDir, "SKILL.md"), opts.content);
|
|
35
59
|
return slug;
|
|
36
60
|
}
|
|
37
61
|
|
|
62
|
+
/**
 * Records a custom-named staged skill directory (one created through
 * `stageNameOverride`) in the sibling manifest's `created_entries`, so that a
 * later `cleanupStagedSkills` pass can find it — per the staging notes in this
 * file, the prefix scan alone only matches `slow-powers-eval-…` names.
 *
 * Idempotent: when `name` is already listed, the manifest is left untouched.
 *
 * @param repoRoot Root under which `.claude/skills` lives.
 * @param name     Verbatim directory name of the staged skill to record.
 */
export function registerStagedSkillForCleanup(
  repoRoot: string,
  name: string,
): void {
  const skillsDir = join(repoRoot, ".claude", "skills");
  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);

  // Reuse the manifest on disk when there is one; otherwise start a fresh one
  // that names this skill as the one staged under test.
  const manifest: SiblingManifest = existsSync(manifestPath)
    ? JSON.parse(readFileSync(manifestPath, "utf8"))
    : {
        created_at: new Date().toISOString(),
        staged_under_test: name,
        created_entries: [],
      };

  // JSON.parse is unchecked: tolerate a manifest whose `created_entries` is
  // missing or malformed instead of crashing on `.some`.
  const existingEntries = Array.isArray(manifest.created_entries)
    ? manifest.created_entries
    : [];
  if (existingEntries.some((entry) => entry.name === name)) return;

  const updated = {
    ...manifest,
    created_entries: [...existingEntries, { name, preexisting: false }],
  };

  // The skills dir normally exists already (staging creates it), but this
  // function is exported — make a standalone call safe rather than ENOENT.
  mkdirSync(skillsDir, { recursive: true });
  writeFileSync(manifestPath, `${JSON.stringify(updated, null, 2)}\n`);
}
|
|
88
|
+
|
|
38
89
|
type SiblingManifest = {
|
|
39
90
|
created_at: string;
|
|
40
91
|
staged_under_test: string;
|
|
@@ -139,9 +190,13 @@ type Args = {
|
|
|
139
190
|
baseline?: string;
|
|
140
191
|
label?: string;
|
|
141
192
|
iteration?: number;
|
|
193
|
+
only?: string[];
|
|
194
|
+
skip?: string[];
|
|
142
195
|
dryRun: boolean;
|
|
143
196
|
noStage: boolean;
|
|
144
197
|
guard: boolean;
|
|
198
|
+
stageName?: string;
|
|
199
|
+
planMode: boolean;
|
|
145
200
|
};
|
|
146
201
|
|
|
147
202
|
function die(msg: string): never {
|
|
@@ -176,15 +231,27 @@ function parseArgs(argv: string[]): Args {
|
|
|
176
231
|
if (iteration !== undefined && !Number.isInteger(iteration))
|
|
177
232
|
die(`--iteration must be an integer, got ${iterationFlag}`);
|
|
178
233
|
|
|
234
|
+
const parseIdList = (v: string | undefined): string[] | undefined =>
|
|
235
|
+
v === undefined
|
|
236
|
+
? undefined
|
|
237
|
+
: v
|
|
238
|
+
.split(",")
|
|
239
|
+
.map((s) => s.trim())
|
|
240
|
+
.filter(Boolean);
|
|
241
|
+
|
|
179
242
|
return {
|
|
180
243
|
command,
|
|
181
244
|
mode: flag("mode") as Mode | undefined,
|
|
182
245
|
baseline: flag("baseline"),
|
|
183
246
|
label: flag("label"),
|
|
184
247
|
iteration,
|
|
248
|
+
only: parseIdList(flag("only")),
|
|
249
|
+
skip: parseIdList(flag("skip")),
|
|
185
250
|
dryRun: has("dry-run"),
|
|
186
251
|
noStage: has("no-stage"),
|
|
187
252
|
guard: has("guard"),
|
|
253
|
+
stageName: flag("stage-name"),
|
|
254
|
+
planMode: has("plan-mode"),
|
|
188
255
|
};
|
|
189
256
|
}
|
|
190
257
|
|
|
@@ -273,6 +340,16 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
273
340
|
`warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
|
|
274
341
|
);
|
|
275
342
|
|
|
343
|
+
let selectedEvals: Eval[];
|
|
344
|
+
try {
|
|
345
|
+
selectedEvals = selectEvals(config.evals, {
|
|
346
|
+
only: args.only,
|
|
347
|
+
skip: args.skip,
|
|
348
|
+
});
|
|
349
|
+
} catch (err) {
|
|
350
|
+
die(err instanceof Error ? err.message : String(err));
|
|
351
|
+
}
|
|
352
|
+
|
|
276
353
|
const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
|
|
277
354
|
const iteration = nextIteration(workspaceSkillDir, args.iteration);
|
|
278
355
|
const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);
|
|
@@ -318,6 +395,14 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
318
395
|
);
|
|
319
396
|
console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
|
|
320
397
|
console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
|
|
398
|
+
if (selectedEvals.length !== config.evals.length) {
|
|
399
|
+
const [flagName, ids] = args.only
|
|
400
|
+
? ["--only", args.only]
|
|
401
|
+
: ["--skip", args.skip ?? []];
|
|
402
|
+
console.log(
|
|
403
|
+
` selection: ${selectedEvals.length} of ${config.evals.length} evals (${flagName} ${ids.join(", ")})`,
|
|
404
|
+
);
|
|
405
|
+
}
|
|
321
406
|
if (args.noStage)
|
|
322
407
|
console.log(
|
|
323
408
|
" staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
|
|
@@ -343,6 +428,19 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
343
428
|
const bootstrapContent =
|
|
344
429
|
ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;
|
|
345
430
|
|
|
431
|
+
// `--plan-mode` (issue #142): inject the harness's verbatim plan-mode
|
|
432
|
+
// procedure as an operating-context layer. The profile is a bundled asset
|
|
433
|
+
// resolved relative to this runner (mirroring the guard-script resolution
|
|
434
|
+
// below) and keyed by harness, so a harness without a profile simply has no
|
|
435
|
+
// `--plan-mode` and the portable dispatch contract is unchanged.
|
|
436
|
+
const planModeContent = args.planMode
|
|
437
|
+
? resolvePlanModeProfile(ctx.harness)
|
|
438
|
+
: null;
|
|
439
|
+
if (args.planMode)
|
|
440
|
+
console.log(
|
|
441
|
+
` plan-mode: injecting ${ctx.harness} plan-mode profile as operating context (issue #142; necessary-not-sufficient fidelity layer)`,
|
|
442
|
+
);
|
|
443
|
+
|
|
346
444
|
// Sibling skill metadata, shared across conditions. Empty when --no-stage
|
|
347
445
|
// (nothing is staged, so nothing is discoverable to list).
|
|
348
446
|
const siblingSkills: AvailableSkill[] = args.noStage
|
|
@@ -352,6 +450,26 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
352
450
|
return { name, path: p, description: getSkillDescription(p) };
|
|
353
451
|
});
|
|
354
452
|
|
|
453
|
+
// `--stage-name` overrides the conspicuous `slow-powers-eval-…` slug with a
|
|
454
|
+
// verbatim name (issue #144 Step 2: A/B a natural name against the eval slug).
|
|
455
|
+
// It targets the single staging condition, so reject the case where both
|
|
456
|
+
// conditions stage (e.g. revision mode) — one name can't cover two dirs — and
|
|
457
|
+
// refuse to clobber a dir that already exists (a real project skill the user
|
|
458
|
+
// owns; cleanup has already removed our own prior custom dirs by this point).
|
|
459
|
+
if (args.stageName !== undefined && !args.noStage) {
|
|
460
|
+
if (skillPathForA !== null && skillPathForB !== null) {
|
|
461
|
+
die(
|
|
462
|
+
"--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
|
|
463
|
+
);
|
|
464
|
+
}
|
|
465
|
+
const target = join(ctx.stageRoot, ".claude", "skills", args.stageName);
|
|
466
|
+
if (existsSync(target)) {
|
|
467
|
+
die(
|
|
468
|
+
`--stage-name "${args.stageName}": ${target} already exists; refusing to clobber it. Remove it or choose a different name.`,
|
|
469
|
+
);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
355
473
|
const stageFor = (
|
|
356
474
|
condName: string,
|
|
357
475
|
condSkillPath: string | null,
|
|
@@ -363,12 +481,22 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
363
481
|
condition: condName,
|
|
364
482
|
skillName: ctx.skillName,
|
|
365
483
|
repoRoot: ctx.stageRoot,
|
|
484
|
+
stageNameOverride: args.stageName,
|
|
366
485
|
});
|
|
367
486
|
};
|
|
368
487
|
|
|
369
488
|
const conditionASlug = stageFor(conditionA, skillPathForA);
|
|
370
489
|
const conditionBSlug = stageFor(conditionB, skillPathForB);
|
|
371
490
|
|
|
491
|
+
// A custom-named dir isn't caught by cleanupStagedSkills's prefix scan; record
|
|
492
|
+
// it in the sibling manifest so the next run removes it.
|
|
493
|
+
if (
|
|
494
|
+
args.stageName !== undefined &&
|
|
495
|
+
(conditionASlug === args.stageName || conditionBSlug === args.stageName)
|
|
496
|
+
) {
|
|
497
|
+
registerStagedSkillForCleanup(ctx.stageRoot, args.stageName);
|
|
498
|
+
}
|
|
499
|
+
|
|
372
500
|
const conditions: ConditionsRecord = {
|
|
373
501
|
mode: args.mode,
|
|
374
502
|
baseline: args.baseline,
|
|
@@ -408,7 +536,7 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
408
536
|
};
|
|
409
537
|
|
|
410
538
|
const tasks: DispatchTask[] = [];
|
|
411
|
-
for (const ev of
|
|
539
|
+
for (const ev of selectedEvals) {
|
|
412
540
|
const evalDir = join(iterationDir, `eval-${ev.id}`);
|
|
413
541
|
ensureDir(evalDir);
|
|
414
542
|
|
|
@@ -432,6 +560,7 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
432
560
|
outputsDir,
|
|
433
561
|
condDir,
|
|
434
562
|
bootstrapContent,
|
|
563
|
+
planModeContent,
|
|
435
564
|
skillName: ctx.skillName,
|
|
436
565
|
availableSkills: availableSkillsFor(condSkillPath),
|
|
437
566
|
runTag,
|
|
@@ -467,6 +596,7 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
467
596
|
iteration_dir: iterationDir,
|
|
468
597
|
mode: args.mode,
|
|
469
598
|
baseline: args.baseline ?? null,
|
|
599
|
+
plan_mode: args.planMode,
|
|
470
600
|
conditions: conditions.conditions,
|
|
471
601
|
harness: ctx.harness,
|
|
472
602
|
tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
|
|
@@ -495,11 +625,28 @@ function commandRun(args: Args, ctx: RunContext): void {
|
|
|
495
625
|
}
|
|
496
626
|
}
|
|
497
627
|
|
|
628
|
+
// Plugin-shadow preflight (Claude Code): a staged skill name that is also
|
|
629
|
+
// discoverable from an enabled plugin or the global skills dir contaminates the
|
|
630
|
+
// run — subagents inherit this session's plugins, so both copies are reachable.
|
|
631
|
+
// The runner can't unload a plugin from a live session; it only flags it. The
|
|
632
|
+
// report is persisted so the aggregator can surface it in validity_warnings.
|
|
633
|
+
if (ctx.harness === "claude-code") {
|
|
634
|
+
const shadowReport = detectPluginShadows({
|
|
635
|
+
configDir: resolveConfigDir(),
|
|
636
|
+
cwd: ctx.stageRoot,
|
|
637
|
+
stagedSkillNames: [ctx.skillName, ...ctx.siblingSkillNames],
|
|
638
|
+
});
|
|
639
|
+
if (shadowReport.shadowed.length > 0) {
|
|
640
|
+
writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);
|
|
641
|
+
console.warn(formatShadowBanner(shadowReport));
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
|
|
498
645
|
console.log(`\nWorkspace prepared: ${iterationDir}`);
|
|
499
646
|
console.log(`Dispatch manifest: ${manifestPath}`);
|
|
500
647
|
console.log(`Dispatch tasks: ${dispatchJsonPath}`);
|
|
501
648
|
console.log(
|
|
502
|
-
`\n${tasks.length} dispatches required (${
|
|
649
|
+
`\n${tasks.length} dispatches required (${selectedEvals.length} evals × 2 conditions).`,
|
|
503
650
|
);
|
|
504
651
|
|
|
505
652
|
if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
|
|
@@ -531,11 +678,40 @@ type DispatchTask = {
|
|
|
531
678
|
dispatch_prompt: string;
|
|
532
679
|
};
|
|
533
680
|
|
|
534
|
-
export type AvailableSkill
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
681
|
+
export type { AvailableSkill } from "./types";
|
|
682
|
+
|
|
683
|
+
/**
 * Narrows the configured evals to the subset requested via `--only` / `--skip`.
 *
 * The two flags are mutually exclusive. Every requested id must exist in the
 * config, so a typo'd id is caught up front rather than silently producing an
 * empty or surprising run. For the same reason a `--skip` list that would
 * exclude every eval is rejected instead of returning an empty selection.
 *
 * @param evals All evals from the config, in config order.
 * @param opts  `only` keeps just the listed ids; `skip` drops the listed ids.
 * @returns The selected evals in the config's order (not the order the ids
 *          were passed). With neither flag set, `evals` is returned as-is.
 * @throws Error when both flags are given, when the id list is empty, when an
 *         id is not present in the config, or when nothing would remain to
 *         run. The caller routes the message to `die`.
 */
export function selectEvals(
  evals: Eval[],
  opts: { only?: string[]; skip?: string[] },
): Eval[] {
  if (opts.only && opts.skip)
    throw new Error("use only one of --only / --skip, not both");
  const requested = opts.only ?? opts.skip;
  if (requested === undefined) return evals;
  if (requested.length === 0)
    throw new Error("--only/--skip requires at least one eval id");

  const known = new Set(evals.map((e) => e.id));
  const wanted = new Set(requested);
  // Iterate the de-duplicated set so `--only typo,typo` reports the id once.
  const unknown = [...wanted].filter((id) => !known.has(id));
  if (unknown.length > 0)
    throw new Error(
      `unknown eval id(s): ${unknown.join(", ")}. ` +
        `Available ids: ${[...known].join(", ")}`,
    );

  // `--only` keeps members of the set, `--skip` keeps non-members; filtering
  // the config list (rather than mapping the ids) preserves config order.
  const keepMembers = opts.only !== undefined;
  const selected = evals.filter((e) => wanted.has(e.id) === keepMembers);

  // Only reachable via `--skip` naming every eval: refuse the empty run.
  if (selected.length === 0)
    throw new Error(
      `--skip ${[...wanted].join(", ")} excludes every eval; nothing left to run`,
    );
  return selected;
}
|
|
539
715
|
|
|
540
716
|
function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
|
|
541
717
|
if (!ev.files || ev.files.length === 0) return [];
|
|
@@ -553,6 +729,32 @@ function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
|
|
|
553
729
|
return copied;
|
|
554
730
|
}
|
|
555
731
|
|
|
732
|
+
/**
 * Loads the verbatim plan-mode procedure profile for `harness` (issue #142).
 *
 * Profiles are bundled next to this runner module and looked up at
 * `profiles/<harness>/plan-mode.md` relative to `import.meta.dir`. A missing
 * profile is fatal: the function hands `die` a message naming the expected
 * path instead of quietly returning nothing.
 *
 * @param harness Harness identifier, used verbatim as the profile directory.
 * @returns The profile file's contents, read as UTF-8.
 */
function resolvePlanModeProfile(harness: Harness): string {
  const relativeParts = ["profiles", harness, "plan-mode.md"];
  const profilePath = join(import.meta.dir, ...relativeParts);

  // Happy path first: the profile is present, return it as-is.
  if (existsSync(profilePath)) return readFileSync(profilePath, "utf8");

  const missingProfileMessage = [
    `--plan-mode: no plan-mode profile exists for harness '${harness}' `,
    `(expected ${profilePath}). This is a Claude-tier fidelity layer; a `,
    "harness without a profile leaves the portable dispatch contract unchanged.",
  ].join("");
  // `die` is declared `never`, so this return only satisfies the signature.
  return die(missingProfileMessage);
}
|
|
757
|
+
|
|
556
758
|
function getSkillDescription(skillPath: string): string {
|
|
557
759
|
try {
|
|
558
760
|
const content = readFileSync(skillPath, "utf8");
|
|
@@ -611,6 +813,15 @@ export function buildDispatchTask(opts: {
|
|
|
611
813
|
outputsDir: string;
|
|
612
814
|
condDir: string;
|
|
613
815
|
bootstrapContent: string | null;
|
|
816
|
+
/**
|
|
817
|
+
* Verbatim plan-mode procedure profile (from
|
|
818
|
+
* `profiles/<harness>/plan-mode.md`) to inject as an operating-context layer,
|
|
819
|
+
* or null/undefined to omit it. Skill-agnostic, so it is identical across the
|
|
820
|
+
* with/without-skill arms and needs no redaction. Set by the `--plan-mode`
|
|
821
|
+
* flag (issue #142): the highest-fidelity in-runner approximation of a real
|
|
822
|
+
* plan mode, still text the agent reads — a necessary-not-sufficient signal.
|
|
823
|
+
*/
|
|
824
|
+
planModeContent?: string | null;
|
|
614
825
|
skillName: string;
|
|
615
826
|
availableSkills: AvailableSkill[];
|
|
616
827
|
/**
|
|
@@ -626,11 +837,16 @@ export function buildDispatchTask(opts: {
|
|
|
626
837
|
|
|
627
838
|
let skillBlock: string;
|
|
628
839
|
if (opts.stagedSkillSlug) {
|
|
840
|
+
// Neutral slug disambiguation only — no imperative to invoke. The skill is
|
|
841
|
+
// staged under a unique slug; surface that identifier so a deliberate
|
|
842
|
+
// invocation targets the staged copy and the __skill_invoked meta-check can
|
|
843
|
+
// find it. Do NOT assert a plugin is "loaded" or tell the agent to prefer the
|
|
844
|
+
// slug "rather than the bare name": in an isolated run there is no global copy,
|
|
845
|
+
// and that framing invited the agent to hunt for one (issue #144 global-plugin
|
|
846
|
+
// leakage). Whether to invoke is left to the skill's own triggering (dropping
|
|
847
|
+
// the old "invoke if it applies" directive was the issue #119 ceiling fix).
|
|
629
848
|
skillBlock = [
|
|
630
|
-
|
|
631
|
-
"discoverable via the Skill tool. The skill currently under evaluation is",
|
|
632
|
-
`staged under the unique slug "${opts.stagedSkillSlug}" — invoke that slug rather`,
|
|
633
|
-
"than the natural name if the skill applies to the user's request.",
|
|
849
|
+
`The \`${opts.skillName}\` skill is registered under the identifier \`${opts.stagedSkillSlug}\` and is discoverable via the Skill tool. If you invoke it, use that identifier.`,
|
|
634
850
|
].join("\n");
|
|
635
851
|
} else if (opts.skillPath) {
|
|
636
852
|
skillBlock = [
|
|
@@ -641,11 +857,11 @@ export function buildDispatchTask(opts: {
|
|
|
641
857
|
"</skill>",
|
|
642
858
|
].join("\n");
|
|
643
859
|
} else if (stagedSkills.length > 0 || opts.bootstrapContent) {
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
860
|
+
// Skill-absent arm in a realistic environment: stay silent. The
|
|
861
|
+
// available-skills block already omits the skill-under-test, so any
|
|
862
|
+
// commentary here would only announce the eval (and, in the control arm,
|
|
863
|
+
// draw attention to the very skill that is supposed to be absent).
|
|
864
|
+
skillBlock = "";
|
|
649
865
|
} else {
|
|
650
866
|
skillBlock = "No skill is loaded. Respond as you naturally would.";
|
|
651
867
|
}
|
|
@@ -654,73 +870,72 @@ export function buildDispatchTask(opts: {
|
|
|
654
870
|
? `Available fixture files:\n${opts.fixtures.map((f) => ` - ${f}`).join("\n")}`
|
|
655
871
|
: "Available fixture files: none";
|
|
656
872
|
|
|
657
|
-
//
|
|
658
|
-
//
|
|
659
|
-
//
|
|
873
|
+
// A dispatch mirrors a real session by carrying two *separate* surfaces, the
|
|
874
|
+
// way the harness actually delivers them:
|
|
875
|
+
// 1. The verbatim --bootstrap file (the SessionStart-hook equivalent),
|
|
876
|
+
// wrapped in <session-start-context>, if supplied.
|
|
877
|
+
// 2. The list of discoverable skills, rendered in the harness's native
|
|
878
|
+
// presentation as its own block (see adapters/claude-code-session.ts).
|
|
660
879
|
// A condition that does not load the skill-under-test (the new-skill
|
|
661
880
|
// `without_skill` arm, under staging or --no-stage) must carry zero reference
|
|
662
|
-
// to it
|
|
663
|
-
//
|
|
881
|
+
// to it. The skill-under-test is auto-omitted from the available-skills block
|
|
882
|
+
// (see `availableSkillsFor`). redactSkillFromBootstrap covers the other path:
|
|
883
|
+
// a *user-supplied* --bootstrap that names the skill in its own prose would
|
|
884
|
+
// otherwise leak it into the control arm. (The shipped bootstrap.md no longer
|
|
885
|
+
// enumerates skills, so that redaction is a no-op against it.)
|
|
664
886
|
const skillAbsent = !opts.skillPath && !opts.stagedSkillSlug;
|
|
665
887
|
const effectiveBootstrap =
|
|
666
888
|
opts.bootstrapContent && skillAbsent
|
|
667
889
|
? redactSkillFromBootstrap(opts.bootstrapContent, opts.skillName)
|
|
668
890
|
: opts.bootstrapContent;
|
|
669
891
|
|
|
670
|
-
const
|
|
892
|
+
const sections: string[] = [];
|
|
671
893
|
if (effectiveBootstrap) {
|
|
672
|
-
|
|
894
|
+
sections.push(
|
|
673
895
|
[
|
|
896
|
+
"<session-start-context>",
|
|
674
897
|
"The following guidelines were loaded at session start by the slow-powers plugin",
|
|
675
898
|
"(equivalent to the SessionStart hook firing in a real user's environment):",
|
|
676
899
|
"",
|
|
677
900
|
effectiveBootstrap.trim(),
|
|
678
|
-
].join("\n"),
|
|
679
|
-
);
|
|
680
|
-
}
|
|
681
|
-
if (stagedSkills.length > 0) {
|
|
682
|
-
const inventoryLines = stagedSkills.map(
|
|
683
|
-
(s) => `* \`${s.name}\`\n * *Trigger:* ${s.description}`,
|
|
684
|
-
);
|
|
685
|
-
startContextParts.push(
|
|
686
|
-
[
|
|
687
|
-
"The following skills are staged and discoverable in this eval environment:",
|
|
688
|
-
"",
|
|
689
|
-
...inventoryLines,
|
|
690
|
-
].join("\n"),
|
|
691
|
-
);
|
|
692
|
-
}
|
|
693
|
-
|
|
694
|
-
const sections: string[] = [];
|
|
695
|
-
if (startContextParts.length > 0) {
|
|
696
|
-
sections.push(
|
|
697
|
-
[
|
|
698
|
-
"<session-start-context>",
|
|
699
|
-
startContextParts.join("\n\n"),
|
|
700
901
|
"</session-start-context>",
|
|
701
902
|
"",
|
|
702
903
|
].join("\n"),
|
|
703
904
|
);
|
|
704
905
|
}
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
906
|
+
const availableSkillsBlock = renderAvailableSkillsBlock(stagedSkills);
|
|
907
|
+
if (availableSkillsBlock) {
|
|
908
|
+
sections.push(`${availableSkillsBlock}\n\n`);
|
|
909
|
+
}
|
|
910
|
+
// Plan-mode operating context (issue #142). Injected as its own block after
|
|
911
|
+
// the session-start surfaces and before the eval task framing, so it reads as
|
|
912
|
+
// a session-level mode active for this turn — layered the way the real harness
|
|
913
|
+
// delivers it, not as seed prose. Skill-agnostic: identical in both arms.
|
|
914
|
+
const planModeBlock = opts.planModeContent
|
|
915
|
+
? renderPlanModeContext(opts.planModeContent)
|
|
916
|
+
: "";
|
|
917
|
+
if (planModeBlock) {
|
|
918
|
+
sections.push(`${planModeBlock}\n\n`);
|
|
919
|
+
}
|
|
920
|
+
const taskLines = [
|
|
921
|
+
"You are executing a single test case for a skill evaluation framework.",
|
|
922
|
+
"Treat this as a real user request — do NOT optimize behavior for the eval.",
|
|
923
|
+
];
|
|
924
|
+
if (skillBlock) taskLines.push("", skillBlock);
|
|
925
|
+
taskLines.push(
|
|
926
|
+
"",
|
|
927
|
+
fixturesBlock,
|
|
928
|
+
`Output directory: ${opts.outputsDir}`,
|
|
929
|
+
"",
|
|
930
|
+
"Instructions:",
|
|
931
|
+
"- Write any files you produce into the output directory.",
|
|
932
|
+
`- After completing the task, write your final user-facing response to ${opts.outputsDir}/final-message.md.`,
|
|
933
|
+
"- Do not write outside the output directory.",
|
|
934
|
+
"",
|
|
935
|
+
"User request:",
|
|
936
|
+
opts.userPrompt,
|
|
723
937
|
);
|
|
938
|
+
sections.push(taskLines.join("\n"));
|
|
724
939
|
|
|
725
940
|
return {
|
|
726
941
|
eval_id: opts.evalId,
|
|
@@ -35,6 +35,14 @@ export type EvalsConfig = {
|
|
|
35
35
|
evals: Eval[];
|
|
36
36
|
};
|
|
37
37
|
|
|
38
|
+
/**
 * One skill that has been staged for an eval and is therefore discoverable
 * by the agent under test.
 */
export type AvailableSkill = {
  /** The skill's natural name. */
  name: string;
  /** On-disk path to the skill's SKILL.md. */
  path: string;
  /** Description taken from the SKILL.md frontmatter. */
  description: string;
};
|
|
45
|
+
|
|
38
46
|
export type ConditionEntry = {
|
|
39
47
|
name: string;
|
|
40
48
|
skill_path: string | null;
|
|
@@ -32,19 +32,15 @@ You are executing a single test case for a skill evaluation framework.
|
|
|
32
32
|
Treat this as a real user request — do NOT optimize your behavior for the eval.
|
|
33
33
|
|
|
34
34
|
{{#if staged_skill_slug}}
|
|
35
|
-
|
|
36
|
-
discoverable via the Skill tool.
|
|
37
|
-
|
|
38
|
-
than the natural name if the skill applies to the user's request.
|
|
35
|
+
The `{{skill_name}}` skill is registered under the identifier
|
|
36
|
+
"{{staged_skill_slug}}" and is discoverable via the Skill tool. If you invoke it,
|
|
37
|
+
use that identifier.
|
|
39
38
|
{{else if skill_path}}
|
|
40
39
|
The following skill is loaded into your operating guidelines. Apply it where relevant.
|
|
41
40
|
<skill name="{{skill_name}}">
|
|
42
41
|
{{skill_content}}
|
|
43
42
|
</skill>
|
|
44
43
|
{{else if bootstrap_content}}
|
|
45
|
-
The skill currently under evaluation is NOT available in this environment.
|
|
46
|
-
Other skills from the plugin remain discoverable via the Skill tool; apply any
|
|
47
|
-
that fit the user's request.
|
|
48
44
|
{{else}}
|
|
49
45
|
No skill is loaded. Respond as you naturally would.
|
|
50
46
|
{{/if}}
|
|
@@ -85,7 +85,7 @@ git branch -D <feature-branch>
|
|
|
85
85
|
|
|
86
86
|
### Step 5: Clean Up Git Worktrees (Options 1 & 4 only)
|
|
87
87
|
|
|
88
|
-
> **REQUIRED BACKGROUND:** You must understand `slow-powers:
|
|
88
|
+
> **REQUIRED BACKGROUND:** You must understand `slow-powers:working-in-isolation` for workspace isolation and worktree management.
|
|
89
89
|
|
|
90
90
|
If the workspace is a worktree that you created (under `.worktrees/`, `worktrees/`, or `~/.config/slow-powers/worktrees/`), clean it up from the main repository root:
|
|
91
91
|
```bash
|
|
@@ -51,6 +51,13 @@ Re-adding the one-line debugging route to "When NOT to Use" would very likely
|
|
|
51
51
|
restore the negative guard at near-zero risk to the reframe. Deferred as a
|
|
52
52
|
separate change so 3b stays one-problem-per-PR; left to the maintainer's call.
|
|
53
53
|
|
|
54
|
+
**Update (2026-06-01):** `csv-parser-bug-no-plan` was retired per #111 — the
|
|
55
|
+
over-trigger guard went red on correct consult-then-defer behavior (the agent
|
|
56
|
+
loads `hardening-plans`, reads "When NOT to Use," and correctly routes to
|
|
57
|
+
`systematic-debugging`, which the strict rubric scored as a failure). Its grading
|
|
58
|
+
files were removed; the table and `benchmark.json` below still reflect the
|
|
59
|
+
original 3-case run as it happened.
|
|
60
|
+
|
|
54
61
|
## Provenance / scope
|
|
55
62
|
|
|
56
63
|
3-case cost-conscious subset (the runner has no per-case selector — tracked as a
|
|
@@ -90,25 +90,6 @@
|
|
|
90
90
|
"rubric": "Does the response make a real hand-off to the implementation skill — explicitly naming `slow-powers:test-driven-development` as the required next skill for the implementation phase — rather than merely dropping 'TDD' as an unexplained label or parenthetical? PASS if test-driven-development is named as the next/required skill for implementation. FAIL if 'TDD' appears only as a bare label with no hand-off, or is absent. (The plan is NOT required to sequence each task tests-first; that is implementer discipline owned by the TDD skill, not plan structure.)"
|
|
91
91
|
}
|
|
92
92
|
]
|
|
93
|
-
},
|
|
94
|
-
{
|
|
95
|
-
"id": "csv-parser-bug-no-plan",
|
|
96
|
-
"prompt": "Our users are reporting that the CSV parser is skipping rows with empty email fields instead of using the fallback 'anonymous' label. Can we fix this issue in csv-parser.ts?",
|
|
97
|
-
"expected_output": "The agent should immediately trigger `systematic-debugging` (root-cause analysis) rather than loading `hardening-plans` or writing an implementation plan.",
|
|
98
|
-
"skill_should_trigger": false,
|
|
99
|
-
"assertions": [
|
|
100
|
-
{
|
|
101
|
-
"id": "loads_systematic_debugging_skill",
|
|
102
|
-
"type": "transcript_check",
|
|
103
|
-
"check": "tool_invocation_matches",
|
|
104
|
-
"pattern": "(Skill.*systematic-debugging|view_file.*systematic-debugging)"
|
|
105
|
-
},
|
|
106
|
-
{
|
|
107
|
-
"id": "invokes_debugging_only",
|
|
108
|
-
"type": "llm_judge",
|
|
109
|
-
"rubric": "Did the agent identify that this is a systematic debugging task and load the `systematic-debugging` skill, while completely bypassing the `hardening-plans` skill and avoiding writing a full implementation plan? PASS if the agent focused on debugging and did not load planning guidelines. FAIL if the agent attempted to load `hardening-plans` or write a general implementation plan before debugging the root cause."
|
|
110
|
-
}
|
|
111
|
-
]
|
|
112
93
|
}
|
|
113
94
|
]
|
|
114
95
|
}
|
|
@@ -8,17 +8,16 @@ Flaky tests often guess at timing with arbitrary delays. This creates race condi
|
|
|
8
8
|
|
|
9
9
|
## When to Use
|
|
10
10
|
|
|
11
|
-
```
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
}
|
|
11
|
+
```mermaid
|
|
12
|
+
flowchart TD
|
|
13
|
+
sleep{Test uses setTimeout/sleep?}
|
|
14
|
+
timing{Testing timing behavior?}
|
|
15
|
+
document[Document WHY timeout needed]
|
|
16
|
+
use[Use condition-based waiting]
|
|
17
|
+
|
|
18
|
+
sleep -->|yes| timing
|
|
19
|
+
timing -->|yes| document
|
|
20
|
+
timing -->|no| use
|
|
22
21
|
```
|
|
23
22
|
|
|
24
23
|
**Use when:**
|