@slowdini/slow-powers-opencode 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,9 +13,23 @@ import {
13
13
  } from "node:fs";
14
14
  import { tmpdir } from "node:os";
15
15
  import { basename, dirname, join } from "node:path";
16
- import { detectRunContext, type RunContext } from "./context";
16
+ import {
17
+ renderAvailableSkillsBlock,
18
+ renderPlanModeContext,
19
+ } from "./adapters/claude-code-session";
20
+ import { detectRunContext, type Harness, type RunContext } from "./context";
17
21
  import { installGuard, teardownGuard } from "./guard/install";
18
- import type { ConditionsRecord, Eval, EvalsConfig } from "./types";
22
+ import {
23
+ detectPluginShadows,
24
+ formatShadowBanner,
25
+ resolveConfigDir,
26
+ } from "./plugin-shadow";
27
+ import type {
28
+ AvailableSkill,
29
+ ConditionsRecord,
30
+ Eval,
31
+ EvalsConfig,
32
+ } from "./types";
19
33
  import { validateEvalsConfig } from "./validate";
20
34
 
21
35
  export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
@@ -27,14 +41,51 @@ export function stageSkillForCC(opts: {
27
41
  condition: string;
28
42
  skillName: string;
29
43
  repoRoot: string;
44
+ /**
45
+ * When set, stage under this verbatim identifier instead of the conspicuous
46
+ * `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
47
+ * against the eval-flagged one (issue #144 Step 2). A custom name is not
48
+ * caught by `cleanupStagedSkills`'s prefix scan, so the caller must also call
49
+ * `registerStagedSkillForCleanup` to have it removed on the next run.
50
+ */
51
+ stageNameOverride?: string;
30
52
  }): string {
31
- const slug = `${STAGED_SKILL_PREFIX}${opts.iteration}-${opts.condition}__${opts.skillName}`;
53
+ const slug =
54
+ opts.stageNameOverride ??
55
+ `${STAGED_SKILL_PREFIX}${opts.iteration}-${opts.condition}__${opts.skillName}`;
32
56
  const skillDir = join(opts.repoRoot, ".claude", "skills", slug);
33
57
  mkdirSync(skillDir, { recursive: true });
34
58
  writeFileSync(join(skillDir, "SKILL.md"), opts.content);
35
59
  return slug;
36
60
  }
37
61
 
62
+ /**
63
+ * Adds a custom-named staged skill dir (one created via `stageNameOverride`) to
64
+ * the sibling manifest's `created_entries` so the next run's
65
+ * `cleanupStagedSkills` removes it — the prefix scan only catches
66
+ * `slow-powers-eval-…` names. Idempotent: a name already recorded is left alone.
67
+ */
68
+ export function registerStagedSkillForCleanup(
69
+ repoRoot: string,
70
+ name: string,
71
+ ): void {
72
+ const skillsDir = join(repoRoot, ".claude", "skills");
73
+ const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
74
+ let manifest: SiblingManifest;
75
+ if (existsSync(manifestPath)) {
76
+ manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
77
+ } else {
78
+ manifest = {
79
+ created_at: new Date().toISOString(),
80
+ staged_under_test: name,
81
+ created_entries: [],
82
+ };
83
+ }
84
+ if (manifest.created_entries.some((e) => e.name === name)) return;
85
+ manifest.created_entries.push({ name, preexisting: false });
86
+ writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
87
+ }
88
+
38
89
  type SiblingManifest = {
39
90
  created_at: string;
40
91
  staged_under_test: string;
@@ -139,9 +190,13 @@ type Args = {
139
190
  baseline?: string;
140
191
  label?: string;
141
192
  iteration?: number;
193
+ only?: string[];
194
+ skip?: string[];
142
195
  dryRun: boolean;
143
196
  noStage: boolean;
144
197
  guard: boolean;
198
+ stageName?: string;
199
+ planMode: boolean;
145
200
  };
146
201
 
147
202
  function die(msg: string): never {
@@ -176,15 +231,27 @@ function parseArgs(argv: string[]): Args {
176
231
  if (iteration !== undefined && !Number.isInteger(iteration))
177
232
  die(`--iteration must be an integer, got ${iterationFlag}`);
178
233
 
234
+ const parseIdList = (v: string | undefined): string[] | undefined =>
235
+ v === undefined
236
+ ? undefined
237
+ : v
238
+ .split(",")
239
+ .map((s) => s.trim())
240
+ .filter(Boolean);
241
+
179
242
  return {
180
243
  command,
181
244
  mode: flag("mode") as Mode | undefined,
182
245
  baseline: flag("baseline"),
183
246
  label: flag("label"),
184
247
  iteration,
248
+ only: parseIdList(flag("only")),
249
+ skip: parseIdList(flag("skip")),
185
250
  dryRun: has("dry-run"),
186
251
  noStage: has("no-stage"),
187
252
  guard: has("guard"),
253
+ stageName: flag("stage-name"),
254
+ planMode: has("plan-mode"),
188
255
  };
189
256
  }
190
257
 
@@ -273,6 +340,16 @@ function commandRun(args: Args, ctx: RunContext): void {
273
340
  `warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
274
341
  );
275
342
 
343
+ let selectedEvals: Eval[];
344
+ try {
345
+ selectedEvals = selectEvals(config.evals, {
346
+ only: args.only,
347
+ skip: args.skip,
348
+ });
349
+ } catch (err) {
350
+ die(err instanceof Error ? err.message : String(err));
351
+ }
352
+
276
353
  const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
277
354
  const iteration = nextIteration(workspaceSkillDir, args.iteration);
278
355
  const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);
@@ -318,6 +395,14 @@ function commandRun(args: Args, ctx: RunContext): void {
318
395
  );
319
396
  console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
320
397
  console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
398
+ if (selectedEvals.length !== config.evals.length) {
399
+ const [flagName, ids] = args.only
400
+ ? ["--only", args.only]
401
+ : ["--skip", args.skip ?? []];
402
+ console.log(
403
+ ` selection: ${selectedEvals.length} of ${config.evals.length} evals (${flagName} ${ids.join(", ")})`,
404
+ );
405
+ }
321
406
  if (args.noStage)
322
407
  console.log(
323
408
  " staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
@@ -343,6 +428,19 @@ function commandRun(args: Args, ctx: RunContext): void {
343
428
  const bootstrapContent =
344
429
  ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;
345
430
 
431
+ // `--plan-mode` (issue #142): inject the harness's verbatim plan-mode
432
+ // procedure as an operating-context layer. The profile is a bundled asset
433
+ // resolved relative to this runner (mirroring the guard-script resolution
434
+ // below) and keyed by harness, so a harness without a profile simply has no
435
+ // `--plan-mode` and the portable dispatch contract is unchanged.
436
+ const planModeContent = args.planMode
437
+ ? resolvePlanModeProfile(ctx.harness)
438
+ : null;
439
+ if (args.planMode)
440
+ console.log(
441
+ ` plan-mode: injecting ${ctx.harness} plan-mode profile as operating context (issue #142; necessary-not-sufficient fidelity layer)`,
442
+ );
443
+
346
444
  // Sibling skill metadata, shared across conditions. Empty when --no-stage
347
445
  // (nothing is staged, so nothing is discoverable to list).
348
446
  const siblingSkills: AvailableSkill[] = args.noStage
@@ -352,6 +450,26 @@ function commandRun(args: Args, ctx: RunContext): void {
352
450
  return { name, path: p, description: getSkillDescription(p) };
353
451
  });
354
452
 
453
+ // `--stage-name` overrides the conspicuous `slow-powers-eval-…` slug with a
454
+ // verbatim name (issue #144 Step 2: A/B a natural name against the eval slug).
455
+ // It targets the single staging condition, so reject the case where both
456
+ // conditions stage (e.g. revision mode) — one name can't cover two dirs — and
457
+ // refuse to clobber a dir that already exists (a real project skill the user
458
+ // owns; cleanup has already removed our own prior custom dirs by this point).
459
+ if (args.stageName !== undefined && !args.noStage) {
460
+ if (skillPathForA !== null && skillPathForB !== null) {
461
+ die(
462
+ "--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
463
+ );
464
+ }
465
+ const target = join(ctx.stageRoot, ".claude", "skills", args.stageName);
466
+ if (existsSync(target)) {
467
+ die(
468
+ `--stage-name "${args.stageName}": ${target} already exists; refusing to clobber it. Remove it or choose a different name.`,
469
+ );
470
+ }
471
+ }
472
+
355
473
  const stageFor = (
356
474
  condName: string,
357
475
  condSkillPath: string | null,
@@ -363,12 +481,22 @@ function commandRun(args: Args, ctx: RunContext): void {
363
481
  condition: condName,
364
482
  skillName: ctx.skillName,
365
483
  repoRoot: ctx.stageRoot,
484
+ stageNameOverride: args.stageName,
366
485
  });
367
486
  };
368
487
 
369
488
  const conditionASlug = stageFor(conditionA, skillPathForA);
370
489
  const conditionBSlug = stageFor(conditionB, skillPathForB);
371
490
 
491
+ // A custom-named dir isn't caught by cleanupStagedSkills's prefix scan; record
492
+ // it in the sibling manifest so the next run removes it.
493
+ if (
494
+ args.stageName !== undefined &&
495
+ (conditionASlug === args.stageName || conditionBSlug === args.stageName)
496
+ ) {
497
+ registerStagedSkillForCleanup(ctx.stageRoot, args.stageName);
498
+ }
499
+
372
500
  const conditions: ConditionsRecord = {
373
501
  mode: args.mode,
374
502
  baseline: args.baseline,
@@ -408,7 +536,7 @@ function commandRun(args: Args, ctx: RunContext): void {
408
536
  };
409
537
 
410
538
  const tasks: DispatchTask[] = [];
411
- for (const ev of config.evals) {
539
+ for (const ev of selectedEvals) {
412
540
  const evalDir = join(iterationDir, `eval-${ev.id}`);
413
541
  ensureDir(evalDir);
414
542
 
@@ -432,6 +560,7 @@ function commandRun(args: Args, ctx: RunContext): void {
432
560
  outputsDir,
433
561
  condDir,
434
562
  bootstrapContent,
563
+ planModeContent,
435
564
  skillName: ctx.skillName,
436
565
  availableSkills: availableSkillsFor(condSkillPath),
437
566
  runTag,
@@ -467,6 +596,7 @@ function commandRun(args: Args, ctx: RunContext): void {
467
596
  iteration_dir: iterationDir,
468
597
  mode: args.mode,
469
598
  baseline: args.baseline ?? null,
599
+ plan_mode: args.planMode,
470
600
  conditions: conditions.conditions,
471
601
  harness: ctx.harness,
472
602
  tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
@@ -495,11 +625,28 @@ function commandRun(args: Args, ctx: RunContext): void {
495
625
  }
496
626
  }
497
627
 
628
+ // Plugin-shadow preflight (Claude Code): a staged skill name that is also
629
+ // discoverable from an enabled plugin or the global skills dir contaminates the
630
+ // run — subagents inherit this session's plugins, so both copies are reachable.
631
+ // The runner can't unload a plugin from a live session; it only flags it. The
632
+ // report is persisted so the aggregator can surface it in validity_warnings.
633
+ if (ctx.harness === "claude-code") {
634
+ const shadowReport = detectPluginShadows({
635
+ configDir: resolveConfigDir(),
636
+ cwd: ctx.stageRoot,
637
+ stagedSkillNames: [ctx.skillName, ...ctx.siblingSkillNames],
638
+ });
639
+ if (shadowReport.shadowed.length > 0) {
640
+ writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);
641
+ console.warn(formatShadowBanner(shadowReport));
642
+ }
643
+ }
644
+
498
645
  console.log(`\nWorkspace prepared: ${iterationDir}`);
499
646
  console.log(`Dispatch manifest: ${manifestPath}`);
500
647
  console.log(`Dispatch tasks: ${dispatchJsonPath}`);
501
648
  console.log(
502
- `\n${tasks.length} dispatches required (${config.evals.length} evals × 2 conditions).`,
649
+ `\n${tasks.length} dispatches required (${selectedEvals.length} evals × 2 conditions).`,
503
650
  );
504
651
 
505
652
  if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
@@ -531,11 +678,40 @@ type DispatchTask = {
531
678
  dispatch_prompt: string;
532
679
  };
533
680
 
534
- export type AvailableSkill = {
535
- name: string;
536
- path: string;
537
- description: string;
538
- };
681
+ export type { AvailableSkill } from "./types";
682
+
683
+ /**
684
+ * Filters the eval list to the subset requested via `--only` / `--skip`. The
685
+ * two flags are mutually exclusive. Every requested id must exist in the config,
686
+ * so a typo'd id is caught up front rather than silently producing an empty or
687
+ * surprising run. Throws on invalid input; the caller routes the message to
688
+ * `die`. `--only` preserves the config's eval order, not the order ids were
689
+ * passed.
690
+ */
691
+ export function selectEvals(
692
+ evals: Eval[],
693
+ opts: { only?: string[]; skip?: string[] },
694
+ ): Eval[] {
695
+ if (opts.only && opts.skip)
696
+ throw new Error("use only one of --only / --skip, not both");
697
+ const requested = opts.only ?? opts.skip;
698
+ if (requested === undefined) return evals;
699
+ if (requested.length === 0)
700
+ throw new Error("--only/--skip requires at least one eval id");
701
+
702
+ const known = new Set(evals.map((e) => e.id));
703
+ const unknown = requested.filter((id) => !known.has(id));
704
+ if (unknown.length)
705
+ throw new Error(
706
+ `unknown eval id(s): ${unknown.join(", ")}. ` +
707
+ `Available ids: ${[...known].join(", ")}`,
708
+ );
709
+
710
+ const set = new Set(requested);
711
+ return opts.only
712
+ ? evals.filter((e) => set.has(e.id))
713
+ : evals.filter((e) => !set.has(e.id));
714
+ }
539
715
 
540
716
  function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
541
717
  if (!ev.files || ev.files.length === 0) return [];
@@ -553,6 +729,32 @@ function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
553
729
  return copied;
554
730
  }
555
731
 
732
+ /**
733
+ * Resolve the verbatim plan-mode procedure profile for a harness (issue #142).
734
+ * The profile is a bundled supporting-file asset under
735
+ * `profiles/<harness>/plan-mode.md`, resolved relative to this runner exactly
736
+ * like the guard script (`join(import.meta.dir, "guard", "guard.ts")`). A
737
+ * harness without a profile gets a clear error rather than a silent no-op — the
738
+ * profile is Claude-tier fidelity, and a harness lacking one leaves the portable
739
+ * dispatch contract unchanged (no `<system-reminder>` plan-mode block emitted).
740
+ */
741
+ function resolvePlanModeProfile(harness: Harness): string {
742
+ const profilePath = join(
743
+ import.meta.dir,
744
+ "profiles",
745
+ harness,
746
+ "plan-mode.md",
747
+ );
748
+ if (!existsSync(profilePath)) {
749
+ die(
750
+ `--plan-mode: no plan-mode profile exists for harness '${harness}' ` +
751
+ `(expected ${profilePath}). This is a Claude-tier fidelity layer; a ` +
752
+ "harness without a profile leaves the portable dispatch contract unchanged.",
753
+ );
754
+ }
755
+ return readFileSync(profilePath, "utf8");
756
+ }
757
+
556
758
  function getSkillDescription(skillPath: string): string {
557
759
  try {
558
760
  const content = readFileSync(skillPath, "utf8");
@@ -611,6 +813,15 @@ export function buildDispatchTask(opts: {
611
813
  outputsDir: string;
612
814
  condDir: string;
613
815
  bootstrapContent: string | null;
816
+ /**
817
+ * Verbatim plan-mode procedure profile (from
818
+ * `profiles/<harness>/plan-mode.md`) to inject as an operating-context layer,
819
+ * or null/undefined to omit it. Skill-agnostic, so it is identical across the
820
+ * with/without-skill arms and needs no redaction. Set by the `--plan-mode`
821
+ * flag (issue #142): the highest-fidelity in-runner approximation of a real
822
+ * plan mode, still text the agent reads — a necessary-not-sufficient signal.
823
+ */
824
+ planModeContent?: string | null;
614
825
  skillName: string;
615
826
  availableSkills: AvailableSkill[];
616
827
  /**
@@ -626,11 +837,16 @@ export function buildDispatchTask(opts: {
626
837
 
627
838
  let skillBlock: string;
628
839
  if (opts.stagedSkillSlug) {
840
+ // Neutral slug disambiguation only — no imperative to invoke. The skill is
841
+ // staged under a unique slug; surface that identifier so a deliberate
842
+ // invocation targets the staged copy and the __skill_invoked meta-check can
843
+ // find it. Do NOT assert a plugin is "loaded" or tell the agent to prefer the
844
+ // slug "rather than the bare name": in an isolated run there is no global copy,
845
+ // and that framing invited the agent to hunt for one (issue #144 global-plugin
846
+ // leakage). Whether to invoke is left to the skill's own triggering (dropping
847
+ // the old "invoke if it applies" directive was the issue #119 ceiling fix).
629
848
  skillBlock = [
630
- "Your environment has the slow-powers plugin loaded. All slow-powers skills are",
631
- "discoverable via the Skill tool. The skill currently under evaluation is",
632
- `staged under the unique slug "${opts.stagedSkillSlug}" — invoke that slug rather`,
633
- "than the natural name if the skill applies to the user's request.",
849
+ `The \`${opts.skillName}\` skill is registered under the identifier \`${opts.stagedSkillSlug}\` and is discoverable via the Skill tool. If you invoke it, use that identifier.`,
634
850
  ].join("\n");
635
851
  } else if (opts.skillPath) {
636
852
  skillBlock = [
@@ -641,11 +857,11 @@ export function buildDispatchTask(opts: {
641
857
  "</skill>",
642
858
  ].join("\n");
643
859
  } else if (stagedSkills.length > 0 || opts.bootstrapContent) {
644
- skillBlock = [
645
- "The skill currently under evaluation is NOT available in this environment.",
646
- "Other staged skills remain discoverable via the Skill tool; apply any",
647
- "that fit the user's request.",
648
- ].join("\n");
860
+ // Skill-absent arm in a realistic environment: stay silent. The
861
+ // available-skills block already omits the skill-under-test, so any
862
+ // commentary here would only announce the eval (and, in the control arm,
863
+ // draw attention to the very skill that is supposed to be absent).
864
+ skillBlock = "";
649
865
  } else {
650
866
  skillBlock = "No skill is loaded. Respond as you naturally would.";
651
867
  }
@@ -654,73 +870,72 @@ export function buildDispatchTask(opts: {
654
870
  ? `Available fixture files:\n${opts.fixtures.map((f) => ` - ${f}`).join("\n")}`
655
871
  : "Available fixture files: none";
656
872
 
657
- // The session-start context carries two kinds of content:
658
- // 1. The verbatim --bootstrap file (product-specific framing), if supplied.
659
- // 2. An auto-built inventory of the skills staged for this eval.
873
+ // A dispatch mirrors a real session by carrying two *separate* surfaces, the
874
+ // way the harness actually delivers them:
875
+ // 1. The verbatim --bootstrap file (the SessionStart-hook equivalent),
876
+ // wrapped in <session-start-context>, if supplied.
877
+ // 2. The list of discoverable skills, rendered in the harness's native
878
+ // presentation as its own block (see adapters/claude-code-session.ts).
660
879
  // A condition that does not load the skill-under-test (the new-skill
661
880
  // `without_skill` arm, under staging or --no-stage) must carry zero reference
662
- // to it including in the verbatim bootstrap, which otherwise lists it in its
663
- // Active Skills Directory and leaks the skill into the control arm.
881
+ // to it. The skill-under-test is auto-omitted from the available-skills block
882
+ // (see `availableSkillsFor`). redactSkillFromBootstrap covers the other path:
883
+ // a *user-supplied* --bootstrap that names the skill in its own prose would
884
+ // otherwise leak it into the control arm. (The shipped bootstrap.md no longer
885
+ // enumerates skills, so that redaction is a no-op against it.)
664
886
  const skillAbsent = !opts.skillPath && !opts.stagedSkillSlug;
665
887
  const effectiveBootstrap =
666
888
  opts.bootstrapContent && skillAbsent
667
889
  ? redactSkillFromBootstrap(opts.bootstrapContent, opts.skillName)
668
890
  : opts.bootstrapContent;
669
891
 
670
- const startContextParts: string[] = [];
892
+ const sections: string[] = [];
671
893
  if (effectiveBootstrap) {
672
- startContextParts.push(
894
+ sections.push(
673
895
  [
896
+ "<session-start-context>",
674
897
  "The following guidelines were loaded at session start by the slow-powers plugin",
675
898
  "(equivalent to the SessionStart hook firing in a real user's environment):",
676
899
  "",
677
900
  effectiveBootstrap.trim(),
678
- ].join("\n"),
679
- );
680
- }
681
- if (stagedSkills.length > 0) {
682
- const inventoryLines = stagedSkills.map(
683
- (s) => `* \`${s.name}\`\n * *Trigger:* ${s.description}`,
684
- );
685
- startContextParts.push(
686
- [
687
- "The following skills are staged and discoverable in this eval environment:",
688
- "",
689
- ...inventoryLines,
690
- ].join("\n"),
691
- );
692
- }
693
-
694
- const sections: string[] = [];
695
- if (startContextParts.length > 0) {
696
- sections.push(
697
- [
698
- "<session-start-context>",
699
- startContextParts.join("\n\n"),
700
901
  "</session-start-context>",
701
902
  "",
702
903
  ].join("\n"),
703
904
  );
704
905
  }
705
- sections.push(
706
- [
707
- "You are executing a single test case for a skill evaluation framework.",
708
- "Treat this as a real user request — do NOT optimize behavior for the eval.",
709
- "",
710
- skillBlock,
711
- "",
712
- fixturesBlock,
713
- `Output directory: ${opts.outputsDir}`,
714
- "",
715
- "Instructions:",
716
- "- Write any files you produce into the output directory.",
717
- `- After completing the task, write your final user-facing response to ${opts.outputsDir}/final-message.md.`,
718
- "- Do not write outside the output directory.",
719
- "",
720
- "User request:",
721
- opts.userPrompt,
722
- ].join("\n"),
906
+ const availableSkillsBlock = renderAvailableSkillsBlock(stagedSkills);
907
+ if (availableSkillsBlock) {
908
+ sections.push(`${availableSkillsBlock}\n\n`);
909
+ }
910
+ // Plan-mode operating context (issue #142). Injected as its own block after
911
+ // the session-start surfaces and before the eval task framing, so it reads as
912
+ // a session-level mode active for this turn — layered the way the real harness
913
+ // delivers it, not as seed prose. Skill-agnostic: identical in both arms.
914
+ const planModeBlock = opts.planModeContent
915
+ ? renderPlanModeContext(opts.planModeContent)
916
+ : "";
917
+ if (planModeBlock) {
918
+ sections.push(`${planModeBlock}\n\n`);
919
+ }
920
+ const taskLines = [
921
+ "You are executing a single test case for a skill evaluation framework.",
922
+ "Treat this as a real user request — do NOT optimize behavior for the eval.",
923
+ ];
924
+ if (skillBlock) taskLines.push("", skillBlock);
925
+ taskLines.push(
926
+ "",
927
+ fixturesBlock,
928
+ `Output directory: ${opts.outputsDir}`,
929
+ "",
930
+ "Instructions:",
931
+ "- Write any files you produce into the output directory.",
932
+ `- After completing the task, write your final user-facing response to ${opts.outputsDir}/final-message.md.`,
933
+ "- Do not write outside the output directory.",
934
+ "",
935
+ "User request:",
936
+ opts.userPrompt,
723
937
  );
938
+ sections.push(taskLines.join("\n"));
724
939
 
725
940
  return {
726
941
  eval_id: opts.evalId,
@@ -35,6 +35,14 @@ export type EvalsConfig = {
35
35
  evals: Eval[];
36
36
  };
37
37
 
38
+ /** A skill staged and discoverable for an eval — its natural name, on-disk
39
+ * SKILL.md path, and frontmatter description. */
40
+ export type AvailableSkill = {
41
+ name: string;
42
+ path: string;
43
+ description: string;
44
+ };
45
+
38
46
  export type ConditionEntry = {
39
47
  name: string;
40
48
  skill_path: string | null;
@@ -32,19 +32,15 @@ You are executing a single test case for a skill evaluation framework.
32
32
  Treat this as a real user request — do NOT optimize your behavior for the eval.
33
33
 
34
34
  {{#if staged_skill_slug}}
35
- Your environment has the plugin under evaluation loaded. Its skills are
36
- discoverable via the Skill tool. The skill currently under evaluation is
37
- staged under the unique slug "{{staged_skill_slug}}" — invoke that slug rather
38
- than the natural name if the skill applies to the user's request.
35
+ The `{{skill_name}}` skill is registered under the identifier
36
+ "{{staged_skill_slug}}" and is discoverable via the Skill tool. If you invoke it,
37
+ use that identifier.
39
38
  {{else if skill_path}}
40
39
  The following skill is loaded into your operating guidelines. Apply it where relevant.
41
40
  <skill name="{{skill_name}}">
42
41
  {{skill_content}}
43
42
  </skill>
44
43
  {{else if bootstrap_content}}
45
- The skill currently under evaluation is NOT available in this environment.
46
- Other skills from the plugin remain discoverable via the Skill tool; apply any
47
- that fit the user's request.
48
44
  {{else}}
49
45
  No skill is loaded. Respond as you naturally would.
50
46
  {{/if}}
@@ -51,6 +51,13 @@ Re-adding the one-line debugging route to "When NOT to Use" would very likely
51
51
  restore the negative guard at near-zero risk to the reframe. Deferred as a
52
52
  separate change so 3b stays one-problem-per-PR; left to the maintainer's call.
53
53
 
54
+ **Update (2026-06-01):** `csv-parser-bug-no-plan` was retired per #111 — the
55
+ over-trigger guard went red on correct consult-then-defer behavior (the agent
56
+ loads `hardening-plans`, reads "When NOT to Use," and correctly routes to
57
+ `systematic-debugging`, which the strict rubric scored as a failure). Its grading
58
+ files were removed; the table and `benchmark.json` below still reflect the
59
+ original 3-case run as it happened.
60
+
54
61
  ## Provenance / scope
55
62
 
56
63
  3-case cost-conscious subset (the runner has no per-case selector — tracked as a
@@ -90,25 +90,6 @@
90
90
  "rubric": "Does the response make a real hand-off to the implementation skill — explicitly naming `slow-powers:test-driven-development` as the required next skill for the implementation phase — rather than merely dropping 'TDD' as an unexplained label or parenthetical? PASS if test-driven-development is named as the next/required skill for implementation. FAIL if 'TDD' appears only as a bare label with no hand-off, or is absent. (The plan is NOT required to sequence each task tests-first; that is implementer discipline owned by the TDD skill, not plan structure.)"
91
91
  }
92
92
  ]
93
- },
94
- {
95
- "id": "csv-parser-bug-no-plan",
96
- "prompt": "Our users are reporting that the CSV parser is skipping rows with empty email fields instead of using the fallback 'anonymous' label. Can we fix this issue in csv-parser.ts?",
97
- "expected_output": "The agent should immediately trigger `systematic-debugging` (root-cause analysis) rather than loading `hardening-plans` or writing an implementation plan.",
98
- "skill_should_trigger": false,
99
- "assertions": [
100
- {
101
- "id": "loads_systematic_debugging_skill",
102
- "type": "transcript_check",
103
- "check": "tool_invocation_matches",
104
- "pattern": "(Skill.*systematic-debugging|view_file.*systematic-debugging)"
105
- },
106
- {
107
- "id": "invokes_debugging_only",
108
- "type": "llm_judge",
109
- "rubric": "Did the agent identify that this is a systematic debugging task and load the `systematic-debugging` skill, while completely bypassing the `hardening-plans` skill and avoiding writing a full implementation plan? PASS if the agent focused on debugging and did not load planning guidelines. FAIL if the agent attempted to load `hardening-plans` or write a general implementation plan before debugging the root cause."
110
- }
111
- ]
112
93
  }
113
94
  ]
114
95
  }
@@ -1,24 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "loads_systematic_debugging_skill",
5
- "passed": false,
6
- "evidence": "no tool invocation matched /(Skill.*systematic-debugging|view_file.*systematic-debugging)/ across 5 invocation(s)",
7
- "confidence": 1,
8
- "grader": "transcript_check"
9
- },
10
- {
11
- "id": "invokes_debugging_only",
12
- "passed": false,
13
- "evidence": "The agent loaded 'slow-powers-eval-1-new_skill__hardening-plans' (tool_invocations ordinal 1) and wrote a five-step implementation plan before invoking it. The systematic-debugging skill was never loaded at any point in the run.",
14
- "confidence": 1,
15
- "grader": "llm_judge"
16
- }
17
- ],
18
- "summary": {
19
- "passed": 0,
20
- "failed": 2,
21
- "total": 2,
22
- "pass_rate": 0
23
- }
24
- }
@@ -1,24 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "loads_systematic_debugging_skill",
5
- "passed": true,
6
- "evidence": "matched ordinal 1: Skill {\"skill\":\"slow-powers:systematic-debugging\"}",
7
- "confidence": 1,
8
- "grader": "transcript_check"
9
- },
10
- {
11
- "id": "invokes_debugging_only",
12
- "passed": true,
13
- "evidence": "Tool invocation ordinal 1 shows the agent called the Skill tool with 'slow-powers:systematic-debugging'; no invocation of 'hardening-plans' appears anywhere in the tool list; the final_message asks for the file, a reproduction case, and logs — debugging triage, not a plan.",
14
- "confidence": 1,
15
- "grader": "llm_judge"
16
- }
17
- ],
18
- "summary": {
19
- "passed": 2,
20
- "failed": 0,
21
- "total": 2,
22
- "pass_rate": 1
23
- }
24
- }