@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,201 +0,0 @@
1
- // Plugin-shadow detector (Claude Code). The runner stages eval skills into the
2
- // project-local `.claude/skills/` dir, but eval subagents are dispatched via the
3
- // Task tool and run in-process — so they ALSO inherit whatever skills the
4
- // orchestrator session loaded from installed plugins and the global skills dir.
5
- // When a staged skill name collides with one of those, both copies are
6
- // discoverable: the with/without comparison is contaminated and the control arm
7
- // is not truly skill-absent.
8
- //
9
- // The runner cannot unload a plugin from a running session (plugins load at
10
- // session start), so this module only *detects and reports* the overlap. It
11
- // reads declared settings as a best-effort proxy for what the session loaded —
12
- // it can't observe the live-loaded set, so a session that changed settings
13
- // without restarting may differ. Isolation itself is a launch-time concern; see
14
- // harness-details/claude.md → "Isolating from installed plugins".
15
- import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
16
- import { homedir } from "node:os";
17
- import { join } from "node:path";
18
-
19
/** A staged skill name that also appears under an enabled plugin's `skills/` dir. */
type PluginShadowSource = {
  kind: "plugin";
  /** Key of the plugin in `installed_plugins.json` / `enabledPlugins`. */
  plugin: string;
  skill_name: string;
  /** Directory of the colliding skill inside the plugin install. */
  path: string;
};

/** A staged skill name that also appears under `<configDir>/skills`. */
type GlobalSkillShadowSource = {
  kind: "global-skill";
  skill_name: string;
  /** Directory of the colliding skill inside the global skills dir. */
  path: string;
};

/** Where a colliding ("shadowing") copy of a staged skill was found. */
export type ShadowSource = PluginShadowSource | GlobalSkillShadowSource;

/** Result of a shadow scan: the config dir that was inspected and every collision found. */
export type PluginShadowReport = {
  config_dir: string;
  shadowed: ShadowSource[];
};

// Pointer to the isolation instructions, embedded in the warning and banner text.
const ISOLATION_DOC =
  'harness-details/claude.md → "Isolating from installed plugins"';
30
-
31
/**
 * Resolves the Claude Code config directory.
 *
 * @param env Environment to read `CLAUDE_CONFIG_DIR` from (defaults to `process.env`).
 * @returns The `CLAUDE_CONFIG_DIR` value, exactly as given, when it contains
 *   anything other than whitespace; otherwise `~/.claude`.
 */
export function resolveConfigDir(env: NodeJS.ProcessEnv = process.env): string {
  const configured = env.CLAUDE_CONFIG_DIR;
  // An unset, empty, or whitespace-only override falls back to the default.
  if (configured != null && configured.trim() !== "") {
    return configured;
  }
  return join(homedir(), ".claude");
}
36
-
37
/**
 * Reads `path` and parses it as JSON.
 *
 * @returns The parsed value cast to `T` (no shape validation is performed), or
 *   `null` when the file does not exist, cannot be read, or is not valid JSON.
 */
function readJsonSafe<T>(path: string): T | null {
  let parsed: T | null = null;
  if (existsSync(path)) {
    try {
      const text = readFileSync(path, "utf8");
      parsed = JSON.parse(text) as T;
    } catch {
      // Unreadable or malformed file: report "nothing declared" instead of failing.
      parsed = null;
    }
  }
  return parsed;
}
45
-
46
/** The slice of a Claude Code settings file this module reads. */
type Settings = { enabledPlugins?: Record<string, boolean> };

/**
 * Computes the effective `enabledPlugins` map across the three settings scopes.
 *
 * Files are merged from lowest to highest precedence: user
 * (`<configDir>/settings.json`), then project (`<cwd>/.claude/settings.json`),
 * then local (`<cwd>/.claude/settings.local.json`). A key in a later file
 * replaces the same key from an earlier one, so a project-scope `false` masks a
 * user-scope `true`. Missing or unparsable files contribute nothing.
 *
 * @param opts.configDir Claude Code config dir holding the user-scope settings.
 * @param opts.cwd Project directory whose `.claude/` holds project/local settings.
 * @returns The merged plugin-key → enabled map (empty when nothing is declared).
 */
export function resolveEnabledPlugins(opts: {
  configDir: string;
  cwd: string;
}): Record<string, boolean> {
  const projectSettingsDir = join(opts.cwd, ".claude");
  const settingsFiles = [
    join(opts.configDir, "settings.json"),
    join(projectSettingsDir, "settings.json"),
    join(projectSettingsDir, "settings.local.json"),
  ];
  return settingsFiles.reduce<Record<string, boolean>>((effective, file) => {
    const declared = readJsonSafe<Settings>(file)?.enabledPlugins;
    return declared ? { ...effective, ...declared } : effective;
  }, {});
}
70
-
71
/**
 * Lists the skill folders directly under `dir`: child directories that contain
 * a `SKILL.md`.
 *
 * @returns One `{ name, path }` per skill folder, in `readdirSync` order. Empty
 *   when `dir` does not exist or cannot be listed; children that cannot be
 *   stat'ed are skipped.
 */
function skillFolderNames(dir: string): Array<{ name: string; path: string }> {
  if (!existsSync(dir)) return [];

  let children: string[];
  try {
    children = readdirSync(dir);
  } catch {
    return [];
  }

  // statSync follows symlinks, so a symlinked skill directory still counts.
  const isDirectory = (candidate: string): boolean => {
    try {
      return statSync(candidate).isDirectory();
    } catch {
      return false;
    }
  };

  return children
    .map((name) => ({ name, path: join(dir, name) }))
    .filter(
      (child) =>
        isDirectory(child.path) && existsSync(join(child.path, "SKILL.md")),
    );
}
93
-
94
/** The slice of `<configDir>/plugins/installed_plugins.json` this module reads. */
type InstalledPlugins = {
  plugins?: Record<string, Array<{ installPath?: string }>>;
};

/**
 * Lists the skills exposed by currently-enabled installed plugins.
 *
 * Reads `<configDir>/plugins/installed_plugins.json` and, for every plugin key
 * whose `enabled` entry is exactly `true`, collects the skill folders under
 * each install's `<installPath>/skills`.
 *
 * The manifest is parsed without schema validation, so malformed entries (a
 * non-array install list, a `null` install, a missing or non-string
 * `installPath`) are skipped rather than allowed to throw. This keeps the
 * function best-effort, like the rest of the detector.
 *
 * @param opts.configDir Claude Code config dir containing `plugins/`.
 * @param opts.enabled Effective plugin-key → enabled map.
 * @returns One entry per discovered skill; empty when the manifest is missing,
 *   unparsable, or declares no plugins.
 */
export function listEnabledPluginSkills(opts: {
  configDir: string;
  enabled: Record<string, boolean>;
}): Array<{ plugin: string; skill_name: string; path: string }> {
  const manifest = readJsonSafe<InstalledPlugins>(
    join(opts.configDir, "plugins", "installed_plugins.json"),
  );
  const out: Array<{ plugin: string; skill_name: string; path: string }> = [];
  if (!manifest?.plugins) return out;

  for (const [key, installs] of Object.entries(manifest.plugins)) {
    if (opts.enabled[key] !== true) continue; // only enabled plugins shadow
    if (!Array.isArray(installs)) continue; // malformed manifest entry
    for (const inst of installs) {
      const installPath: unknown = inst?.installPath;
      if (typeof installPath !== "string" || installPath === "") continue;
      for (const s of skillFolderNames(join(installPath, "skills")))
        out.push({ plugin: key, skill_name: s.name, path: s.path });
    }
  }
  return out;
}
118
-
119
/**
 * Lists the skills found in the global skills dir, `<configDir>/skills`.
 *
 * @returns One `{ skill_name, path }` per skill folder; empty when the dir is
 *   absent or unreadable.
 */
export function listGlobalSkills(
  configDir: string,
): Array<{ skill_name: string; path: string }> {
  const found: Array<{ skill_name: string; path: string }> = [];
  for (const { name, path } of skillFolderNames(join(configDir, "skills"))) {
    found.push({ skill_name: name, path });
  }
  return found;
}
128
-
129
/**
 * Reports which staged skills are also discoverable outside the staging area:
 * from an enabled installed plugin or from the global skills dir. A collision
 * is an exact match on the skill folder name.
 *
 * @param opts.configDir Claude Code config dir to inspect.
 * @param opts.cwd Project directory used to resolve project/local settings.
 * @param opts.stagedSkillNames Folder names of the skills staged for the eval.
 * @returns The inspected config dir plus every collision, plugin collisions
 *   first and global-skill collisions after them.
 */
export function detectPluginShadows(opts: {
  configDir: string;
  cwd: string;
  stagedSkillNames: string[];
}): PluginShadowReport {
  const { configDir, cwd, stagedSkillNames } = opts;
  const stagedNames = new Set(stagedSkillNames);
  const enabled = resolveEnabledPlugins({ configDir, cwd });

  const fromPlugins = listEnabledPluginSkills({ configDir, enabled })
    .filter((skill) => stagedNames.has(skill.skill_name))
    .map(
      (skill): ShadowSource => ({
        kind: "plugin",
        plugin: skill.plugin,
        skill_name: skill.skill_name,
        path: skill.path,
      }),
    );

  const fromGlobalDir = listGlobalSkills(configDir)
    .filter((skill) => stagedNames.has(skill.skill_name))
    .map(
      (skill): ShadowSource => ({
        kind: "global-skill",
        skill_name: skill.skill_name,
        path: skill.path,
      }),
    );

  return { config_dir: configDir, shadowed: [...fromPlugins, ...fromGlobalDir] };
}
167
-
168
/** Human-readable description of where a shadowing copy lives, for messages. */
function sourceLabel(s: ShadowSource): string {
  if (s.kind === "plugin") {
    return `enabled plugin '${s.plugin}'`;
  }
  return "the global skills dir";
}
173
-
174
/**
 * Builds the `validity_warnings` lines for benchmark.json.
 *
 * @returns One warning per shadowed skill, in report order; empty when nothing
 *   is shadowed.
 */
export function shadowValidityWarnings(report: PluginShadowReport): string[] {
  const warnings: string[] = [];
  for (const source of report.shadowed) {
    warnings.push(
      [
        `staged skill '${source.skill_name}' is also provided by ${sourceLabel(source)} — `,
        `eval subagents could discover both copies, so with/without results may be `,
        `contaminated. Re-run from an isolated session (see ${ISOLATION_DOC}).`,
      ].join(""),
    );
  }
  return warnings;
}
183
-
184
/**
 * Renders the multi-line warning banner the runner prints at build time.
 *
 * @returns The banner text (starting with a blank line), or `""` when the
 *   report contains no shadowed skills.
 */
export function formatShadowBanner(report: PluginShadowReport): string {
  const { shadowed } = report;
  if (shadowed.length === 0) return "";

  const banner: string[] = [
    "",
    "⚠ Plugin-shadow warning: skills staged for this eval are ALSO discoverable",
    "  from your live environment:",
  ];
  // One bullet per collision, naming the skill and where the other copy lives.
  for (const source of shadowed) {
    banner.push(`  • ${source.skill_name} — ${sourceLabel(source)}`);
  }
  banner.push(
    "  Eval subagents (dispatched via the Task tool) inherit this session's plugins,",
    "  so both the staged copy and the installed copy are discoverable — the",
    "  with/without comparison may be contaminated and the control arm is not truly",
    "  skill-absent. The runner cannot unload a plugin from a running session.",
    `  Re-run from an isolated session — see ${ISOLATION_DOC}.`,
  );
  return banner.join("\n");
}
@@ -1,11 +0,0 @@
1
- Plan mode is active. The user wants to review an approach before any code is written, so you must NOT execute yet: do not make any edits, do not run any non-read-only tool, and do not change configs or system state. The only file you may write is the plan file. This constraint supersedes any other instruction you have received this session.
2
-
3
- You are operating inside the harness's plan-mode workflow — a fixed, multi-phase procedure. Work through the phases in order:
4
-
5
- 1. **Understand.** Read the relevant code and gather context with read-only tools until you can describe the change concretely. Reuse what already exists rather than proposing new code.
6
- 2. **Design.** Decide the implementation approach and the trade-offs.
7
- 3. **Review.** Re-check the design against the user's request and resolve open questions with the user before finalizing.
8
- 4. **Write the plan.** Build the plan up incrementally in the plan file — this is the one file you are permitted to write. Name the files to change and how to verify the result.
9
- 5. **Hand off.** Call ExitPlanMode to submit the plan for the user's approval.
10
-
11
- Terminal rail: your turn must end in exactly one of two ways — by asking the user a question, or by calling ExitPlanMode to present the finished plan. Do not stop for any other reason and do not begin implementation until the user has approved the plan. The plan-mode workflow already governs how you research, design, and present the work; stay on this rail through to ExitPlanMode.
@@ -1,281 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import {
3
- existsSync,
4
- mkdirSync,
5
- readFileSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
- import { PROMOTED_MARKER } from "./workspace-teardown";
12
-
13
// Scratch root under the OS temp dir, suffixed with this process's PID.
const FIXTURE_ROOT = join(
  tmpdir(),
  "slow-powers-promote-test-" + String(process.pid),
);
// The script under test; each test spawns it with `bun run`.
const PROMOTE_TS = join(import.meta.dir, "promote-baseline.ts");

// Create the scratch root once before the suite runs...
beforeAll(() => {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

// ...and remove it, with everything the tests wrote, once the suite is done.
afterAll(() => {
  rmSync(FIXTURE_ROOT, { force: true, recursive: true });
});
23
-
24
/** Writes `value` to `path` as 2-space-indented JSON followed by a newline. */
function writeJson(path: string, value: unknown) {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, serialized + "\n");
}
27
-
28
describe("promote-baseline.ts (--skill-dir, isolated CWD)", () => {
  const SKILL_NAME = "mr-review";
  const SKILL_MD =
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n";

  /**
   * Creates `<root>/skill-dir/mr-review/SKILL.md` (detectRunContext validates
   * that SKILL.md exists) and returns both directory paths.
   */
  function stageSkill(root: string): { skillDir: string; skillSub: string } {
    const skillDir = join(root, "skill-dir");
    const skillSub = join(skillDir, SKILL_NAME);
    mkdirSync(skillSub, { recursive: true });
    writeFileSync(join(skillSub, "SKILL.md"), SKILL_MD);
    return { skillDir, skillSub };
  }

  /**
   * Creates the workspace iteration dir under `cwd`
   * (mirrors workspaceRoot = <cwd>/skills-workspace) and returns its path.
   */
  function stageIteration(cwd: string, iteration: string): string {
    const iterationDir = join(
      cwd,
      "skills-workspace",
      SKILL_NAME,
      `iteration-${iteration}`,
    );
    mkdirSync(iterationDir, { recursive: true });
    return iterationDir;
  }

  /** Writes a new-skill `conditions.json` with a with/without pair. */
  function writeConditions(
    iterationDir: string,
    skillSub: string,
    timestamp: string,
  ) {
    writeJson(join(iterationDir, "conditions.json"), {
      mode: "new-skill",
      conditions: [
        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
        { name: "without_skill", skill_path: null },
      ],
      timestamp,
      harness: "claude-code",
    });
  }

  /** Spawns the promote script from `cwd` with the standard flags plus `extraArgs`. */
  function runPromote(
    cwd: string,
    skillDir: string,
    iteration: string,
    extraArgs: string[] = [],
  ) {
    return Bun.spawnSync(
      [
        "bun",
        "run",
        PROMOTE_TS,
        "--skill-dir",
        skillDir,
        "--skill",
        SKILL_NAME,
        "--iteration",
        iteration,
        ...extraArgs,
      ],
      { cwd, stdout: "pipe", stderr: "pipe" },
    );
  }

  test("copies benchmark + per-run gradings into the skill's committed baseline/", () => {
    const root = join(FIXTURE_ROOT, "promote-basic");
    const { skillDir, skillSub } = stageSkill(root);
    const cwd = join(root, "work");
    const iterationDir = stageIteration(cwd, "2");

    const timestamp = "2026-05-27T00:00:00.000Z";
    writeConditions(iterationDir, skillSub, timestamp);
    writeJson(join(iterationDir, "benchmark.json"), {
      run_summary: {
        with_skill: { pass_rate: { mean: 0.83 } },
        without_skill: { pass_rate: { mean: 0.33 } },
      },
      delta: { pass_rate: 0.5 },
    });

    // One grading.json per (eval, condition) pair.
    const gradings: Array<[string, string, number]> = [
      ["e1", "with_skill", 1],
      ["e1", "without_skill", 0],
    ];
    for (const [evalId, cond, passRate] of gradings) {
      const condDir = join(iterationDir, `eval-${evalId}`, cond);
      mkdirSync(condDir, { recursive: true });
      writeJson(join(condDir, "grading.json"), {
        assertion_results: [
          {
            id: "a1",
            passed: passRate > 0,
            evidence: `${cond} evidence`,
            confidence: 1,
          },
        ],
        summary: { passed: 1, failed: 0, total: 1, pass_rate: passRate },
      });
    }

    const res = runPromote(cwd, skillDir, "2");
    expect(res.stderr.toString()).toBe("");
    expect(res.exitCode).toBe(0);

    const baselineDir = join(skillSub, "evals", "baseline");

    // benchmark.json copied verbatim.
    const benchmarkPath = join(baselineDir, "benchmark.json");
    expect(existsSync(benchmarkPath)).toBe(true);
    const benchmark = JSON.parse(readFileSync(benchmarkPath, "utf8")) as {
      delta: { pass_rate: number };
    };
    expect(benchmark.delta.pass_rate).toBe(0.5);

    // Per-run gradings copied under grading/<eval-id>__<condition>.json.
    const gradingDir = join(baselineDir, "grading");
    const withGrading = join(gradingDir, "e1__with_skill.json");
    const withoutGrading = join(gradingDir, "e1__without_skill.json");
    expect(existsSync(withGrading)).toBe(true);
    expect(existsSync(withoutGrading)).toBe(true);
    const withParsed = JSON.parse(readFileSync(withGrading, "utf8")) as {
      summary: { pass_rate: number };
    };
    expect(withParsed.summary.pass_rate).toBe(1);

    // Provenance file records mode, iteration, harness, timestamp.
    const provenancePath = join(baselineDir, "BASELINE.md");
    expect(existsSync(provenancePath)).toBe(true);
    const provenance = readFileSync(provenancePath, "utf8");
    expect(provenance).toContain("new-skill");
    expect(provenance).toContain("iteration-2");
    expect(provenance).toContain("claude-code");
    expect(provenance).toContain(timestamp);
    // Model rows default to "unspecified" when no flags are passed.
    expect(provenance).toContain("Agent model | unspecified");
    expect(provenance).toContain("Judge model | unspecified");
  });

  test("drops a .promoted.json marker into the iteration dir for teardown", () => {
    const root = join(FIXTURE_ROOT, "promote-marker");
    const { skillDir, skillSub } = stageSkill(root);
    const cwd = join(root, "work");
    const iterationDir = stageIteration(cwd, "3");
    // No conditions.json here: only the benchmark is staged.
    writeJson(join(iterationDir, "benchmark.json"), {
      delta: { pass_rate: 0 },
    });

    const res = runPromote(cwd, skillDir, "3");
    expect(res.stderr.toString()).toBe("");
    expect(res.exitCode).toBe(0);

    const markerPath = join(iterationDir, PROMOTED_MARKER);
    expect(existsSync(markerPath)).toBe(true);
    const marker = JSON.parse(readFileSync(markerPath, "utf8")) as {
      promoted_at: string;
      baseline_dir: string;
    };
    expect(marker.promoted_at).toBeTruthy();
    expect(marker.baseline_dir).toBe(join(skillSub, "evals", "baseline"));
  });

  test("records agent and judge models in provenance when flags are passed", () => {
    const root = join(FIXTURE_ROOT, "promote-models");
    const { skillDir, skillSub } = stageSkill(root);
    const cwd = join(root, "work");
    const iterationDir = stageIteration(cwd, "1");
    writeConditions(iterationDir, skillSub, "2026-05-27T00:00:00.000Z");
    writeJson(join(iterationDir, "benchmark.json"), {
      delta: { pass_rate: 0 },
    });

    const res = runPromote(cwd, skillDir, "1", [
      "--agent-model",
      "claude-haiku-4-5-20251001",
      "--judge-model",
      "claude-opus-4-7",
    ]);
    expect(res.stderr.toString()).toBe("");
    expect(res.exitCode).toBe(0);

    const provenance = readFileSync(
      join(skillSub, "evals", "baseline", "BASELINE.md"),
      "utf8",
    );
    expect(provenance).toContain("Agent model | claude-haiku-4-5-20251001");
    expect(provenance).toContain("Judge model | claude-opus-4-7");
  });

  test("fails clearly when the iteration directory is missing", () => {
    const root = join(FIXTURE_ROOT, "promote-missing");
    const { skillDir } = stageSkill(root);
    // The working dir exists, but no workspace iteration is staged inside it.
    const cwd = join(root, "work");
    mkdirSync(cwd, { recursive: true });

    const res = runPromote(cwd, skillDir, "9");
    expect(res.exitCode).not.toBe(0);
    expect(res.stderr.toString()).toContain("iteration-9");
  });
});
@@ -1,204 +0,0 @@
1
- #!/usr/bin/env bun
2
- import {
3
- copyFileSync,
4
- existsSync,
5
- mkdirSync,
6
- readdirSync,
7
- readFileSync,
8
- writeFileSync,
9
- } from "node:fs";
10
- import { join } from "node:path";
11
- import { detectRunContext } from "./context";
12
- import type { ConditionsRecord } from "./types";
13
- import { PROMOTED_MARKER } from "./workspace-teardown";
14
-
15
/** Prints `error: <msg>` to stderr and terminates the process with exit code 1. */
function die(msg: string): never {
  const line = "error: " + msg;
  console.error(line);
  process.exit(1);
}
19
-
20
/**
 * Creates `path` (and any missing parents) unless something already exists
 * there. An existing entry of any kind is left untouched.
 */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
23
-
24
/**
 * Returns the abbreviated commit hash of `HEAD` for the repo containing `cwd`,
 * as printed by `git rev-parse --short HEAD`.
 *
 * @returns The trimmed hash, or `"unknown"` when the command exits non-zero or
 *   cannot be spawned.
 */
function gitHead(cwd: string): string {
  let head = "unknown";
  try {
    const { exitCode, stdout } = Bun.spawnSync(
      ["git", "rev-parse", "--short", "HEAD"],
      { cwd, stdout: "pipe", stderr: "ignore" },
    );
    if (exitCode === 0) head = stdout.toString().trim();
  } catch {
    // not a git repo / git unavailable — provenance still useful without it
  }
  return head;
}
37
-
38
export type PromoteOptions = {
  /** Root of the workspace; iterations live at `<workspaceRoot>/<skillName>/iteration-<N>`. */
  workspaceRoot: string;
  skillName: string;
  /** Skill directory that receives `evals/baseline/`. */
  skillSubdir: string;
  iteration: string;
  harness: string;
  label: string | null;
  /**
   * Operator-declared models for provenance. The runner never dispatches the
   * agent/judge itself, so it cannot observe these — record what was used.
   */
  agentModel: string | null;
  judgeModel: string | null;
  /** Directory used to resolve the committing repo's git HEAD for provenance. */
  gitCwd: string;
};

/**
 * Loads `conditions.json`, which is optional provenance input.
 *
 * @returns The parsed record, or `null` when the file is absent. Exits via
 *   `die` when the file exists but cannot be read or parsed.
 */
function loadConditions(path: string): ConditionsRecord | null {
  if (!existsSync(path)) return null;
  try {
    return JSON.parse(readFileSync(path, "utf8")) as ConditionsRecord | null;
  } catch (err) {
    die(
      `unreadable conditions.json at ${path}: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}

/**
 * Copies every `eval-<id>/<condition>/grading.json` under `iterationDir` to
 * `<gradingDir>/<id>__<condition>.json`.
 *
 * @returns The number of grading files copied.
 */
function copyGradings(iterationDir: string, gradingDir: string): number {
  let copied = 0;
  for (const entry of readdirSync(iterationDir, { withFileTypes: true })) {
    if (!entry.isDirectory() || !entry.name.startsWith("eval-")) continue;
    const evalId = entry.name.slice("eval-".length);
    const evalDir = join(iterationDir, entry.name);
    for (const cond of readdirSync(evalDir, { withFileTypes: true })) {
      if (!cond.isDirectory()) continue;
      const gradingSrc = join(evalDir, cond.name, "grading.json");
      if (!existsSync(gradingSrc)) continue; // run not graded — nothing to keep
      copyFileSync(gradingSrc, join(gradingDir, `${evalId}__${cond.name}.json`));
      copied++;
    }
  }
  return copied;
}

/**
 * Copies the durable, reference-worthy subset of a workspace iteration into the
 * skill's version-controlled `evals/baseline/` directory: the aggregate
 * `benchmark.json`, every per-run `grading.json` (judge rationales), and a
 * `BASELINE.md` provenance file. Ephemeral scaffolding (dispatch files, timing,
 * full run records, produced outputs, transcripts) is intentionally left behind
 * in the gitignored workspace. Finally drops a promoted marker into the
 * iteration dir.
 *
 * All inputs are read and checked before anything is written, so bad input
 * cannot leave a half-promoted baseline behind. `conditions.json` is optional:
 * when it is absent, or lacks a usable `mode`, `timestamp`, or `conditions`
 * array, the corresponding provenance rows read "unknown".
 *
 * Exits the process via `die` when the iteration dir or its `benchmark.json`
 * is missing, or when `conditions.json` exists but cannot be parsed.
 *
 * @returns The baseline directory written to and the number of grading files copied.
 */
export function promoteBaseline(opts: PromoteOptions): {
  baselineDir: string;
  gradingsCopied: number;
} {
  const iterationDir = join(
    opts.workspaceRoot,
    opts.skillName,
    `iteration-${opts.iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(
      `not found: ${iterationDir} (build/grade iteration-${opts.iteration} first)`,
    );
  }

  const benchmarkSrc = join(iterationDir, "benchmark.json");
  if (!existsSync(benchmarkSrc)) {
    die(
      `missing benchmark.json in iteration-${opts.iteration} — run 'evals:aggregate' before promoting`,
    );
  }

  const conditions = loadConditions(join(iterationDir, "conditions.json"));
  const mode = conditions?.mode ?? "unknown";
  const timestamp = conditions?.timestamp ?? "unknown";
  // The file is not schema-validated here, so tolerate a missing or malformed
  // `conditions` array instead of throwing midway through the promotion.
  const declaredConditions: unknown = conditions?.conditions;
  const conditionNames = Array.isArray(declaredConditions)
    ? declaredConditions.flatMap((c: { name?: unknown } | null) =>
        typeof c?.name === "string" ? [c.name] : [],
      )
    : [];

  const baselineDir = join(opts.skillSubdir, "evals", "baseline");
  const gradingDir = join(baselineDir, "grading");
  ensureDir(gradingDir);

  copyFileSync(benchmarkSrc, join(baselineDir, "benchmark.json"));
  const gradingsCopied = copyGradings(iterationDir, gradingDir);

  const head = gitHead(opts.gitCwd);
  const provenance = [
    `# Baseline — ${opts.skillName}`,
    "",
    "Committed reference output from a canonical eval run. Regenerate with",
    "`bun run evals:promote-baseline -- --skill " +
      `${opts.skillName} --iteration <N>` +
      "` after aggregating. The ephemeral workspace (run records, timing,",
    "dispatch files, produced outputs) stays gitignored under `skills-workspace/`",
    "and is reclaimable by `evals:teardown` once promoted (this commit's marker).",
    "",
    "| Field | Value |",
    "|-------|-------|",
    `| Mode | ${mode} |`,
    `| Iteration | iteration-${opts.iteration} |`,
    `| Harness | ${opts.harness} |`,
    `| Agent model | ${opts.agentModel ?? "unspecified"} |`,
    `| Judge model | ${opts.judgeModel ?? "unspecified"} |`,
    `| Conditions | ${conditionNames.join(", ") || "unknown"} |`,
    `| Run timestamp | ${timestamp} |`,
    `| Label | ${opts.label ?? "(none)"} |`,
    `| Promoted from commit | ${head} |`,
    "",
    "Files:",
    "- `benchmark.json` — aggregate pass-rate / duration / token deltas.",
    "- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.",
    "",
  ].join("\n");
  writeFileSync(join(baselineDir, "BASELINE.md"), `${provenance}\n`);

  // Mark the iteration as committed so `teardown` can safely reclaim its
  // workspace — without this marker teardown preserves the iteration as
  // uncommitted results.
  writeFileSync(
    join(iterationDir, PROMOTED_MARKER),
    `${JSON.stringify(
      {
        promoted_at: new Date().toISOString(),
        baseline_dir: baselineDir,
        commit: head,
      },
      null,
      2,
    )}\n`,
  );

  return { baselineDir, gradingsCopied };
}
164
-
165
// CLI entry point: runs only when this file is executed directly.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);

  /** Value following `flag` in argv, or `null` when the flag or its value is absent. */
  const valueAfter = (flag: string): string | null => {
    const at = argv.indexOf(flag);
    return at === -1 ? null : (argv[at + 1] ?? null);
  };

  let ctx: ReturnType<typeof detectRunContext>;
  try {
    ctx = detectRunContext(argv);
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  // --iteration is mandatory; an absent or empty value is rejected.
  const iteration = valueAfter("--iteration");
  if (!iteration) die("missing --iteration <N>");

  const { baselineDir, gradingsCopied } = promoteBaseline({
    workspaceRoot: ctx.workspaceRoot,
    skillName: ctx.skillName,
    skillSubdir: ctx.skillSubdir,
    iteration,
    harness: ctx.harness,
    label: valueAfter("--label"),
    agentModel: valueAfter("--agent-model"),
    judgeModel: valueAfter("--judge-model"),
    gitCwd: ctx.skillSubdir,
  });

  const plural = gradingsCopied === 1 ? "" : "s";
  console.log(
    `Promoted baseline for ${ctx.skillName} → ${baselineDir} (benchmark.json + ${gradingsCopied} grading file${plural} + BASELINE.md)`,
  );
}
- }