@slowdini/slow-powers-opencode 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +69 -5
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/hardening-plans/SKILL.md +29 -7
- package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
- package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
- package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
- package/skills/hardening-plans/evals/evals.json +46 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,201 +0,0 @@
|
|
|
1
|
-
// Plugin-shadow detector (Claude Code). The runner stages eval skills into the
|
|
2
|
-
// project-local `.claude/skills/` dir, but eval subagents are dispatched via the
|
|
3
|
-
// Task tool and run in-process — so they ALSO inherit whatever skills the
|
|
4
|
-
// orchestrator session loaded from installed plugins and the global skills dir.
|
|
5
|
-
// When a staged skill name collides with one of those, both copies are
|
|
6
|
-
// discoverable: the with/without comparison is contaminated and the control arm
|
|
7
|
-
// is not truly skill-absent.
|
|
8
|
-
//
|
|
9
|
-
// The runner cannot unload a plugin from a running session (plugins load at
|
|
10
|
-
// session start), so this module only *detects and reports* the overlap. It
|
|
11
|
-
// reads declared settings as a best-effort proxy for what the session loaded —
|
|
12
|
-
// it can't observe the live-loaded set, so a session that changed settings
|
|
13
|
-
// without restarting may differ. Isolation itself is a launch-time concern; see
|
|
14
|
-
// harness-details/claude.md → "Isolating from installed plugins".
|
|
15
|
-
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
16
|
-
import { homedir } from "node:os";
|
|
17
|
-
import { join } from "node:path";
|
|
18
|
-
|
|
19
|
-
/**
 * One place where a second copy of a staged skill was found.
 *
 * - `global-skill`: a skill folder under the config dir's `skills/` directory.
 * - `plugin`: a skill folder under an enabled plugin's `skills/` directory;
 *   `plugin` carries the plugin's key from the installed-plugins manifest.
 */
export type ShadowSource =
  | { kind: "global-skill"; skill_name: string; path: string }
  | { kind: "plugin"; plugin: string; skill_name: string; path: string };

/** Outcome of a shadow scan: every collision found, plus the config dir that was inspected. */
export type PluginShadowReport = {
  shadowed: ShadowSource[];
  config_dir: string;
};

// Pointer to the isolation guidance; interpolated into the warning and banner text.
const ISOLATION_DOC =
  'harness-details/claude.md → "Isolating from installed plugins"';
|
|
30
|
-
|
|
31
|
-
/**
 * Locates the Claude Code config directory.
 *
 * @param env Environment to consult; defaults to `process.env`.
 * @returns `CLAUDE_CONFIG_DIR` exactly as given when it contains any
 *   non-whitespace character, otherwise `<home>/.claude`.
 */
export function resolveConfigDir(env: NodeJS.ProcessEnv = process.env): string {
  const configured = env.CLAUDE_CONFIG_DIR;
  // A blank or whitespace-only override counts as "not set".
  if (typeof configured === "string" && configured.trim().length > 0) {
    return configured;
  }
  return join(homedir(), ".claude");
}
|
|
36
|
-
|
|
37
|
-
/**
 * Best-effort JSON loader.
 *
 * @param path File to read as UTF-8 and parse.
 * @returns The parsed value cast to `T` (no shape validation is performed),
 *   or `null` when the file is absent, unreadable, or not valid JSON.
 */
function readJsonSafe<T>(path: string): T | null {
  let parsed: T | null = null;
  if (existsSync(path)) {
    try {
      const raw = readFileSync(path, "utf8");
      parsed = JSON.parse(raw) as T;
    } catch {
      // Unreadable or malformed files are treated the same as missing ones.
      parsed = null;
    }
  }
  return parsed;
}
|
|
45
|
-
|
|
46
|
-
/** The only part of a settings file this module reads. */
type Settings = { enabledPlugins?: Record<string, boolean> };

/**
 * Computes the effective `enabledPlugins` map across the three settings
 * scopes, lowest precedence first: user (`<configDir>/settings.json`), project
 * (`<cwd>/.claude/settings.json`), local (`<cwd>/.claude/settings.local.json`).
 *
 * Each later file overrides matching keys from earlier ones, so a
 * project-scope `false` masks a user-scope `true`. Files that are missing or
 * unparseable, or that have no `enabledPlugins`, contribute nothing.
 */
export function resolveEnabledPlugins(opts: {
  configDir: string;
  cwd: string;
}): Record<string, boolean> {
  const projectSettingsDir = join(opts.cwd, ".claude");
  const lowestPrecedenceFirst = [
    join(opts.configDir, "settings.json"),
    join(projectSettingsDir, "settings.json"),
    join(projectSettingsDir, "settings.local.json"),
  ];
  return lowestPrecedenceFirst.reduce<Record<string, boolean>>(
    (effective, settingsFile) => {
      const declared = readJsonSafe<Settings>(settingsFile)?.enabledPlugins;
      return declared ? { ...effective, ...declared } : effective;
    },
    {},
  );
}
|
|
70
|
-
|
|
71
|
-
/**
 * Lists the immediate children of `dir` that are skill folders, i.e.
 * directories containing a `SKILL.md`.
 *
 * @returns One `{ name, path }` per skill folder, in directory-listing order.
 *   Returns an empty list when `dir` is missing or cannot be listed; children
 *   that cannot be stat'ed are skipped.
 */
function skillFolderNames(dir: string): Array<{ name: string; path: string }> {
  if (!existsSync(dir)) return [];

  let children: string[];
  try {
    children = readdirSync(dir);
  } catch {
    return [];
  }

  // statSync throws on entries such as dangling symlinks; treat those as "not a directory".
  const isDirectory = (candidate: string): boolean => {
    try {
      return statSync(candidate).isDirectory();
    } catch {
      return false;
    }
  };

  return children
    .map((name) => ({ name, path: join(dir, name) }))
    .filter(
      (child) =>
        isDirectory(child.path) && existsSync(join(child.path, "SKILL.md")),
    );
}
|
|
93
|
-
|
|
94
|
-
/** The part of `plugins/installed_plugins.json` this module reads: plugin key → install records. */
type InstalledPlugins = {
  plugins?: Record<string, Array<{ installPath?: string }>>;
};

/**
 * Enumerates the skill folders shipped by installed plugins that are enabled.
 *
 * Reads `<configDir>/plugins/installed_plugins.json`. A plugin is considered
 * only when `opts.enabled[key]` is exactly `true`. For each of its install
 * records that has an `installPath`, every skill folder under
 * `<installPath>/skills` is reported.
 *
 * @returns One entry per discovered skill folder; empty when the manifest is
 *   missing, unparseable, or has no `plugins` map.
 */
export function listEnabledPluginSkills(opts: {
  configDir: string;
  enabled: Record<string, boolean>;
}): Array<{ plugin: string; skill_name: string; path: string }> {
  const found: Array<{ plugin: string; skill_name: string; path: string }> = [];

  const manifestPath = join(opts.configDir, "plugins", "installed_plugins.json");
  const registry = readJsonSafe<InstalledPlugins>(manifestPath)?.plugins;
  if (!registry) return found;

  Object.entries(registry).forEach(([pluginKey, installs]) => {
    // Absent or `false` in the effective settings means the plugin cannot shadow anything.
    if (opts.enabled[pluginKey] !== true) return;

    for (const install of installs ?? []) {
      const installRoot = install.installPath;
      if (!installRoot) continue;
      for (const skill of skillFolderNames(join(installRoot, "skills"))) {
        found.push({
          plugin: pluginKey,
          skill_name: skill.name,
          path: skill.path,
        });
      }
    }
  });

  return found;
}
|
|
118
|
-
|
|
119
|
-
/**
 * Enumerates the skill folders in the global skills directory,
 * `<configDir>/skills`.
 *
 * @returns One `{ skill_name, path }` per skill folder found there.
 */
export function listGlobalSkills(
  configDir: string,
): Array<{ skill_name: string; path: string }> {
  const globalSkills: Array<{ skill_name: string; path: string }> = [];
  for (const folder of skillFolderNames(join(configDir, "skills"))) {
    globalSkills.push({ skill_name: folder.name, path: folder.path });
  }
  return globalSkills;
}
|
|
128
|
-
|
|
129
|
-
/**
 * Finds staged skills that are also discoverable from the live environment,
 * either through an enabled plugin or through the global skills directory.
 *
 * Matching is an exact comparison on the skill folder name. Plugin collisions
 * are listed before global-skill collisions.
 *
 * @param opts.configDir Config dir holding the plugin manifest, user settings and global skills.
 * @param opts.cwd Directory whose `.claude/` project and local settings are consulted.
 * @param opts.stagedSkillNames Folder names of the skills staged for the eval.
 */
export function detectPluginShadows(opts: {
  configDir: string;
  cwd: string;
  stagedSkillNames: string[];
}): PluginShadowReport {
  const stagedNames = new Set(opts.stagedSkillNames);
  const enabled = resolveEnabledPlugins({
    configDir: opts.configDir,
    cwd: opts.cwd,
  });

  const pluginHits = listEnabledPluginSkills({
    configDir: opts.configDir,
    enabled,
  })
    .filter((candidate) => stagedNames.has(candidate.skill_name))
    .map(
      (candidate): ShadowSource => ({
        kind: "plugin",
        plugin: candidate.plugin,
        skill_name: candidate.skill_name,
        path: candidate.path,
      }),
    );

  const globalHits = listGlobalSkills(opts.configDir)
    .filter((candidate) => stagedNames.has(candidate.skill_name))
    .map(
      (candidate): ShadowSource => ({
        kind: "global-skill",
        skill_name: candidate.skill_name,
        path: candidate.path,
      }),
    );

  return {
    config_dir: opts.configDir,
    shadowed: [...pluginHits, ...globalHits],
  };
}
|
|
167
|
-
|
|
168
|
-
/** Human-readable origin of a shadowing copy, for use inside warning sentences. */
function sourceLabel(s: ShadowSource): string {
  if (s.kind === "plugin") {
    return `enabled plugin '${s.plugin}'`;
  }
  return "the global skills dir";
}
|
|
173
|
-
|
|
174
|
-
/**
 * Renders one warning sentence per shadowed skill, intended for the
 * `validity_warnings` list in benchmark.json.
 *
 * @returns An empty array when the report contains no collisions.
 */
export function shadowValidityWarnings(report: PluginShadowReport): string[] {
  const warnings: string[] = [];
  for (const hit of report.shadowed) {
    const origin = sourceLabel(hit);
    warnings.push(
      [
        `staged skill '${hit.skill_name}' is also provided by ${origin} — `,
        `eval subagents could discover both copies, so with/without results may be `,
        `contaminated. Re-run from an isolated session (see ${ISOLATION_DOC}).`,
      ].join(""),
    );
  }
  return warnings;
}
|
|
183
|
-
|
|
184
|
-
/**
 * Builds the multi-line warning banner the runner prints at build time.
 *
 * @returns The banner (starting with a blank line, one bullet per shadowed
 *   skill), or the empty string when nothing is shadowed.
 */
export function formatShadowBanner(report: PluginShadowReport): string {
  const hits = report.shadowed;
  if (hits.length === 0) return "";

  const header = [
    "",
    "⚠ Plugin-shadow warning: skills staged for this eval are ALSO discoverable",
    " from your live environment:",
  ];

  const bullets: string[] = [];
  for (const hit of hits) {
    bullets.push(` • ${hit.skill_name} — ${sourceLabel(hit)}`);
  }

  const footer = [
    " Eval subagents (dispatched via the Task tool) inherit this session's plugins,",
    " so both the staged copy and the installed copy are discoverable — the",
    " with/without comparison may be contaminated and the control arm is not truly",
    " skill-absent. The runner cannot unload a plugin from a running session.",
    ` Re-run from an isolated session — see ${ISOLATION_DOC}.`,
  ];

  return header.concat(bullets, footer).join("\n");
}
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
Plan mode is active. The user wants to review an approach before any code is written, so you must NOT execute yet: do not make any edits, do not run any non-read-only tool, and do not change configs or system state. The only file you may write is the plan file. This constraint supersedes any other instruction you have received this session.
|
|
2
|
-
|
|
3
|
-
You are operating inside the harness's plan-mode workflow — a fixed, multi-phase procedure. Work through the phases in order:
|
|
4
|
-
|
|
5
|
-
1. **Understand.** Read the relevant code and gather context with read-only tools until you can describe the change concretely. Reuse what already exists rather than proposing new code.
|
|
6
|
-
2. **Design.** Decide the implementation approach and the trade-offs.
|
|
7
|
-
3. **Review.** Re-check the design against the user's request and resolve open questions with the user before finalizing.
|
|
8
|
-
4. **Write the plan.** Build the plan up incrementally in the plan file — this is the one file you are permitted to write. Name the files to change and how to verify the result.
|
|
9
|
-
5. **Hand off.** Call ExitPlanMode to submit the plan for the user's approval.
|
|
10
|
-
|
|
11
|
-
Terminal rail: your turn must end in exactly one of two ways — by asking the user a question, or by calling ExitPlanMode to present the finished plan. Do not stop for any other reason and do not begin implementation until the user has approved the plan. The plan-mode workflow already governs how you research, design, and present the work; stay on this rail through to ExitPlanMode.
|
|
@@ -1,281 +0,0 @@
|
|
|
1
|
-
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
-
import {
|
|
3
|
-
existsSync,
|
|
4
|
-
mkdirSync,
|
|
5
|
-
readFileSync,
|
|
6
|
-
rmSync,
|
|
7
|
-
writeFileSync,
|
|
8
|
-
} from "node:fs";
|
|
9
|
-
import { tmpdir } from "node:os";
|
|
10
|
-
import { join } from "node:path";
|
|
11
|
-
import { PROMOTED_MARKER } from "./workspace-teardown";
|
|
12
|
-
|
|
13
|
-
// Scratch root for every test case in this file; the pid suffix keeps separate
// test processes from sharing a directory.
const FIXTURE_ROOT = join(tmpdir(), `slow-powers-promote-test-${process.pid}`);
// The CLI under test, resolved next to this test file.
const PROMOTE_TS = join(import.meta.dir, "promote-baseline.ts");

beforeAll(function createFixtureRoot() {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

afterAll(function removeFixtureRoot() {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});
|
|
23
|
-
|
|
24
|
-
/** Writes `value` to `path` as 2-space-indented JSON followed by a trailing newline. */
function writeJson(path: string, value: unknown) {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, serialized + "\n");
}
|
|
27
|
-
|
|
28
|
-
describe("promote-baseline.ts (--skill-dir, isolated CWD)", () => {
  const SKILL_NAME = "mr-review";
  const SKILL_MD =
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n";
  const RUN_TIMESTAMP = "2026-05-27T00:00:00.000Z";

  /**
   * Creates the skill-under-test for one test case under
   * `<FIXTURE_ROOT>/<caseName>/skill-dir/mr-review/` (the CLI validates that
   * SKILL.md exists) and returns the paths the tests need. `cwd` is only
   * computed here, not created.
   */
  function makeSkillFixture(caseName: string) {
    const root = join(FIXTURE_ROOT, caseName);
    const skillDir = join(root, "skill-dir");
    const skillSub = join(skillDir, SKILL_NAME);
    mkdirSync(skillSub, { recursive: true });
    writeFileSync(join(skillSub, "SKILL.md"), SKILL_MD);
    return {
      skillDir,
      skillSub,
      cwd: join(root, "work"),
      baselineDir: join(skillSub, "evals", "baseline"),
    };
  }

  /** Creates `<cwd>/skills-workspace/mr-review/iteration-<N>` and returns its path. */
  function makeIterationDir(cwd: string, iteration: string): string {
    const iterationDir = join(
      cwd,
      "skills-workspace",
      SKILL_NAME,
      `iteration-${iteration}`,
    );
    mkdirSync(iterationDir, { recursive: true });
    return iterationDir;
  }

  /** Writes the standard new-skill `conditions.json` into an iteration dir. */
  function writeConditions(iterationDir: string, skillSub: string): void {
    writeJson(join(iterationDir, "conditions.json"), {
      mode: "new-skill",
      conditions: [
        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
        { name: "without_skill", skill_path: null },
      ],
      timestamp: RUN_TIMESTAMP,
      harness: "claude-code",
    });
  }

  /** Runs the promote CLI from the fixture's working dir and captures its output. */
  function runPromote(
    fixture: { skillDir: string; cwd: string },
    iteration: string,
    extraArgs: string[] = [],
  ) {
    return Bun.spawnSync(
      [
        "bun",
        "run",
        PROMOTE_TS,
        "--skill-dir",
        fixture.skillDir,
        "--skill",
        SKILL_NAME,
        "--iteration",
        iteration,
        ...extraArgs,
      ],
      { cwd: fixture.cwd, stdout: "pipe", stderr: "pipe" },
    );
  }

  /** Asserts stderr first so a failing run shows its error text, not just the exit code. */
  function expectCleanSuccess(res: ReturnType<typeof runPromote>): void {
    expect(res.stderr.toString()).toBe("");
    expect(res.exitCode).toBe(0);
  }

  test("copies benchmark + per-run gradings into the skill's committed baseline/", () => {
    const fixture = makeSkillFixture("promote-basic");
    const iterationDir = makeIterationDir(fixture.cwd, "2");

    writeConditions(iterationDir, fixture.skillSub);
    writeJson(join(iterationDir, "benchmark.json"), {
      run_summary: {
        with_skill: { pass_rate: { mean: 0.83 } },
        without_skill: { pass_rate: { mean: 0.33 } },
      },
      delta: { pass_rate: 0.5 },
    });

    const mkGrading = (evalId: string, cond: string, passRate: number) => {
      const condDir = join(iterationDir, `eval-${evalId}`, cond);
      mkdirSync(condDir, { recursive: true });
      writeJson(join(condDir, "grading.json"), {
        assertion_results: [
          {
            id: "a1",
            passed: passRate > 0,
            evidence: `${cond} evidence`,
            confidence: 1,
          },
        ],
        summary: { passed: 1, failed: 0, total: 1, pass_rate: passRate },
      });
    };
    mkGrading("e1", "with_skill", 1);
    mkGrading("e1", "without_skill", 0);

    expectCleanSuccess(runPromote(fixture, "2"));

    const { baselineDir } = fixture;

    // benchmark.json copied verbatim.
    const benchmarkPath = join(baselineDir, "benchmark.json");
    expect(existsSync(benchmarkPath)).toBe(true);
    const benchmark = JSON.parse(readFileSync(benchmarkPath, "utf8")) as {
      delta: { pass_rate: number };
    };
    expect(benchmark.delta.pass_rate).toBe(0.5);

    // Per-run gradings copied under grading/<eval-id>__<condition>.json.
    const withGrading = join(baselineDir, "grading", "e1__with_skill.json");
    const withoutGrading = join(
      baselineDir,
      "grading",
      "e1__without_skill.json",
    );
    expect(existsSync(withGrading)).toBe(true);
    expect(existsSync(withoutGrading)).toBe(true);
    const withParsed = JSON.parse(readFileSync(withGrading, "utf8")) as {
      summary: { pass_rate: number };
    };
    expect(withParsed.summary.pass_rate).toBe(1);

    // Provenance file records mode, iteration, harness, timestamp.
    const provenancePath = join(baselineDir, "BASELINE.md");
    expect(existsSync(provenancePath)).toBe(true);
    const provenance = readFileSync(provenancePath, "utf8");
    expect(provenance).toContain("new-skill");
    expect(provenance).toContain("iteration-2");
    expect(provenance).toContain("claude-code");
    expect(provenance).toContain(RUN_TIMESTAMP);
    // Model rows default to "unspecified" when no flags are passed.
    expect(provenance).toContain("Agent model | unspecified");
    expect(provenance).toContain("Judge model | unspecified");
  });

  test("drops a .promoted.json marker into the iteration dir for teardown", () => {
    const fixture = makeSkillFixture("promote-marker");
    const iterationDir = makeIterationDir(fixture.cwd, "3");
    writeJson(join(iterationDir, "benchmark.json"), {
      delta: { pass_rate: 0 },
    });

    expectCleanSuccess(runPromote(fixture, "3"));

    const markerPath = join(iterationDir, PROMOTED_MARKER);
    expect(existsSync(markerPath)).toBe(true);
    const marker = JSON.parse(readFileSync(markerPath, "utf8")) as {
      promoted_at: string;
      baseline_dir: string;
    };
    expect(marker.promoted_at).toBeTruthy();
    expect(marker.baseline_dir).toBe(fixture.baselineDir);
  });

  test("records agent and judge models in provenance when flags are passed", () => {
    const fixture = makeSkillFixture("promote-models");
    const iterationDir = makeIterationDir(fixture.cwd, "1");
    writeConditions(iterationDir, fixture.skillSub);
    writeJson(join(iterationDir, "benchmark.json"), {
      delta: { pass_rate: 0 },
    });

    expectCleanSuccess(
      runPromote(fixture, "1", [
        "--agent-model",
        "claude-haiku-4-5-20251001",
        "--judge-model",
        "claude-opus-4-7",
      ]),
    );

    const provenance = readFileSync(
      join(fixture.baselineDir, "BASELINE.md"),
      "utf8",
    );
    expect(provenance).toContain("Agent model | claude-haiku-4-5-20251001");
    expect(provenance).toContain("Judge model | claude-opus-4-7");
  });

  test("fails clearly when the iteration directory is missing", () => {
    const fixture = makeSkillFixture("promote-missing");
    // The working dir exists, but no skills-workspace iteration is created inside it.
    mkdirSync(fixture.cwd, { recursive: true });

    const res = runPromote(fixture, "9");
    expect(res.exitCode).not.toBe(0);
    expect(res.stderr.toString()).toContain("iteration-9");
  });
});
|
|
@@ -1,204 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
import {
|
|
3
|
-
copyFileSync,
|
|
4
|
-
existsSync,
|
|
5
|
-
mkdirSync,
|
|
6
|
-
readdirSync,
|
|
7
|
-
readFileSync,
|
|
8
|
-
writeFileSync,
|
|
9
|
-
} from "node:fs";
|
|
10
|
-
import { join } from "node:path";
|
|
11
|
-
import { detectRunContext } from "./context";
|
|
12
|
-
import type { ConditionsRecord } from "./types";
|
|
13
|
-
import { PROMOTED_MARKER } from "./workspace-teardown";
|
|
14
|
-
|
|
15
|
-
/** Prints `error: <msg>` to stderr and terminates the process with exit code 1. */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
|
|
19
|
-
|
|
20
|
-
/** Creates `path` (and any missing parents) unless something already exists there. */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
|
|
23
|
-
|
|
24
|
-
/**
 * Resolves the abbreviated HEAD commit for the repository at `cwd` by running
 * `git rev-parse --short HEAD`.
 *
 * @returns The trimmed command output, or `"unknown"` when git exits non-zero
 *   or the spawn throws.
 */
function gitHead(cwd: string): string {
  let shortSha = "unknown";
  try {
    const { exitCode, stdout } = Bun.spawnSync(
      ["git", "rev-parse", "--short", "HEAD"],
      { cwd, stdout: "pipe", stderr: "ignore" },
    );
    if (exitCode === 0) shortSha = stdout.toString().trim();
  } catch {
    // not a git repo / git unavailable — provenance is still useful without it
  }
  return shortSha;
}
|
|
37
|
-
|
|
38
|
-
/** Inputs to `promoteBaseline`. */
export type PromoteOptions = {
  /** Workspace root; the iteration is read from `<workspaceRoot>/<skillName>/iteration-<iteration>`. */
  workspaceRoot: string;
  /** Skill name: the workspace subfolder and the title of the provenance file. */
  skillName: string;
  /** Skill folder that receives the committed `evals/baseline/` directory. */
  skillSubdir: string;
  /** Iteration identifier, used verbatim in the `iteration-<iteration>` folder name. */
  iteration: string;
  /** Harness name recorded in the provenance table. */
  harness: string;
  /** Optional free-form label; the provenance table shows `(none)` when null. */
  label: string | null;
  /**
   * Operator-declared models for provenance. The runner never dispatches the
   * agent/judge itself, so it cannot observe these — record what was used.
   * The provenance table shows `unspecified` for a null value.
   */
  agentModel: string | null;
  judgeModel: string | null;
  /** Directory used to resolve the committing repo's git HEAD for provenance. */
  gitCwd: string;
};
|
|
54
|
-
|
|
55
|
-
/**
 * Promotes one workspace iteration to the skill's `evals/baseline/` directory.
 *
 * Reads from `<workspaceRoot>/<skillName>/iteration-<iteration>` and writes to
 * `<skillSubdir>/evals/baseline`:
 * - `benchmark.json`, copied verbatim from the iteration;
 * - `grading/<eval-id>__<condition>.json`, one per `grading.json` found under
 *   `eval-<eval-id>/<condition>/` in the iteration;
 * - `BASELINE.md`, a provenance table built from `opts`, the iteration's
 *   optional `conditions.json`, and the value returned by `gitHead`.
 *
 * Nothing else from the iteration directory is copied. After the baseline is
 * written, a `PROMOTED_MARKER` file is written into the iteration directory so
 * that teardown can tell the iteration has been promoted.
 *
 * All preconditions (iteration directory, `benchmark.json`, parseable
 * `conditions.json`) are checked before anything is written, so a bad input
 * does not leave a partially promoted baseline behind.
 *
 * @param opts - Locations, iteration id, and provenance labels for the run.
 * @returns The baseline directory path and the number of grading files copied.
 */
export function promoteBaseline(opts: PromoteOptions): {
  baselineDir: string;
  gradingsCopied: number;
} {
  const iterationDir = join(
    opts.workspaceRoot,
    opts.skillName,
    `iteration-${opts.iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(
      `not found: ${iterationDir} (build/grade iteration-${opts.iteration} first)`,
    );
  }

  const benchmarkSrc = join(iterationDir, "benchmark.json");
  if (!existsSync(benchmarkSrc)) {
    die(
      `missing benchmark.json in iteration-${opts.iteration} — run 'evals:aggregate' before promoting`,
    );
  }

  // conditions.json is optional; when absent every derived field falls back to
  // "unknown". A present-but-unparseable file is reported through `die` like
  // the other precondition failures instead of escaping as a raw SyntaxError.
  const conditionsSrc = join(iterationDir, "conditions.json");
  let conditions: ConditionsRecord | null = null;
  if (existsSync(conditionsSrc)) {
    try {
      conditions = JSON.parse(readFileSync(conditionsSrc, "utf8"));
    } catch (err) {
      die(
        `invalid conditions.json in iteration-${opts.iteration}: ${
          err instanceof Error ? err.message : String(err)
        }`,
      );
    }
  }

  // Derive the provenance fields up front (before any file is written). The
  // parsed JSON is not validated, so tolerate a record without a `conditions`
  // array rather than throwing halfway through the promotion.
  const mode = conditions?.mode ?? "unknown";
  const timestamp = conditions?.timestamp ?? "unknown";
  const conditionNames = conditions?.conditions?.map((c) => c.name) ?? [];

  const baselineDir = join(opts.skillSubdir, "evals", "baseline");
  const gradingDir = join(baselineDir, "grading");
  ensureDir(gradingDir);

  copyFileSync(benchmarkSrc, join(baselineDir, "benchmark.json"));

  // NOTE(review): files already present in `grading/` from an earlier
  // promotion are overwritten when names match but never removed, so gradings
  // for evals/conditions that no longer exist will linger — confirm intended.
  let gradingsCopied = 0;
  for (const entry of readdirSync(iterationDir, { withFileTypes: true })) {
    // Only `eval-<id>` directories hold per-run results.
    if (!entry.isDirectory() || !entry.name.startsWith("eval-")) continue;
    const evalId = entry.name.slice("eval-".length);
    const evalDir = join(iterationDir, entry.name);
    for (const cond of readdirSync(evalDir, { withFileTypes: true })) {
      if (!cond.isDirectory()) continue;
      const gradingSrc = join(evalDir, cond.name, "grading.json");
      // Condition directories without a grading.json are skipped silently.
      if (!existsSync(gradingSrc)) continue;
      copyFileSync(
        gradingSrc,
        join(gradingDir, `${evalId}__${cond.name}.json`),
      );
      gradingsCopied++;
    }
  }

  const head = gitHead(opts.gitCwd);
  const provenance = [
    `# Baseline — ${opts.skillName}`,
    "",
    "Committed reference output from a canonical eval run. Regenerate with",
    "`bun run evals:promote-baseline -- --skill " +
      `${opts.skillName} --iteration <N>` +
      "` after aggregating. The ephemeral workspace (run records, timing,",
    "dispatch files, produced outputs) stays gitignored under `skills-workspace/`",
    "and is reclaimable by `evals:teardown` once promoted (this commit's marker).",
    "",
    "| Field | Value |",
    "|-------|-------|",
    `| Mode | ${mode} |`,
    `| Iteration | iteration-${opts.iteration} |`,
    `| Harness | ${opts.harness} |`,
    `| Agent model | ${opts.agentModel ?? "unspecified"} |`,
    `| Judge model | ${opts.judgeModel ?? "unspecified"} |`,
    `| Conditions | ${conditionNames.join(", ") || "unknown"} |`,
    `| Run timestamp | ${timestamp} |`,
    `| Label | ${opts.label ?? "(none)"} |`,
    `| Promoted from commit | ${head} |`,
    "",
    "Files:",
    "- `benchmark.json` — aggregate pass-rate / duration / token deltas.",
    "- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.",
    "",
  ].join("\n");
  writeFileSync(join(baselineDir, "BASELINE.md"), `${provenance}\n`);

  // Mark the iteration as committed so `teardown` can safely reclaim its
  // workspace — without this marker teardown preserves the iteration as
  // uncommitted results. Written last, after the baseline itself is complete.
  writeFileSync(
    join(iterationDir, PROMOTED_MARKER),
    `${JSON.stringify(
      {
        promoted_at: new Date().toISOString(),
        baseline_dir: baselineDir,
        commit: head,
      },
      null,
      2,
    )}\n`,
  );

  return { baselineDir, gradingsCopied };
}
|
|
164
|
-
|
|
165
|
-
// CLI entry point: runs only when this module is executed directly.
// Flags read here: --iteration <N> (required), --label, --agent-model,
// --judge-model (all optional). The full argv is also handed to
// `detectRunContext`, which supplies the workspace/skill/harness context.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);

  /**
   * Returns the token following `flag`, or null when the flag is absent, is
   * the last argument, or is immediately followed by another `--flag` (so a
   * forgotten value is never mistaken for the next option's name).
   */
  const flagValue = (flag: string): string | null => {
    const idx = argv.indexOf(flag);
    if (idx === -1) return null;
    const value = argv[idx + 1];
    return value === undefined || value.startsWith("--") ? null : value;
  };

  let ctx: ReturnType<typeof detectRunContext>;
  try {
    ctx = detectRunContext(argv);
  } catch (err) {
    // Surface context-detection failures through the shared `die` reporter.
    die(err instanceof Error ? err.message : String(err));
  }

  const iteration = flagValue("--iteration");
  if (!iteration) die("missing --iteration <N>");

  const label = flagValue("--label");
  const agentModel = flagValue("--agent-model");
  const judgeModel = flagValue("--judge-model");

  const { baselineDir, gradingsCopied } = promoteBaseline({
    workspaceRoot: ctx.workspaceRoot,
    skillName: ctx.skillName,
    skillSubdir: ctx.skillSubdir,
    iteration,
    harness: ctx.harness,
    label,
    agentModel,
    judgeModel,
    // The promoted commit is resolved from the skill's own directory.
    gitCwd: ctx.skillSubdir,
  });

  console.log(
    `Promoted baseline for ${ctx.skillName} → ${baselineDir} (benchmark.json + ${gradingsCopied} grading file${gradingsCopied === 1 ? "" : "s"} + BASELINE.md)`,
  );
}
|