@slowdini/slow-powers-opencode 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
-
import { mkdirSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { tmpdir } from "node:os";
|
|
4
|
-
import { join } from "node:path";
|
|
5
|
-
import { resolveAgentDescription } from "./fill-transcripts";
|
|
6
|
-
|
|
7
|
-
// Scratch directory for this test file: lives under the OS temp dir and is
// suffixed with the current process id.
const ROOT = join(tmpdir(), "fill-transcripts-test-" + process.pid);

// Create the scratch directory once before any test runs.
beforeAll(() => {
  mkdirSync(ROOT, { recursive: true });
});

// Remove the scratch directory (and everything beneath it) after the last test.
afterAll(() => {
  rmSync(ROOT, { force: true, recursive: true });
});
|
|
11
|
-
|
|
12
|
-
/**
 * Test helper: creates `iterationDir` (including missing parents) and writes a
 * pretty-printed `dispatch.json` into it containing a fixed `run_nonce` and
 * the supplied `tasks`.
 */
function writeDispatch(iterationDir: string, tasks: unknown[]) {
  const payload = { run_nonce: "abc123", tasks };
  const target = join(iterationDir, "dispatch.json");
  mkdirSync(iterationDir, { recursive: true });
  writeFileSync(target, JSON.stringify(payload, null, 2));
}
|
|
19
|
-
|
|
20
|
-
describe("resolveAgentDescription", () => {
  // Value the resolver is expected to produce whenever dispatch.json cannot
  // supply a description for ("crash", "with_skill").
  const LEGACY_DESCRIPTION = "crash:with_skill";

  test("returns the namespaced agent_description from dispatch.json", () => {
    const iterationDir = join(ROOT, "iter-canonical");
    const expectedByCondition = {
      with_skill: "crash:with_skill:i3-abc123",
      without_skill: "crash:without_skill:i3-abc123",
    };

    // One dispatch task per condition, each carrying its own description.
    const tasks = Object.entries(expectedByCondition).map(
      ([condition, agent_description]) => ({
        eval_id: "crash",
        condition,
        agent_description,
      }),
    );
    writeDispatch(iterationDir, tasks);

    for (const [condition, description] of Object.entries(
      expectedByCondition,
    )) {
      const resolved = resolveAgentDescription(iterationDir, "crash", condition);
      expect(resolved).toBe(description);
    }
  });

  test("falls back to legacy reconstruction when dispatch.json is absent", () => {
    const iterationDir = join(ROOT, "iter-no-dispatch");
    mkdirSync(iterationDir, { recursive: true });

    const resolved = resolveAgentDescription(iterationDir, "crash", "with_skill");
    expect(resolved).toBe(LEGACY_DESCRIPTION);
  });

  test("falls back when the task is missing from dispatch.json", () => {
    const iterationDir = join(ROOT, "iter-partial");
    // dispatch.json exists but only describes an unrelated eval.
    const unrelatedTask = {
      eval_id: "other",
      condition: "with_skill",
      agent_description: "other:with_skill:i1-x",
    };
    writeDispatch(iterationDir, [unrelatedTask]);

    const resolved = resolveAgentDescription(iterationDir, "crash", "with_skill");
    expect(resolved).toBe(LEGACY_DESCRIPTION);
  });

  test("falls back when dispatch.json is malformed", () => {
    const iterationDir = join(ROOT, "iter-malformed");
    mkdirSync(iterationDir, { recursive: true });
    const dispatchPath = join(iterationDir, "dispatch.json");
    writeFileSync(dispatchPath, "{ not valid json");

    const resolved = resolveAgentDescription(iterationDir, "crash", "with_skill");
    expect(resolved).toBe(LEGACY_DESCRIPTION);
  });
});
|
|
@@ -1,154 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { join } from "node:path";
|
|
4
|
-
import * as claudeAdapter from "./adapters/claude-code-transcript";
|
|
5
|
-
import { detectRunContext } from "./context";
|
|
6
|
-
import type { ConditionsRecord, RunRecord } from "./types";
|
|
7
|
-
import { validateAgainstSchema } from "./validate-schema";
|
|
8
|
-
|
|
9
|
-
/**
 * Reports a fatal error and stops the process.
 *
 * Writes `error: <msg>` via `console.error`, then exits with status 1.
 * Never returns.
 */
function die(msg: string): never {
  console.error("error: " + msg);
  return process.exit(1);
}
|
|
13
|
-
|
|
14
|
-
type DispatchTaskRef = {
  eval_id: string;
  condition: string;
  agent_description?: string;
};

/**
 * Runtime guard for one entry of `dispatch.json`'s `tasks` array.
 *
 * Only the two lookup keys are checked here; `agent_description` is validated
 * separately at the point of use so a bad description on the matching task
 * leads to the legacy fallback rather than to a different task being picked.
 */
function isDispatchTaskRef(value: unknown): value is DispatchTaskRef {
  if (typeof value !== "object" || value === null) return false;
  const entry = value as Record<string, unknown>;
  return typeof entry.eval_id === "string" && typeof entry.condition === "string";
}

/**
 * The canonical dispatch description for an (eval, condition) run.
 *
 * The runner writes a unique `agent_description` per task into `dispatch.json`
 * (namespaced with the iteration + run nonce). Reading it back — rather than
 * reconstructing `<eval_id>:<condition>` — is what binds each run to the exact
 * agent that produced it, even when one parent session's shared subagents dir
 * holds colliding descriptions from other iterations.
 *
 * The file's contents are treated as untrusted: entries that are not objects
 * with string `eval_id`/`condition` are skipped instead of aborting the
 * search, and only a non-empty string `agent_description` is ever returned.
 *
 * @param iterationDir Directory expected to contain `dispatch.json`.
 * @param evalId Eval identifier to look up.
 * @param condition Condition name to look up.
 * @returns The matching task's `agent_description`, or the legacy
 *   `<evalId>:<condition>` reconstruction when `dispatch.json` is absent,
 *   unreadable, malformed, or has no usable description for the run
 *   (hand-authored/operator runs).
 */
export function resolveAgentDescription(
  iterationDir: string,
  evalId: string,
  condition: string,
): string {
  const legacy = `${evalId}:${condition}`;

  let parsed: unknown;
  try {
    // A missing file surfaces as a thrown ENOENT, so no separate existence
    // check is needed before reading.
    parsed = JSON.parse(readFileSync(join(iterationDir, "dispatch.json"), "utf8"));
  } catch {
    return legacy;
  }

  if (typeof parsed !== "object" || parsed === null) return legacy;
  const tasks = (parsed as { tasks?: unknown }).tasks;
  if (!Array.isArray(tasks)) return legacy;

  // First well-formed entry for this (eval, condition) wins.
  for (const candidate of tasks as unknown[]) {
    if (!isDispatchTaskRef(candidate)) continue;
    if (candidate.eval_id !== evalId || candidate.condition !== condition) continue;
    const described: unknown = candidate.agent_description;
    return typeof described === "string" && described !== "" ? described : legacy;
  }
  return legacy;
}
|
|
51
|
-
|
|
52
|
-
/**
 * Parses the command-line arguments of the fill-transcripts script.
 *
 * Recognized flags:
 * - `--iteration <value>` (required)
 * - `--subagents-dir <path>` (required)
 * - `--overwrite` (boolean switch)
 *
 * A value flag whose following token is absent or is itself another `--flag`
 * is treated as missing, so `--iteration --overwrite` reports
 * "missing --iteration" instead of silently using `--overwrite` as the value.
 *
 * @param argv Arguments after the script path.
 * @returns The parsed `iteration`, `subagentsDir` and `overwrite` values.
 *   Does not return when a required flag is missing: `die` is called instead.
 */
function parseArgs(argv: string[]) {
  const valueOf = (name: string): string | undefined => {
    const at = argv.indexOf(`--${name}`);
    if (at === -1) return undefined;
    const next = argv[at + 1];
    // Reject "no token" and "next token is another flag" alike.
    if (next === undefined || next.startsWith("--")) return undefined;
    return next;
  };

  const iteration = valueOf("iteration");
  const subagentsDir = valueOf("subagents-dir");
  const overwrite = argv.includes("--overwrite");

  if (!iteration) die("missing --iteration");
  if (!subagentsDir)
    die(
      "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  return { iteration, subagentsDir, overwrite };
}
|
|
69
|
-
|
|
70
|
-
// Script entry point: the body runs only when this module is executed directly.
// For every eval/condition run.json under the iteration directory it looks up
// the matching subagent transcript and writes the parsed tool invocations back
// into run.json, then prints a summary of filled / skipped / missing runs.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const { iteration, subagentsDir, overwrite } = parseArgs(cliArgs);
  const runContext = detectRunContext(cliArgs);
  const skill = runContext.skillName;

  if (!existsSync(subagentsDir)) {
    die(`subagents-dir not found: ${subagentsDir}`);
  }

  // The Claude Code transcript adapter is the only one wired in here.
  const adapter = claudeAdapter;
  console.log("Using harness transcript adapter: claude-code");

  const iterationDir = join(
    runContext.workspaceRoot,
    skill,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(`not found: ${iterationDir}`);
  }

  const conditionsPath = join(iterationDir, "conditions.json");
  if (!existsSync(conditionsPath)) {
    die(`missing: ${conditionsPath}`);
  }
  const conditions: ConditionsRecord = JSON.parse(
    readFileSync(conditionsPath, "utf8"),
  );
  const conditionNames = conditions.conditions.map((entry) => entry.name);

  // Outcome counters reported in the closing summary.
  const tally = { filled: 0, skipped: 0, missing: 0 };

  for (const evalDir of readdirSync(iterationDir)) {
    // Only `eval-*` entries hold runs.
    if (!evalDir.startsWith("eval-")) continue;
    const evalId = evalDir.replace(/^eval-/, "");

    for (const cond of conditionNames) {
      const runPath = join(iterationDir, evalDir, cond, "run.json");
      if (!existsSync(runPath)) continue;

      const run = validateAgainstSchema<RunRecord>(
        "run-record",
        JSON.parse(readFileSync(runPath, "utf8")),
        runPath,
      );

      // Leave already-populated records alone unless --overwrite was given.
      const priorCount = Array.isArray(run.tool_invocations)
        ? run.tool_invocations.length
        : 0;
      if (priorCount > 0 && !overwrite) {
        console.log(
          `skip ${evalId}/${cond}: already has ${priorCount} tool_invocations (use --overwrite to replace)`,
        );
        tally.skipped += 1;
        continue;
      }

      const description = resolveAgentDescription(iterationDir, evalId, cond);
      const subagent = adapter.findByDescription(subagentsDir, description);
      if (!subagent) {
        console.warn(
          `miss ${evalId}/${cond}: no subagent transcript with description='${description}'`,
        );
        tally.missing += 1;
        continue;
      }

      const invocations = adapter.parseTranscript(subagent.jsonlPath);
      run.tool_invocations = invocations;
      writeFileSync(runPath, `${JSON.stringify(run, null, 2)}\n`);
      console.log(
        `fill ${evalId}/${cond}: wrote ${invocations.length} tool_invocations from ${subagent.jsonlPath}`,
      );
      tally.filled += 1;
    }
  }

  console.log(
    `\nFilled: ${tally.filled}, skipped (already populated): ${tally.skipped}, missing transcript: ${tally.missing}`,
  );
  if (tally.missing > 0) {
    console.warn(
      "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json (or dispatch.json is absent and the legacy `eval-id:condition` reconstruction found no match). transcript_check assertions for those runs will be graded unverifiable.",
    );
  }
}
|
|
@@ -1,347 +0,0 @@
|
|
|
1
|
-
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
-
import {
|
|
3
|
-
existsSync,
|
|
4
|
-
mkdirSync,
|
|
5
|
-
readFileSync,
|
|
6
|
-
rmSync,
|
|
7
|
-
writeFileSync,
|
|
8
|
-
} from "node:fs";
|
|
9
|
-
import { tmpdir } from "node:os";
|
|
10
|
-
import { join } from "node:path";
|
|
11
|
-
import { checkSkillInvokedFromTranscript } from "./grade";
|
|
12
|
-
import type { ToolInvocation } from "./types";
|
|
13
|
-
|
|
14
|
-
describe("checkSkillInvokedFromTranscript", () => {
  test("returns true when transcript contains a Skill call with input.skill matching the slug", () => {
    const targetSlug =
      "slow-powers-eval-1-with_skill__verification-before-completion";
    // A matching Skill call sandwiched between unrelated tool calls.
    const transcript: ToolInvocation[] = [
      { name: "Bash", args: { command: "ls" }, ordinal: 0 },
      { name: "Skill", args: { skill: targetSlug }, ordinal: 1 },
      { name: "Read", args: { file_path: "/tmp/x" }, ordinal: 2 },
    ];

    const invoked = checkSkillInvokedFromTranscript(transcript, targetSlug);
    expect(invoked).toBe(true);
  });

  test("returns false when transcript has no Skill calls", () => {
    const transcript: ToolInvocation[] = [
      { name: "Bash", args: { command: "ls" }, ordinal: 0 },
      { name: "Read", args: { file_path: "/tmp/x" }, ordinal: 1 },
    ];

    const invoked = checkSkillInvokedFromTranscript(
      transcript,
      "slow-powers-eval-1-with_skill__foo",
    );
    expect(invoked).toBe(false);
  });

  test("returns false when Skill call references a different slug", () => {
    const targetSlug =
      "slow-powers-eval-1-with_skill__verification-before-completion";
    // Both Skill calls name some other skill.
    const transcript: ToolInvocation[] = [
      {
        name: "Skill",
        args: { skill: "slow-powers:writing-skills" },
        ordinal: 0,
      },
      {
        name: "Skill",
        args: { skill: "slow-powers-eval-2-old_skill__other" },
        ordinal: 1,
      },
    ];

    const invoked = checkSkillInvokedFromTranscript(transcript, targetSlug);
    expect(invoked).toBe(false);
  });

  test("returns false on empty invocations array", () => {
    const invoked = checkSkillInvokedFromTranscript([], "anything");
    expect(invoked).toBe(false);
  });

  test("tolerates Skill invocations whose args are missing or malformed", () => {
    const targetSlug = "slow-powers-eval-1-with_skill__foo";
    // args absent, args of the wrong type, and args lacking a `skill` field.
    const transcript: ToolInvocation[] = [
      { name: "Skill", ordinal: 0 },
      { name: "Skill", args: "not-an-object", ordinal: 1 },
      { name: "Skill", args: { other: "field" }, ordinal: 2 },
    ];

    const invoked = checkSkillInvokedFromTranscript(transcript, targetSlug);
    expect(invoked).toBe(false);
  });
});
|
|
71
|
-
|
|
72
|
-
// Scratch directory for the grade.ts integration tests: under the OS temp dir,
// suffixed with the current process id.
const GRADE_FIXTURE_ROOT = join(
  tmpdir(),
  "slow-powers-grade-test-" + process.pid,
);
// Path of the grade script, resolved next to this test file.
const GRADE_TS = join(import.meta.dir, "grade.ts");

// Create the scratch directory before any test runs.
beforeAll(() => {
  mkdirSync(GRADE_FIXTURE_ROOT, { recursive: true });
});

// Remove the scratch directory and its contents after the last test.
afterAll(() => {
  rmSync(GRADE_FIXTURE_ROOT, { force: true, recursive: true });
});
|
|
85
|
-
|
|
86
|
-
/**
 * Serializes `value` as 2-space-indented JSON and writes it to `path`,
 * followed by a single trailing newline.
 */
function writeJsonFile(path: string, value: unknown) {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, serialized + "\n");
}
|
|
89
|
-
|
|
90
|
-
// ---------------------------------------------------------------------------
// Shared scaffolding for the emitJudgeTasks integration tests below. Each test
// builds a throwaway skill + workspace on disk, runs grade.ts in a subprocess,
// and inspects what it produced. The fixture layout and CLI invocation are the
// same for every test, so they live here rather than being repeated per test.
// ---------------------------------------------------------------------------

// Skill name used by every fixture.
const FIXTURE_SKILL = "mr-review";

// Condition names written to conditions.json and used as run directories.
const FIXTURE_CONDITIONS = ["with_skill", "without_skill"];

// Eval whose prompt is meant to trigger the skill; present in every fixture.
const POSITIVE_EVAL = {
  id: "pos-eval",
  prompt: "Fix the failing build.",
  expected_output: "Agent debugs systematically.",
  assertions: [{ id: "a1", type: "llm_judge", rubric: "Did it debug?" }],
};

/**
 * Builds the on-disk fixture for one test case under
 * `GRADE_FIXTURE_ROOT/<caseName>`: a skill directory with `SKILL.md` and
 * `evals/evals.json`, plus a workspace `iteration-1` directory containing
 * `conditions.json`.
 *
 * @param caseName Sub-directory name that isolates this test case.
 * @param evals Eval definitions written to `evals.json`.
 * @returns Paths the test needs: subprocess `cwd`, the iteration directory,
 *   the `--skill-dir` value, and the fixture's `SKILL.md` path.
 */
function scaffoldGradeFixture(caseName: string, evals: unknown[]) {
  const root = join(GRADE_FIXTURE_ROOT, caseName);
  const skillDir = join(root, "skill-dir");
  const skillSub = join(skillDir, FIXTURE_SKILL);
  const skillPath = join(skillSub, "SKILL.md");

  mkdirSync(join(skillSub, "evals"), { recursive: true });
  writeFileSync(
    skillPath,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );
  writeJsonFile(join(skillSub, "evals", "evals.json"), {
    skill_name: FIXTURE_SKILL,
    evals,
  });

  const cwd = join(root, "work");
  const iterationDir = join(
    cwd,
    "skills-workspace",
    FIXTURE_SKILL,
    "iteration-1",
  );
  mkdirSync(iterationDir, { recursive: true });
  writeJsonFile(join(iterationDir, "conditions.json"), {
    mode: "new-skill",
    conditions: [
      { name: "with_skill", skill_path: skillPath },
      { name: "without_skill", skill_path: null },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });

  return { cwd, iterationDir, skillDir, skillPath };
}

/** Writes `record` as `run.json` under `<iterationDir>/eval-<evalId>/<cond>/`. */
function writeRunRecord(
  iterationDir: string,
  evalId: string,
  cond: string,
  record: Record<string, unknown>,
) {
  const condDir = join(iterationDir, `eval-${evalId}`, cond);
  mkdirSync(condDir, { recursive: true });
  writeJsonFile(join(condDir, "run.json"), record);
}

/**
 * A run record carrying every field these fixtures populate. `tool_invocations`
 * is left empty so the skill-invocation meta-check is routed to a judge task
 * instead of being code-checked.
 */
function completeRunRecord(evalId: string, cond: string, skillPath: string) {
  return {
    eval_id: evalId,
    condition: cond,
    skill_path: cond === "with_skill" ? skillPath : null,
    prompt: "p",
    files: [],
    final_message: "done",
    tool_invocations: [],
    total_tokens: 100,
    duration_ms: 1000,
  };
}

/** Runs grade.ts for iteration 1 of the fixture skill, capturing stdout/stderr. */
function runGrade(cwd: string, skillDir: string) {
  return Bun.spawnSync(
    [
      "bun",
      "run",
      GRADE_TS,
      "--skill-dir",
      skillDir,
      "--skill",
      FIXTURE_SKILL,
      "--iteration",
      "1",
    ],
    { cwd, stdout: "pipe", stderr: "pipe" },
  );
}

describe("emitJudgeTasks skill-invocation meta-check gating", () => {
  test("omits the skill-invocation meta-check for evals marked skill_should_trigger: false", () => {
    // Two evals: a positive one (skill should fire) and a negative one
    // (skill should NOT fire — non-invocation is the desired behavior).
    const negativeEval = {
      id: "neg-eval",
      prompt: "Add a --verbose flag.",
      expected_output: "Agent treats it as a feature, no debugging.",
      skill_should_trigger: false,
      assertions: [
        { id: "a2", type: "llm_judge", rubric: "Did it avoid debugging?" },
      ],
    };
    const fixture = scaffoldGradeFixture("negative-eval", [
      POSITIVE_EVAL,
      negativeEval,
    ]);

    for (const evalId of ["pos-eval", "neg-eval"]) {
      for (const cond of FIXTURE_CONDITIONS) {
        writeRunRecord(
          fixture.iterationDir,
          evalId,
          cond,
          completeRunRecord(evalId, cond, fixture.skillPath),
        );
      }
    }

    const res = runGrade(fixture.cwd, fixture.skillDir);
    expect(res.exitCode).toBe(0);

    const judgeTasks = JSON.parse(
      readFileSync(join(fixture.iterationDir, "judge-tasks.json"), "utf8"),
    ) as { tasks: Array<{ eval_id: string; is_meta: boolean }> };
    const metaEvalIds = judgeTasks.tasks
      .filter((task) => task.is_meta)
      .map((task) => task.eval_id);
    // Exactly one meta-check, and only for the positive eval.
    expect(metaEvalIds).toEqual(["pos-eval"]);
  });
});

describe("emitJudgeTasks run.json validation", () => {
  test("fails fast with a schema error when a run.json is malformed", () => {
    const fixture = scaffoldGradeFixture("bad-run-record", [POSITIVE_EVAL]);

    for (const cond of FIXTURE_CONDITIONS) {
      // Missing required `final_message` and `files` — must be rejected.
      writeRunRecord(fixture.iterationDir, "pos-eval", cond, {
        eval_id: "pos-eval",
        condition: cond,
        skill_path: null,
        prompt: "p",
        tool_invocations: [],
      });
    }

    const res = runGrade(fixture.cwd, fixture.skillDir);
    expect(res.exitCode).not.toBe(0);
    expect(res.stderr.toString()).toContain("run-record schema");
  });
});

describe("emitJudgeTasks file-pointer dispatch", () => {
  test("writes each judge prompt to a file and drops the inline prompt from judge-tasks.json", () => {
    const fixture = scaffoldGradeFixture("judge-prompt-file", [POSITIVE_EVAL]);

    for (const cond of FIXTURE_CONDITIONS) {
      writeRunRecord(
        fixture.iterationDir,
        "pos-eval",
        cond,
        completeRunRecord("pos-eval", cond, fixture.skillPath),
      );
    }

    const res = runGrade(fixture.cwd, fixture.skillDir);
    expect(res.exitCode).toBe(0);

    const judgeTasks = JSON.parse(
      readFileSync(join(fixture.iterationDir, "judge-tasks.json"), "utf8"),
    ) as {
      tasks: Array<{
        assertion_id: string;
        response_path: string;
        dispatch_prompt?: string;
        dispatch_prompt_path: string;
      }>;
    };

    expect(judgeTasks.tasks.length).toBeGreaterThan(0);
    for (const task of judgeTasks.tasks) {
      // Nothing inlined; the orchestrator reads the prompt from a file.
      expect(task.dispatch_prompt).toBeUndefined();
      expect(
        task.dispatch_prompt_path.endsWith(`${task.assertion_id}.txt`),
      ).toBe(true);
      expect(existsSync(task.dispatch_prompt_path)).toBe(true);
      const contents = readFileSync(task.dispatch_prompt_path, "utf8");
      // The judge still learns where to write its verdict from the prompt text.
      expect(contents).toContain(task.response_path);
    }
  });
});
|