@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import { detectRunContext } from "./context";
|
|
5
|
+
import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
|
|
6
|
+
import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
|
|
7
|
+
import { validateAgainstSchema } from "./validate-schema";
|
|
8
|
+
|
|
9
|
+
/**
 * Report a fatal error and stop the process.
 *
 * Writes `error: <msg>` to stderr, then exits with status 1. Never returns.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
|
|
13
|
+
|
|
14
|
+
/** A single flagged tool call from a run's transcript. */
export type StrayFinding = {
  /** Name of the tool that was invoked. */
  tool: string;
  /** Target path; populated for file-write findings. */
  path?: string;
  /** Shell command text; populated for Bash findings. */
  command?: string;
  /** The invocation's `ordinal`, copied through from the run record. */
  ordinal: number;
  /** Human-readable explanation of why the call was flagged. */
  reason: string;
};

/** Everything flagged for one run, split into two buckets. */
export type RunFindings = {
  /** File-write tool calls that targeted a path outside the outputs dir. */
  violations: Array<StrayFinding>;
  /** Bash commands flagged by the heuristic classifier. */
  warnings: Array<StrayFinding>;
};
|
|
26
|
+
|
|
27
|
+
/**
 * Sort a run's tool invocations into out-of-bounds findings, relative to the
 * one directory the run was allowed to write into.
 *
 * - `violations`: a file-write tool (Write/Edit/MultiEdit/NotebookEdit) whose
 *   target path does not resolve under `outputsDir`. High confidence — a run
 *   that edits the real repo is a tainted data point.
 * - `warnings`: a Bash command for which `classifyBash` returns a reason
 *   (a mutating pattern that doesn't reference `outputsDir`). Heuristic —
 *   review before trusting.
 *
 * A write-tool call with no path argument produces no finding, and neither
 * does any tool that is not a write tool or Bash.
 *
 * Relative paths resolve against `repoRoot` (the subagent's working dir).
 * Claude Code's write tools use absolute paths, so that resolution is a
 * best-effort fallback only.
 *
 * @param invocations Tool calls to inspect; only `name`, `args` and `ordinal` are read.
 * @param outputsDir Directory the run was allowed to write into.
 * @param repoRoot Base directory for resolving relative write paths.
 * @returns The violations and warnings, each in invocation order.
 */
export function detectStrayWrites(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  outputsDir: string,
  repoRoot: string,
): RunFindings {
  const findings: RunFindings = { violations: [], warnings: [] };

  for (const { name, args, ordinal } of invocations) {
    if (WRITE_TOOLS.has(name)) {
      // Write tools never fall through to the Bash check, flagged or not.
      const target = pathArg(args);
      if (!target || isUnder(target, outputsDir, repoRoot)) continue;
      findings.violations.push({
        tool: name,
        path: target,
        ordinal,
        reason: "writes outside the run's outputs dir",
      });
      continue;
    }

    if (name !== "Bash") continue;

    // A missing or non-string command is classified as the empty string.
    const rawCommand = (args as { command?: unknown } | undefined)?.command;
    const command = typeof rawCommand === "string" ? rawCommand : "";
    const reason = classifyBash(command, [outputsDir]);
    if (reason) {
      findings.warnings.push({ tool: "Bash", command, ordinal, reason });
    }
  }

  return findings;
}
|
|
73
|
+
|
|
74
|
+
// CLI entry point. For one iteration of one skill, inspect every run's
// recorded tool invocations, write the findings to
// `<iterationDir>/stray-writes.json`, and print a human-readable summary.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);
  // Value following `--<name>` on the command line, or undefined when absent.
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    return i === -1 ? undefined : argv[i + 1];
  };
  const iteration = flag("iteration");
  if (!iteration) die("missing --iteration");
  const ctx = detectRunContext(argv);

  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);

  const conditionsPath = join(iterationDir, "conditions.json");
  if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
  const conditions: ConditionsRecord = JSON.parse(
    readFileSync(conditionsPath, "utf8"),
  );
  const conditionNames = conditions.conditions.map((c) => c.name);

  // dispatch.json carries the authoritative outputs_dir per task; fall back to
  // the conventional <condDir>/outputs when it's absent (hand-authored runs).
  const dispatchPath = join(iterationDir, "dispatch.json");
  const outputsByKey = new Map<string, string>();
  if (existsSync(dispatchPath)) {
    try {
      const dispatch = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
        tasks?: Array<{
          eval_id: string;
          condition: string;
          outputs_dir?: string;
        }>;
      };
      for (const t of dispatch.tasks ?? []) {
        // Only a non-empty string is usable as a directory path.
        if (typeof t.outputs_dir === "string" && t.outputs_dir)
          outputsByKey.set(`${t.eval_id}:${t.condition}`, t.outputs_dir);
      }
    } catch (err: unknown) {
      // Still best-effort (fall through to convention), but say so: with the
      // wrong outputs dir, legitimate writes would be reported as violations.
      const detail = err instanceof Error ? err.message : String(err);
      console.warn(
        `warning: could not read ${dispatchPath} (${detail}); using the conventional <condition>/outputs dir for runs it did not provide`,
      );
    }
  }

  const repoRoot = process.cwd();
  const evalDirs = readdirSync(iterationDir).filter((d) =>
    d.startsWith("eval-"),
  );

  type RunReport = {
    eval_id: string;
    condition: string;
    violations: StrayFinding[];
    warnings: StrayFinding[];
  };
  // Only runs with at least one finding are listed; totals count every run.
  const runs: RunReport[] = [];
  let totalViolations = 0;
  let totalWarnings = 0;

  for (const evalDir of evalDirs) {
    const evalId = evalDir.replace(/^eval-/, "");
    for (const cond of conditionNames) {
      const condDir = join(iterationDir, evalDir, cond);
      const runPath = join(condDir, "run.json");
      // A condition with no run.json was not executed for this eval.
      if (!existsSync(runPath)) continue;
      const run = validateAgainstSchema<RunRecord>(
        "run-record",
        JSON.parse(readFileSync(runPath, "utf8")),
        runPath,
      );
      const invocations = Array.isArray(run.tool_invocations)
        ? run.tool_invocations
        : [];
      const outputsDir =
        outputsByKey.get(`${evalId}:${cond}`) ?? join(condDir, "outputs");
      const findings = detectStrayWrites(invocations, outputsDir, repoRoot);
      if (findings.violations.length || findings.warnings.length) {
        runs.push({
          eval_id: evalId,
          condition: cond,
          violations: findings.violations,
          warnings: findings.warnings,
        });
      }
      totalViolations += findings.violations.length;
      totalWarnings += findings.warnings.length;
    }
  }

  const report = {
    generated: new Date().toISOString(),
    iteration: Number(iteration),
    totals: { violations: totalViolations, warnings: totalWarnings },
    runs,
  };
  const outPath = join(iterationDir, "stray-writes.json");
  // Validate before writing so a malformed report never lands on disk.
  validateAgainstSchema("stray-writes", report, outPath);
  writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);
  console.log(`Wrote ${outPath}`);

  for (const r of runs) {
    for (const v of r.violations)
      console.warn(
        `✗ ${r.eval_id}/${r.condition}: ${v.tool} wrote outside outputs dir → ${v.path} (ordinal ${v.ordinal})`,
      );
    for (const w of r.warnings)
      console.warn(
        `⚠ ${r.eval_id}/${r.condition}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
      );
  }
  if (totalViolations === 0 && totalWarnings === 0)
    console.log("✓ No out-of-bounds writes detected.");
  else
    console.warn(
      `\n${totalViolations} violation(s), ${totalWarnings} warning(s). Runs with violations edited files outside their sandbox — treat those data points as tainted.`,
    );
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
+
import { mkdirSync, rmSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import { resolveAgentDescription } from "./fill-transcripts";
|
|
6
|
+
|
|
7
|
+
// Scratch directory for this test file, keyed by pid and placed under the OS
// temp dir. Created once before the suite and removed once after it.
const ROOT = join(tmpdir(), `fill-transcripts-test-${process.pid}`);

beforeAll(() => {
  mkdirSync(ROOT, { recursive: true });
});

afterAll(() => {
  rmSync(ROOT, { recursive: true, force: true });
});
|
|
11
|
+
|
|
12
|
+
/**
 * Test fixture helper: create `iterationDir` (including parents) and write a
 * `dispatch.json` into it containing a fixed `run_nonce` and the given tasks.
 */
function writeDispatch(iterationDir: string, tasks: unknown[]) {
  mkdirSync(iterationDir, { recursive: true });
  const target = join(iterationDir, "dispatch.json");
  const payload = { run_nonce: "abc123", tasks };
  writeFileSync(target, JSON.stringify(payload, null, 2));
}
|
|
19
|
+
|
|
20
|
+
describe("resolveAgentDescription", () => {
  // Shorthand for the lookup most cases perform: eval "crash", condition "with_skill".
  const lookupCrashWithSkill = (dir: string) =>
    resolveAgentDescription(dir, "crash", "with_skill");

  test("returns the namespaced agent_description from dispatch.json", () => {
    const dir = join(ROOT, "iter-canonical");
    // condition -> description the runner would have recorded for it
    const recorded = {
      with_skill: "crash:with_skill:i3-abc123",
      without_skill: "crash:without_skill:i3-abc123",
    };
    const tasks = Object.entries(recorded).map(
      ([condition, agent_description]) => ({
        eval_id: "crash",
        condition,
        agent_description,
      }),
    );
    writeDispatch(dir, tasks);

    for (const [condition, agent_description] of Object.entries(recorded)) {
      expect(resolveAgentDescription(dir, "crash", condition)).toBe(
        agent_description,
      );
    }
  });

  test("falls back to legacy reconstruction when dispatch.json is absent", () => {
    const dir = join(ROOT, "iter-no-dispatch");
    mkdirSync(dir, { recursive: true });

    const resolved = lookupCrashWithSkill(dir);

    expect(resolved).toBe("crash:with_skill");
  });

  test("falls back when the task is missing from dispatch.json", () => {
    const dir = join(ROOT, "iter-partial");
    // dispatch.json exists but only describes a different eval.
    const unrelatedTask = {
      eval_id: "other",
      condition: "with_skill",
      agent_description: "other:with_skill:i1-x",
    };
    writeDispatch(dir, [unrelatedTask]);

    const resolved = lookupCrashWithSkill(dir);

    expect(resolved).toBe("crash:with_skill");
  });

  test("falls back when dispatch.json is malformed", () => {
    const dir = join(ROOT, "iter-malformed");
    mkdirSync(dir, { recursive: true });
    const dispatchFile = join(dir, "dispatch.json");
    writeFileSync(dispatchFile, "{ not valid json");

    const resolved = lookupCrashWithSkill(dir);

    expect(resolved).toBe("crash:with_skill");
  });
});
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import * as claudeAdapter from "./adapters/claude-code-transcript";
|
|
5
|
+
import { detectRunContext } from "./context";
|
|
6
|
+
import type { ConditionsRecord, RunRecord } from "./types";
|
|
7
|
+
import { validateAgainstSchema } from "./validate-schema";
|
|
8
|
+
|
|
9
|
+
/**
 * Report a fatal error and stop the process.
 *
 * Writes `error: <msg>` to stderr, then exits with status 1. Never returns.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
|
|
13
|
+
|
|
14
|
+
/** The fields of a `dispatch.json` task entry that this module reads. */
type DispatchTaskRef = {
  eval_id: string;
  condition: string;
  agent_description?: string;
};

/**
 * The canonical dispatch description for an (eval, condition) run.
 *
 * The runner writes a unique `agent_description` per task into `dispatch.json`
 * (namespaced with the iteration + run nonce). Reading it back — rather than
 * reconstructing `<eval_id>:<condition>` — is what binds each run to the exact
 * agent that produced it, even when one parent session's shared subagents dir
 * holds colliding descriptions from other iterations.
 *
 * Falls back to the legacy `<eval_id>:<condition>` reconstruction when
 * dispatch.json is absent (hand-authored/operator runs), unreadable, not valid
 * JSON, has no `tasks` array, has no entry for this run, or when the first
 * matching entry lacks a non-empty string `agent_description`.
 *
 * @param iterationDir Directory expected to contain `dispatch.json`.
 * @param evalId Eval identifier to match against each task's `eval_id`.
 * @param condition Condition name to match against each task's `condition`.
 * @returns The recorded description, or `${evalId}:${condition}` as fallback.
 */
export function resolveAgentDescription(
  iterationDir: string,
  evalId: string,
  condition: string,
): string {
  const legacy = `${evalId}:${condition}`;
  const dispatchPath = join(iterationDir, "dispatch.json");
  if (!existsSync(dispatchPath)) return legacy;

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(dispatchPath, "utf8"));
  } catch {
    // Deliberately best-effort: an unreadable or malformed dispatch.json is
    // treated like an absent one.
    return legacy;
  }

  const tasks: unknown = (parsed as { tasks?: unknown } | null)?.tasks;
  if (!Array.isArray(tasks)) return legacy;

  for (const entry of tasks as unknown[]) {
    // Skip junk entries instead of letting one abort the whole lookup.
    if (typeof entry !== "object" || entry === null) continue;
    const task = entry as Partial<Record<keyof DispatchTaskRef, unknown>>;
    if (task.eval_id !== evalId || task.condition !== condition) continue;

    // First match wins. Only a non-empty string is a usable description;
    // anything else would break the declared `string` return type.
    const description = task.agent_description;
    return typeof description === "string" && description !== ""
      ? description
      : legacy;
  }
  return legacy;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Pull this script's options out of the raw CLI arguments.
 *
 * `--iteration <n>` and `--subagents-dir <path>` are required; the process
 * exits via `die` when either is missing. `--overwrite` is a boolean switch.
 */
function parseArgs(argv: string[]) {
  // The token that follows `--<name>`, or undefined when the flag is absent.
  const valueAfter = (name: string): string | undefined => {
    const at = argv.indexOf(`--${name}`);
    return at === -1 ? undefined : argv[at + 1];
  };

  const iteration = valueAfter("iteration");
  const subagentsDir = valueAfter("subagents-dir");
  const overwrite = argv.includes("--overwrite");

  if (!iteration) die("missing --iteration");
  if (!subagentsDir) {
    die(
      "missing --subagents-dir (e.g. ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  }

  return { iteration, subagentsDir, overwrite };
}
|
|
69
|
+
|
|
70
|
+
// CLI entry point. For each run in one iteration of one skill, find the
// matching subagent transcript and copy its parsed tool invocations into the
// run's run.json.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const { iteration, subagentsDir, overwrite } = parseArgs(cliArgs);
  const ctx = detectRunContext(cliArgs);
  const skill = ctx.skillName;

  if (!existsSync(subagentsDir)) {
    die(`subagents-dir not found: ${subagentsDir}`);
  }

  // Only one harness adapter is wired up here.
  const adapter = claudeAdapter;
  console.log("Using harness transcript adapter: claude-code");

  const iterationDir = join(ctx.workspaceRoot, skill, `iteration-${iteration}`);
  if (!existsSync(iterationDir)) {
    die(`not found: ${iterationDir}`);
  }

  const conditionsPath = join(iterationDir, "conditions.json");
  if (!existsSync(conditionsPath)) {
    die(`missing: ${conditionsPath}`);
  }
  const conditions: ConditionsRecord = JSON.parse(
    readFileSync(conditionsPath, "utf8"),
  );
  const conditionNames = conditions.conditions.map((entry) => entry.name);

  const evalDirs = readdirSync(iterationDir).filter((entry) =>
    entry.startsWith("eval-"),
  );

  // Outcome counters for the closing summary line.
  const tally = { filled: 0, skipped: 0, missing: 0 };

  for (const evalDir of evalDirs) {
    const evalId = evalDir.replace(/^eval-/, "");

    for (const cond of conditionNames) {
      const runPath = join(iterationDir, evalDir, cond, "run.json");
      // A condition with no run.json was not executed for this eval.
      if (!existsSync(runPath)) continue;

      const run = validateAgainstSchema<RunRecord>(
        "run-record",
        JSON.parse(readFileSync(runPath, "utf8")),
        runPath,
      );

      // Leave already-populated runs alone unless --overwrite was given.
      const existing = Array.isArray(run.tool_invocations)
        ? run.tool_invocations
        : [];
      if (!overwrite && existing.length > 0) {
        console.log(
          `skip ${evalId}/${cond}: already has ${existing.length} tool_invocations (use --overwrite to replace)`,
        );
        tally.skipped += 1;
        continue;
      }

      const description = resolveAgentDescription(iterationDir, evalId, cond);
      const subagent = adapter.findByDescription(subagentsDir, description);
      if (!subagent) {
        console.warn(
          `miss ${evalId}/${cond}: no subagent transcript with description='${description}'`,
        );
        tally.missing += 1;
        continue;
      }

      const invocations = adapter.parseTranscript(subagent.jsonlPath);
      run.tool_invocations = invocations;
      writeFileSync(runPath, `${JSON.stringify(run, null, 2)}\n`);
      console.log(
        `fill ${evalId}/${cond}: wrote ${invocations.length} tool_invocations from ${subagent.jsonlPath}`,
      );
      tally.filled += 1;
    }
  }

  console.log(
    `\nFilled: ${tally.filled}, skipped (already populated): ${tally.skipped}, missing transcript: ${tally.missing}`,
  );
  if (tally.missing > 0) {
    console.warn(
      "Missing transcripts mean the dispatching agent's dispatch `description` did not match the task's `agent_description` in dispatch.json (or dispatch.json is absent and the legacy `eval-id:condition` reconstruction found no match). transcript_check assertions for those runs will be graded unverifiable.",
    );
  }
}
|