@slowdini/slow-powers-opencode 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +37 -65
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -13
  5. package/skills/evaluating-skills/SKILL.md +91 -337
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/verifying-development-work/SKILL.md +17 -6
  17. package/skills/verifying-development-work/code-review.md +68 -0
  18. package/skills/verifying-development-work/comment-review.md +85 -0
  19. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  20. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  21. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  22. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  23. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  26. package/skills/verifying-development-work/evals/evals.json +34 -2
  27. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  28. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  29. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  30. package/skills/evaluating-skills/harness-details/claude.md +0 -158
  31. package/skills/evaluating-skills/runner/README.md +0 -154
  32. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  33. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  34. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
  35. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
  36. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
  37. package/skills/evaluating-skills/runner/aggregate.ts +0 -248
  38. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  39. package/skills/evaluating-skills/runner/context.ts +0 -90
  40. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
  41. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
  42. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  43. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  44. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  45. package/skills/evaluating-skills/runner/grade.ts +0 -603
  46. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  47. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  48. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  49. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
  50. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  51. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  52. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  53. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  54. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
  55. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
  56. package/skills/evaluating-skills/runner/run.test.ts +0 -1180
  57. package/skills/evaluating-skills/runner/run.ts +0 -1029
  58. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
  59. package/skills/evaluating-skills/runner/types.ts +0 -112
  60. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  61. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  62. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  63. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  64. package/skills/evaluating-skills/runner/validate.ts +0 -21
  65. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  66. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  67. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  68. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
  69. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
  70. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  71. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  72. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
  73. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  74. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  75. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  76. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  77. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  78. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  79. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  80. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  81. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  82. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -1,603 +0,0 @@
1
- #!/usr/bin/env bun
2
- import {
3
- existsSync,
4
- mkdirSync,
5
- readdirSync,
6
- readFileSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { join } from "node:path";
10
- import { detectRunContext } from "./context";
11
- import {
12
- type AssertionResult,
13
- type AssertionTranscriptCheck,
14
- type ConditionsRecord,
15
- type EvalsConfig,
16
- type GradingResult,
17
- type RunRecord,
18
- SKILL_INVOKED_META_ID,
19
- type ToolInvocation,
20
- } from "./types";
21
- import { validateEvalsConfig } from "./validate";
22
- import { validateAgainstSchema } from "./validate-schema";
23
-
24
- type Mode = "emit-judge-tasks" | "finalize";
25
-
26
- function die(msg: string): never {
27
- console.error(`error: ${msg}`);
28
- process.exit(1);
29
- }
30
-
31
- function parseArgs(argv: string[]) {
32
- const flag = (name: string): string | undefined => {
33
- const i = argv.indexOf(`--${name}`);
34
- if (i === -1) return undefined;
35
- return argv[i + 1];
36
- };
37
- const has = (name: string) => argv.includes(`--${name}`);
38
- const iteration = flag("iteration");
39
- if (!iteration) die("missing --iteration");
40
-
41
- const mode: Mode = has("finalize") ? "finalize" : "emit-judge-tasks";
42
- return { iteration, mode };
43
- }
44
-
45
- function readJson<T>(path: string): T {
46
- return JSON.parse(readFileSync(path, "utf8"));
47
- }
48
-
49
- function writeJson(path: string, value: unknown) {
50
- writeFileSync(path, `${JSON.stringify(value, null, 2)}\n`);
51
- }
52
-
53
- function ensureDir(path: string): void {
54
- if (!existsSync(path)) mkdirSync(path, { recursive: true });
55
- }
56
-
57
- let skill = "";
58
- let iteration = "";
59
- let iterationDir = "";
60
- let conditions: ConditionsRecord = {
61
- mode: "new-skill",
62
- conditions: [],
63
- timestamp: "",
64
- };
65
- let conditionNames: string[] = [];
66
- let evalsConfig: EvalsConfig = { skill_name: "", evals: [] };
67
-
68
- if (import.meta.main) {
69
- const argv = Bun.argv.slice(2);
70
- const parsed = parseArgs(argv);
71
- let ctx: ReturnType<typeof detectRunContext>;
72
- try {
73
- ctx = detectRunContext(argv);
74
- } catch (err) {
75
- die(err instanceof Error ? err.message : String(err));
76
- }
77
- skill = ctx.skillName;
78
- iteration = parsed.iteration;
79
-
80
- iterationDir = join(ctx.workspaceRoot, skill, `iteration-${iteration}`);
81
- if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);
82
-
83
- const conditionsPath = join(iterationDir, "conditions.json");
84
- if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
85
- conditions = readJson(conditionsPath);
86
- conditionNames = conditions.conditions.map((c) => c.name);
87
-
88
- const evalsPath = join(ctx.skillSubdir, "evals", "evals.json");
89
- evalsConfig = validateEvalsConfig(readJson(evalsPath), evalsPath);
90
-
91
- if (parsed.mode === "emit-judge-tasks") {
92
- emitJudgeTasks();
93
- } else {
94
- finalize();
95
- }
96
- }
97
-
98
- type JudgeTask = {
99
- eval_id: string;
100
- condition: string;
101
- assertion_id: string;
102
- rubric: string;
103
- model: string | null;
104
- is_meta: boolean;
105
- run_record_path: string;
106
- outputs_dir: string;
107
- response_path: string;
108
- /**
109
- * Absolute path to the file holding the full judge prompt. The orchestrator
110
- * dispatches each judge with a short "read this file and follow it" prompt
111
- * rather than inlining the prompt (rubric + full run record + principles).
112
- * `dispatch_prompt` carries the same text in-memory but is stripped from the
113
- * serialized judge-tasks.json.
114
- */
115
- dispatch_prompt_path: string;
116
- dispatch_prompt: string;
117
- };
118
-
119
- export function checkSkillInvokedFromTranscript(
120
- invocations: ToolInvocation[],
121
- stagedSlug: string | null,
122
- ): boolean {
123
- for (const inv of invocations) {
124
- // Claude Code Skill tool check
125
- if (stagedSlug && inv.name === "Skill") {
126
- if (!inv.args || typeof inv.args !== "object") continue;
127
- const argSkill = (inv.args as { skill?: unknown }).skill;
128
- if (typeof argSkill === "string" && argSkill === stagedSlug) return true;
129
- }
130
- }
131
- return false;
132
- }
133
-
134
- function skillInvokedRubric(skillName: string): string {
135
- return [
136
- `The agent was given the **${skillName}** skill loaded directly into its operating guidelines (the full SKILL.md text appears inside the run record's \`prompt\` field). This meta-check asks whether there is evidence the agent actually applied the skill in this run — separate from whether the response was correct.`,
137
- "",
138
- "Evidence the skill WAS applied:",
139
- '- The agent cites the skill by name or references specific named sections (e.g. "Iron Law", "Red Flags", "Gate Function", or any other distinctive heading from the skill).',
140
- "- The agent's response uses distinctive vocabulary or phrasing taken from the skill content.",
141
- "- The agent's behavior follows a specific procedural step prescribed by the skill in a way that mirrors the skill's phrasing — not just generic best practice.",
142
- "- The agent explicitly acknowledges following the skill's guidance.",
143
- "",
144
- "Evidence the skill was NOT applied:",
145
- "- The response uses only generic best-practice language unrelated to the skill's specific framing.",
146
- "- No vocabulary, structure, or rules from the skill content appear anywhere in the response.",
147
- "- The response would read identically with or without the skill loaded.",
148
- "",
149
- "Compare the agent's `final_message` against the skill content embedded in `prompt`. Look for stylistic and procedural fingerprints.",
150
- "",
151
- "PASS if there is observable evidence the skill influenced the response.",
152
- "FAIL if there is no observable evidence — the response is indistinguishable from baseline behavior.",
153
- ].join("\n");
154
- }
155
-
156
- function emitJudgeTasks(): void {
157
- const tasks: JudgeTask[] = [];
158
- let skipped = 0;
159
- let unverifiableCount = 0;
160
- let metaInjected = 0;
161
- let metaCodeChecked = 0;
162
-
163
- const conditionSkillPaths = new Map<string, string | null>();
164
- const conditionStagedSlugs = new Map<string, string | null>();
165
- for (const c of conditions.conditions) {
166
- conditionSkillPaths.set(c.name, c.skill_path);
167
- conditionStagedSlugs.set(c.name, c.staged_skill_slug ?? null);
168
- }
169
-
170
- for (const ev of evalsConfig.evals) {
171
- const hasAssertions = ev.assertions && ev.assertions.length > 0;
172
-
173
- for (const cond of conditionNames) {
174
- const condDir = join(iterationDir, `eval-${ev.id}`, cond);
175
- const runRecordPath = join(condDir, "run.json");
176
- const outputsDir = join(condDir, "outputs");
177
- const judgeResponsesDir = join(condDir, "judge-responses");
178
- const judgePromptsDir = join(condDir, "judge-prompts");
179
-
180
- if (!existsSync(runRecordPath)) {
181
- console.warn(`warn: missing run.json for ${ev.id}/${cond} — skipping`);
182
- if (hasAssertions && ev.assertions) skipped += ev.assertions.length;
183
- continue;
184
- }
185
-
186
- ensureDir(judgeResponsesDir);
187
- ensureDir(judgePromptsDir);
188
- const runRecord = validateAgainstSchema<RunRecord>(
189
- "run-record",
190
- readJson(runRecordPath),
191
- runRecordPath,
192
- );
193
-
194
- if (hasAssertions && ev.assertions) {
195
- for (const assertion of ev.assertions) {
196
- if (assertion.type === "transcript_check") {
197
- skipped++;
198
- unverifiableCount++;
199
- continue;
200
- }
201
- const responsePath = join(judgeResponsesDir, `${assertion.id}.json`);
202
- const dispatchPrompt = buildJudgePrompt({
203
- rubric: assertion.rubric,
204
- runRecord,
205
- outputsDir,
206
- responsePath,
207
- });
208
- const promptPath = join(judgePromptsDir, `${assertion.id}.txt`);
209
- writeFileSync(promptPath, dispatchPrompt);
210
- tasks.push({
211
- eval_id: ev.id,
212
- condition: cond,
213
- assertion_id: assertion.id,
214
- rubric: assertion.rubric,
215
- model: assertion.model ?? null,
216
- is_meta: false,
217
- run_record_path: runRecordPath,
218
- outputs_dir: outputsDir,
219
- response_path: responsePath,
220
- dispatch_prompt_path: promptPath,
221
- dispatch_prompt: dispatchPrompt,
222
- });
223
- }
224
- }
225
-
226
- const condSkillPath = conditionSkillPaths.get(cond);
227
- // Negative evals (skill_should_trigger: false) expect the skill NOT to
228
- // fire, so a non-invocation is correct — skip the meta-check entirely so
229
- // it never counts against the skill-invocation rate.
230
- if (condSkillPath && ev.skill_should_trigger !== false) {
231
- const responsePath = join(
232
- judgeResponsesDir,
233
- `${SKILL_INVOKED_META_ID}.json`,
234
- );
235
- const stagedSlug = conditionStagedSlugs.get(cond) ?? null;
236
- const transcriptFilled = runRecord.tool_invocations.length > 0;
237
-
238
- if (stagedSlug && transcriptFilled) {
239
- const invoked = checkSkillInvokedFromTranscript(
240
- runRecord.tool_invocations,
241
- stagedSlug,
242
- );
243
- const evidence = invoked
244
- ? `Skill invocation verified from transcript.`
245
- : `No skill invocation found in transcript across ${runRecord.tool_invocations.length} transcript invocation(s).`;
246
- writeJson(responsePath, {
247
- passed: invoked,
248
- evidence,
249
- confidence: 1.0,
250
- grader: "transcript_check",
251
- });
252
- metaCodeChecked++;
253
- } else {
254
- const rubric = skillInvokedRubric(evalsConfig.skill_name);
255
- const dispatchPrompt = buildJudgePrompt({
256
- rubric,
257
- runRecord,
258
- outputsDir,
259
- responsePath,
260
- });
261
- const promptPath = join(
262
- judgePromptsDir,
263
- `${SKILL_INVOKED_META_ID}.txt`,
264
- );
265
- writeFileSync(promptPath, dispatchPrompt);
266
- tasks.push({
267
- eval_id: ev.id,
268
- condition: cond,
269
- assertion_id: SKILL_INVOKED_META_ID,
270
- rubric,
271
- model: null,
272
- is_meta: true,
273
- run_record_path: runRecordPath,
274
- outputs_dir: outputsDir,
275
- response_path: responsePath,
276
- dispatch_prompt_path: promptPath,
277
- dispatch_prompt: dispatchPrompt,
278
- });
279
- metaInjected++;
280
- }
281
- }
282
- }
283
- }
284
-
285
- const tasksPath = join(iterationDir, "judge-tasks.json");
286
- writeJson(tasksPath, {
287
- generated: new Date().toISOString(),
288
- total_tasks: tasks.length,
289
- meta_tasks_injected: metaInjected,
290
- skipped_transcript_checks: unverifiableCount,
291
- tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
292
- });
293
-
294
- console.log(`Wrote ${tasksPath}`);
295
- console.log(
296
- `Judge tasks: ${tasks.length} (${metaInjected} skill-invocation meta-judge${metaInjected === 1 ? "" : "s"})`,
297
- );
298
- if (metaCodeChecked > 0)
299
- console.log(
300
- `Skill-invocation code-checked: ${metaCodeChecked} (transcript-based, no judge needed)`,
301
- );
302
- if (unverifiableCount > 0)
303
- console.log(
304
- `transcript_check assertions: ${unverifiableCount} (not dispatched; graded directly in finalize from run.json's tool_invocations).`,
305
- );
306
- if (skipped > unverifiableCount)
307
- console.log(
308
- `Skipped due to missing run records: ${skipped - unverifiableCount}`,
309
- );
310
- console.log(
311
- "\nNext: dispatch each task as a judge subagent (use templates/judge-prompt.md guidance).",
312
- );
313
- console.log(
314
- " Write each judge's JSON response to the task's `response_path`.",
315
- );
316
- console.log(
317
- ` Then run: bun run evals:grade -- --skill ${skill} --iteration ${iteration} --finalize`,
318
- );
319
- }
320
-
321
- function buildJudgePrompt(opts: {
322
- rubric: string;
323
- runRecord: RunRecord;
324
- outputsDir: string;
325
- responsePath: string;
326
- }): string {
327
- const outputsListing = existsSync(opts.outputsDir)
328
- ? listOutputs(opts.outputsDir)
329
- : "(none)";
330
-
331
- return [
332
- "You are grading one assertion for a skill evaluation run. Be strict but fair.",
333
- "",
334
- "# Run record",
335
- "",
336
- "```json",
337
- JSON.stringify(opts.runRecord, null, 2),
338
- "```",
339
- "",
340
- "# Outputs directory contents",
341
- "",
342
- "```",
343
- outputsListing,
344
- "```",
345
- "",
346
- "# Assertion to grade",
347
- "",
348
- opts.rubric,
349
- "",
350
- "# Grading principles",
351
- "",
352
- "- PASS requires concrete evidence (a direct quote or specific reference from the run record's `final_message` or outputs). Don't infer behavior not present in the record.",
353
- "- A correct response expressed in different words from what the assertion implies is still a PASS if the substance matches.",
354
- "- If the assertion is unverifiable from the available material (e.g. requires the tool-invocation list and the run record has none), return `passed: false`, `evidence: 'assertion is unverifiable from available material'`, `confidence: 1.0`.",
355
- "",
356
- "# Task",
357
- "",
358
- `Write your verdict as a JSON file to: ${opts.responsePath}`,
359
- "",
360
- "The JSON must match this schema (exactly these keys, no extra prose in the file):",
361
- "",
362
- "```json",
363
- '{ "passed": true|false, "evidence": "direct quote or reference", "confidence": 0.0-1.0 }',
364
- "```",
365
- "",
366
- "After writing the file, your final user-facing reply should be one sentence summarising the verdict.",
367
- ].join("\n");
368
- }
369
-
370
- function describeInvocation(inv: ToolInvocation): string {
371
- const args = inv.args === undefined ? "" : ` ${JSON.stringify(inv.args)}`;
372
- return `${inv.name}${args}`;
373
- }
374
-
375
- function gradeTranscriptCheck(
376
- assertion: AssertionTranscriptCheck,
377
- invocations: ToolInvocation[],
378
- ): AssertionResult {
379
- if (invocations.length === 0) {
380
- return {
381
- id: assertion.id,
382
- passed: false,
383
- evidence:
384
- "tool_invocations is empty — run record was not filled by a transcript adapter. Run `bun run evals:fill-transcripts` for Claude Code, or rely on `llm_judge` assertions for harnesses without an adapter.",
385
- confidence: 1.0,
386
- grader: "transcript_check",
387
- };
388
- }
389
-
390
- if (assertion.check !== "tool_invocation_matches") {
391
- return {
392
- id: assertion.id,
393
- passed: false,
394
- evidence: `unsupported transcript_check kind: '${assertion.check}'`,
395
- confidence: 1.0,
396
- grader: "transcript_check",
397
- };
398
- }
399
-
400
- const pattern = assertion.pattern;
401
- if (!pattern) {
402
- return {
403
- id: assertion.id,
404
- passed: false,
405
- evidence:
406
- "transcript_check 'tool_invocation_matches' requires a `pattern` field",
407
- confidence: 1.0,
408
- grader: "transcript_check",
409
- };
410
- }
411
-
412
- let re: RegExp;
413
- try {
414
- re = new RegExp(pattern);
415
- } catch (err) {
416
- return {
417
- id: assertion.id,
418
- passed: false,
419
- evidence: `invalid regex in pattern '${pattern}': ${(err as Error).message}`,
420
- confidence: 1.0,
421
- grader: "transcript_check",
422
- };
423
- }
424
-
425
- for (const inv of invocations) {
426
- const target = describeInvocation(inv);
427
- if (re.test(target))
428
- return {
429
- id: assertion.id,
430
- passed: true,
431
- evidence: `matched ordinal ${inv.ordinal}: ${target.slice(0, 200)}`,
432
- confidence: 1.0,
433
- grader: "transcript_check",
434
- };
435
- }
436
-
437
- return {
438
- id: assertion.id,
439
- passed: false,
440
- evidence: `no tool invocation matched /${pattern}/ across ${invocations.length} invocation(s)`,
441
- confidence: 1.0,
442
- grader: "transcript_check",
443
- };
444
- }
445
-
446
- function listOutputs(dir: string): string {
447
- const entries = readdirSync(dir, { withFileTypes: true })
448
- .filter((e) => !e.name.startsWith(".") && e.name !== "node_modules")
449
- .map((e) => (e.isDirectory() ? `${e.name}/` : e.name));
450
- return entries.sort().join("\n") || "(empty)";
451
- }
452
-
453
- function finalize(): void {
454
- type JudgeResponse = {
455
- passed: boolean;
456
- evidence: string;
457
- confidence?: number;
458
- grader?: "transcript_check" | "llm_judge";
459
- };
460
-
461
- const conditionSkillPaths = new Map<string, string | null>();
462
- for (const c of conditions.conditions)
463
- conditionSkillPaths.set(c.name, c.skill_path);
464
-
465
- let totalGraded = 0;
466
- let totalUnverifiable = 0;
467
- let totalMetaGraded = 0;
468
- let metaFailures = 0;
469
-
470
- for (const ev of evalsConfig.evals) {
471
- const hasAssertions = ev.assertions && ev.assertions.length > 0;
472
-
473
- for (const cond of conditionNames) {
474
- const condDir = join(iterationDir, `eval-${ev.id}`, cond);
475
- if (!existsSync(condDir)) continue;
476
- const judgeResponsesDir = join(condDir, "judge-responses");
477
- const gradingPath = join(condDir, "grading.json");
478
-
479
- const assertionResults: AssertionResult[] = [];
480
- const runRecordPath = join(condDir, "run.json");
481
- const runRecord: RunRecord | null = existsSync(runRecordPath)
482
- ? validateAgainstSchema<RunRecord>(
483
- "run-record",
484
- readJson(runRecordPath),
485
- runRecordPath,
486
- )
487
- : null;
488
- if (hasAssertions && ev.assertions) {
489
- for (const assertion of ev.assertions) {
490
- if (assertion.type === "transcript_check") {
491
- const invocations = runRecord?.tool_invocations ?? [];
492
- const result = gradeTranscriptCheck(assertion, invocations);
493
- assertionResults.push(result);
494
- if (invocations.length === 0) totalUnverifiable++;
495
- else totalGraded++;
496
- continue;
497
- }
498
- const responsePath = join(judgeResponsesDir, `${assertion.id}.json`);
499
- if (!existsSync(responsePath)) {
500
- console.warn(
501
- `warn: missing judge response: ${responsePath} (assertion will be FAIL)`,
502
- );
503
- assertionResults.push({
504
- id: assertion.id,
505
- passed: false,
506
- evidence: `judge response missing at ${responsePath}`,
507
- confidence: 0,
508
- grader: "llm_judge",
509
- });
510
- continue;
511
- }
512
- const response: JudgeResponse = readJson(responsePath);
513
- assertionResults.push({
514
- id: assertion.id,
515
- passed: !!response.passed,
516
- evidence: response.evidence ?? "",
517
- confidence: response.confidence ?? 0,
518
- grader: "llm_judge",
519
- });
520
- totalGraded++;
521
- }
522
- }
523
-
524
- const metaResults: AssertionResult[] = [];
525
- const condSkillPath = conditionSkillPaths.get(cond);
526
- // Mirror the emit gate: negative evals carry no skill-invocation
527
- // meta-check, so they never enter meta_summary or the invocation rate.
528
- if (condSkillPath && ev.skill_should_trigger !== false) {
529
- const responsePath = join(
530
- judgeResponsesDir,
531
- `${SKILL_INVOKED_META_ID}.json`,
532
- );
533
- if (existsSync(responsePath)) {
534
- const response: JudgeResponse = readJson(responsePath);
535
- const passed = !!response.passed;
536
- metaResults.push({
537
- id: SKILL_INVOKED_META_ID,
538
- passed,
539
- evidence: response.evidence ?? "",
540
- confidence: response.confidence ?? 0,
541
- grader: response.grader ?? "llm_judge",
542
- });
543
- totalMetaGraded++;
544
- if (!passed) metaFailures++;
545
- } else {
546
- console.warn(
547
- `warn: missing skill-invocation meta response: ${responsePath}`,
548
- );
549
- metaResults.push({
550
- id: SKILL_INVOKED_META_ID,
551
- passed: false,
552
- evidence: `meta judge response missing at ${responsePath}`,
553
- confidence: 0,
554
- grader: "llm_judge",
555
- });
556
- }
557
- }
558
-
559
- const passed = assertionResults.filter((r) => r.passed).length;
560
- const total = assertionResults.length;
561
- const metaPassed = metaResults.filter((r) => r.passed).length;
562
- const skillInvoked =
563
- metaResults.length === 0 ? null : metaResults.every((r) => r.passed);
564
-
565
- const grading: GradingResult = {
566
- assertion_results: assertionResults,
567
- summary: {
568
- passed,
569
- failed: total - passed,
570
- total,
571
- pass_rate: total === 0 ? 0 : passed / total,
572
- },
573
- };
574
- if (metaResults.length > 0) {
575
- grading.meta_results = metaResults;
576
- grading.meta_summary = {
577
- passed: metaPassed,
578
- failed: metaResults.length - metaPassed,
579
- total: metaResults.length,
580
- skill_invoked: skillInvoked,
581
- };
582
- }
583
- validateAgainstSchema("grading", grading, gradingPath);
584
- writeJson(gradingPath, grading);
585
- const metaTag =
586
- metaResults.length === 0 ? "" : ` [skill_invoked=${skillInvoked}]`;
587
- console.log(
588
- `Wrote ${gradingPath} (${passed}/${total} substantive, rate ${total === 0 ? "n/a" : `${(grading.summary.pass_rate * 100).toFixed(0)}%`})${metaTag}`,
589
- );
590
- }
591
- }
592
-
593
- console.log(
594
- `\nFinalized: ${totalGraded} substantive assertion${totalGraded === 1 ? "" : "s"} graded, ${totalMetaGraded} skill-invocation meta-check${totalMetaGraded === 1 ? "" : "s"} graded, ${totalUnverifiable} transcript_check unverifiable (empty tool_invocations).`,
595
- );
596
- if (metaFailures > 0)
597
- console.warn(
598
- `\n⚠ ${metaFailures} run(s) failed the skill-invocation meta-check. Substantive results for those runs may be unreliable — the skill may not have actually influenced behavior.`,
599
- );
600
- console.log(
601
- `\nNext: bun run evals:aggregate -- --skill ${skill} --iteration ${iteration}`,
602
- );
603
- }
@@ -1,49 +0,0 @@
1
- #!/usr/bin/env bun
2
- // Claude Code PreToolUse hook. Installed (opt-in, via run.ts --guard) to block
3
- // eval subagents from writing outside their sandbox. Reads the hook payload on
4
- // stdin and the guard marker path from argv[2]; emits a `deny` decision for
5
- // out-of-bounds writes/installs. Fails open on any error so it can never brick
6
- // a session.
7
- import { existsSync, readFileSync } from "node:fs";
8
- import { join } from "node:path";
9
- import { decide, type GuardMarker } from "./policy";
10
-
11
- function readMarker(path: string): GuardMarker | null {
12
- if (!existsSync(path)) return null;
13
- try {
14
- return JSON.parse(readFileSync(path, "utf8")) as GuardMarker;
15
- } catch {
16
- return null;
17
- }
18
- }
19
-
20
- if (import.meta.main) {
21
- try {
22
- const markerPath =
23
- Bun.argv[2] ??
24
- join(process.cwd(), ".claude", "skills", ".slow-powers-eval-guard.json");
25
- const payload = JSON.parse((await Bun.stdin.text()) || "{}") as {
26
- tool_name?: string;
27
- tool_input?: unknown;
28
- };
29
- const decision = decide(
30
- payload.tool_name ?? "",
31
- payload.tool_input,
32
- readMarker(markerPath),
33
- );
34
- if (!decision.allow) {
35
- process.stdout.write(
36
- JSON.stringify({
37
- hookSpecificOutput: {
38
- hookEventName: "PreToolUse",
39
- permissionDecision: "deny",
40
- permissionDecisionReason: decision.reason,
41
- },
42
- }),
43
- );
44
- }
45
- } catch {
46
- // fail open — never block a session because the guard itself errored
47
- }
48
- process.exit(0);
49
- }