@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,263 @@
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import { mkdirSync, rmSync, utimesSync, writeFileSync } from "node:fs";
3
+ import { tmpdir } from "node:os";
4
+ import { join } from "node:path";
5
+ import {
6
+ findByDescription,
7
+ listSubagents,
8
+ parseTranscript,
9
+ } from "./claude-code-transcript";
10
+
11
+ const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`); // scratch dir under the OS temp dir, suffixed with this process's pid; created in beforeAll and removed in afterAll
12
+
13
/**
 * Serialises `lines` as JSON Lines text: one compact JSON document per
 * entry, separated by "\n", with a single trailing newline.
 */
function jsonl(lines: object[]): string {
  const serialized = lines.map((entry) => JSON.stringify(entry));
  const body = serialized.join("\n");
  return `${body}\n`;
}
16
+
17
// Create the shared scratch directory once, before any test in this file.
beforeAll(function createFixtureRoot() {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

// Remove the scratch directory and everything the tests wrote into it.
// `force` keeps the cleanup from failing if the directory is already gone.
afterAll(function removeFixtureRoot() {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});
24
+
25
describe("parseTranscript", () => {
  // Small builders for transcript records. Property order matches the
  // hand-written fixtures so the serialized JSONL is unchanged.
  const toolUse = (id: string, name: string, input: object) => ({
    type: "tool_use",
    id,
    name,
    input,
  });
  const toolResult = (toolUseId: string, content: unknown) => ({
    type: "tool_result",
    tool_use_id: toolUseId,
    content,
  });
  const textBlock = (text: string) => ({ type: "text", text });
  const assistantTurn = (content: object[]) => ({
    type: "assistant",
    message: { role: "assistant", content },
  });
  const userTurn = (content: string | object[]) => ({
    type: "user",
    message: { role: "user", content },
  });

  test("extracts tool_use blocks from assistant messages with ordinal and args", () => {
    const transcriptPath = join(FIXTURE_ROOT, "simple.jsonl");
    const records = [
      userTurn("Run the tests"),
      assistantTurn([
        textBlock("Running tests now."),
        toolUse("toolu_001", "Bash", { command: "bun test" }),
      ]),
      userTurn([toolResult("toolu_001", "2 pass\n0 fail")]),
      assistantTurn([toolUse("toolu_002", "Read", { file_path: "/tmp/x.txt" })]),
    ];
    writeFileSync(transcriptPath, jsonl(records));

    const invocations = parseTranscript(transcriptPath);
    expect(invocations).toHaveLength(2);

    const [bash, read] = invocations;
    expect(bash).toMatchObject({
      name: "Bash",
      ordinal: 0,
      args: { command: "bun test" },
      result: "2 pass\n0 fail",
    });
    expect(read).toMatchObject({
      name: "Read",
      ordinal: 1,
      args: { file_path: "/tmp/x.txt" },
    });
    // The second tool call never received a tool_result.
    expect(invocations[1].result).toBeUndefined();
  });

  test("returns empty array when no tool_use blocks present", () => {
    const transcriptPath = join(FIXTURE_ROOT, "no-tools.jsonl");
    const records = [userTurn("hi"), assistantTurn([textBlock("hello")])];
    writeFileSync(transcriptPath, jsonl(records));

    expect(parseTranscript(transcriptPath)).toEqual([]);
  });

  test("skips malformed JSONL lines without throwing", () => {
    const transcriptPath = join(FIXTURE_ROOT, "malformed.jsonl");
    const firstLine = JSON.stringify(
      assistantTurn([toolUse("toolu_a", "Bash", { command: "ls" })]),
    );
    const secondLine = JSON.stringify(
      assistantTurn([toolUse("toolu_b", "Read", { file_path: "/tmp" })]),
    );
    // A garbage line sits between two valid records; the trailing "" yields
    // a final newline.
    const fileText = [firstLine, "not valid json", secondLine, ""].join("\n");
    writeFileSync(transcriptPath, fileText);

    const invocations = parseTranscript(transcriptPath);
    expect(invocations).toHaveLength(2);
    expect(invocations.map((inv) => inv.name)).toEqual(["Bash", "Read"]);
  });

  test("handles tool_result with array content", () => {
    const transcriptPath = join(FIXTURE_ROOT, "array-result.jsonl");
    const records = [
      assistantTurn([toolUse("toolu_x", "Bash", { command: "echo hi" })]),
      userTurn([toolResult("toolu_x", [textBlock("hi")])]),
    ];
    writeFileSync(transcriptPath, jsonl(records));

    const invocations = parseTranscript(transcriptPath);
    expect(invocations).toHaveLength(1);
    expect(invocations[0].result).toBe("hi");
  });
});
195
+
196
describe("listSubagents / findByDescription", () => {
  /**
   * Writes `agent-<id>.meta.json` (serialized `meta`) followed by an empty
   * `agent-<id>.jsonl` into `dir`, and returns the transcript path.
   */
  const writeAgent = (dir: string, id: string, meta: object): string => {
    const transcriptPath = join(dir, `agent-${id}.jsonl`);
    writeFileSync(join(dir, `agent-${id}.meta.json`), JSON.stringify(meta));
    writeFileSync(transcriptPath, "");
    return transcriptPath;
  };

  test("matches subagents by meta description", () => {
    const subagentsDir = join(FIXTURE_ROOT, "subagents");
    mkdirSync(subagentsDir, { recursive: true });

    writeAgent(subagentsDir, "aaa111", {
      agentType: "general-purpose",
      description: "claim-without-running:with_skill",
      toolUseId: "toolu_p1",
    });
    writeAgent(subagentsDir, "bbb222", {
      agentType: "general-purpose",
      description: "claim-without-running:without_skill",
      toolUseId: "toolu_p2",
    });

    expect(listSubagents(subagentsDir)).toHaveLength(2);

    const hit = findByDescription(
      subagentsDir,
      "claim-without-running:with_skill",
    );
    expect(hit).not.toBeNull();
    expect(hit?.meta.toolUseId).toBe("toolu_p1");

    const noHit = findByDescription(subagentsDir, "no-such-eval:with_skill");
    expect(noHit).toBeNull();
  });

  test("returns null when subagents dir does not exist", () => {
    const missingDir = join(FIXTURE_ROOT, "does-not-exist");
    expect(listSubagents(missingDir)).toEqual([]);
    expect(findByDescription(missingDir, "x")).toBeNull();
  });

  test("on duplicate descriptions, returns the most-recently-written transcript", () => {
    const subagentsDir = join(FIXTURE_ROOT, "dup-subagents");
    mkdirSync(subagentsDir, { recursive: true });

    // Older agent for this description: mtime pushed one minute into the past.
    const olderTranscript = writeAgent(subagentsDir, "old", {
      description: "dup:with_skill",
      toolUseId: "toolu_old",
    });
    const aMinuteAgo = new Date(Date.now() - 60_000);
    utimesSync(olderTranscript, aMinuteAgo, aMinuteAgo);

    // Newer agent with the same description (e.g. a retry within the same run).
    const newerTranscript = writeAgent(subagentsDir, "new", {
      description: "dup:with_skill",
      toolUseId: "toolu_new",
    });
    const now = new Date();
    utimesSync(newerTranscript, now, now);

    const winner = findByDescription(subagentsDir, "dup:with_skill");
    expect(winner?.meta.toolUseId).toBe("toolu_new");
  });
});
@@ -0,0 +1,146 @@
1
+ import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import type { ToolInvocation } from "../types";
4
+
5
/** A `tool_use` content block; parseTranscript reads these from assistant records. */
interface ToolUseBlock {
  type: "tool_use";
  id: string;
  name: string;
  input: unknown;
}

/** A `tool_result` content block; parseTranscript reads these from user records. */
interface ToolResultBlock {
  type: "tool_result";
  /** Id of the `tool_use` block this result answers. */
  tool_use_id: string;
  content: string | unknown[];
}

/** Any content block; the open `{ type: string }` member covers block kinds this module ignores. */
type ContentBlock = ToolUseBlock | ToolResultBlock | { type: string };

/** The `message` payload of a transcript record. */
interface TranscriptMessage {
  role?: string;
  content?: string | ContentBlock[];
}

/** One parsed line of a JSONL transcript. */
interface TranscriptRecord {
  type: "user" | "assistant" | string;
  message?: TranscriptMessage;
}
27
+
28
/**
 * Normalises a message's `content` into a list of content blocks.
 *
 * Transcript records come from `JSON.parse` with an unchecked cast, so the
 * runtime value is not guaranteed to match the declared type. Anything that
 * is not an array (missing, a plain string, or an unexpected object/number)
 * yields an empty list, so callers can always iterate the result.
 *
 * @param content - The `message.content` value of a transcript record.
 * @returns The block array itself when `content` is an array, otherwise `[]`.
 */
function flattenContent(
  content: string | ContentBlock[] | undefined,
): ContentBlock[] {
  return Array.isArray(content) ? content : [];
}
35
+
36
/**
 * Renders a tool_result `content` value as a single string.
 *
 * - A string is returned unchanged.
 * - An array is rendered item by item and joined with "\n": string items are
 *   kept as-is, objects carrying a `text` property contribute
 *   `String(text)`, and everything else is JSON-serialised.
 * - Any other value is JSON-serialised as a whole.
 */
function stringifyResult(content: ToolResultBlock["content"]): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return JSON.stringify(content);
  }

  const parts: string[] = [];
  for (const item of content) {
    if (typeof item === "string") {
      parts.push(item);
    } else if (item !== null && typeof item === "object" && "text" in item) {
      parts.push(String((item as { text: unknown }).text));
    } else {
      parts.push(JSON.stringify(item));
    }
  }
  return parts.join("\n");
}
49
+
50
/**
 * Reads a JSONL transcript and returns one entry per `tool_use` block found
 * in assistant records, in order of appearance.
 *
 * Each entry carries the tool `name`, its `args` (the block's `input`) and an
 * `ordinal` equal to its index in the returned array. When a later user
 * record contains a `tool_result` block whose `tool_use_id` matches, the
 * stringified result is attached as `result`; results that reference an
 * unknown id are ignored.
 *
 * Parsing is best-effort. Empty lines, lines that are not valid JSON, lines
 * that parse to something other than an object (e.g. `null` or a number),
 * and content entries that are not objects are skipped rather than thrown on.
 *
 * @param jsonlPath - Path of the transcript file.
 * @returns The tool invocations found in the transcript.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
export function parseTranscript(jsonlPath: string): ToolInvocation[] {
  const raw = readFileSync(jsonlPath, "utf8");

  const invocations: ToolInvocation[] = [];
  // tool_use id -> index into `invocations`, so later results can be attached.
  const indexById = new Map<string, number>();

  for (const line of raw.split("\n")) {
    if (line.length === 0) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue;
    }
    // `null`, numbers and strings are valid JSON but not transcript records;
    // dereferencing `.message` on `null` would throw.
    if (parsed === null || typeof parsed !== "object") continue;
    const record = parsed as TranscriptRecord;

    const blocks = flattenContent(record.message?.content);
    // The cast above is unchecked, so confirm the content is iterable.
    if (!Array.isArray(blocks)) continue;

    if (record.type === "assistant") {
      for (const block of blocks) {
        if (block === null || typeof block !== "object") continue;
        if (block.type !== "tool_use") continue;
        const tu = block as ToolUseBlock;
        const ordinal = invocations.length;
        indexById.set(tu.id, ordinal);
        invocations.push({
          name: tu.name,
          args: tu.input,
          ordinal,
        });
      }
      continue;
    }

    if (record.type === "user") {
      for (const block of blocks) {
        if (block === null || typeof block !== "object") continue;
        if (block.type !== "tool_result") continue;
        const tr = block as ToolResultBlock;
        const idx = indexById.get(tr.tool_use_id);
        if (idx === undefined) continue;
        invocations[idx].result = stringifyResult(tr.content);
      }
    }
  }

  return invocations;
}
95
+
96
+ export type SubagentMeta = { // shape expected of a parsed `<base>.meta.json` file; every field is optional
97
+ agentType?: string;
98
+ description?: string; // compared by strict equality in findByDescription
99
+ toolUseId?: string;
100
+ };
101
+
102
+ export type SubagentEntry = { // one subagent found by listSubagents: a meta file plus its sibling transcript
103
+ jsonlPath: string; // `<base>.jsonl` next to the meta file; listSubagents only emits entries where it exists
104
+ metaPath: string; // path of the `<base>.meta.json` file
105
+ meta: SubagentMeta; // JSON.parse result of metaPath
106
+ };
107
+
108
/**
 * Lists the subagents recorded in `subagentsDir`.
 *
 * A subagent is a `<base>.meta.json` file with a sibling `<base>.jsonl`
 * transcript. Entries are skipped, never thrown on, when the transcript is
 * missing, the meta file cannot be read or parsed, or the parsed meta is not
 * a JSON object (`null`, a scalar or an array). Skipping non-objects keeps
 * consumers from dereferencing `entry.meta` on `null`.
 *
 * @param subagentsDir - Directory to scan (not recursive).
 * @returns Entries in `readdirSync` order; `[]` when the directory does not exist.
 */
export function listSubagents(subagentsDir: string): SubagentEntry[] {
  if (!existsSync(subagentsDir)) return [];

  const metaSuffix = ".meta.json";
  const out: SubagentEntry[] = [];

  for (const f of readdirSync(subagentsDir)) {
    if (!f.endsWith(metaSuffix)) continue;
    const base = f.slice(0, -metaSuffix.length);
    const metaPath = join(subagentsDir, f);
    const jsonlPath = join(subagentsDir, `${base}.jsonl`);
    if (!existsSync(jsonlPath)) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(readFileSync(metaPath, "utf8"));
    } catch {
      // Best-effort listing: an unreadable or malformed meta file only hides
      // that one subagent.
      continue;
    }
    if (
      parsed === null ||
      typeof parsed !== "object" ||
      Array.isArray(parsed)
    ) {
      continue;
    }
    out.push({ jsonlPath, metaPath, meta: parsed as SubagentMeta });
  }
  return out;
}
125
+
126
/**
 * Finds the subagent in `subagentsDir` whose meta `description` equals
 * `description` exactly.
 *
 * @param subagentsDir - Directory passed through to `listSubagents`.
 * @param description - Description to match (strict equality).
 * @returns The matching entry, or `null` when nothing matches. When several
 *   entries match, the one whose transcript has the newest mtime wins; on a
 *   tie the entry listed first is kept.
 */
export function findByDescription(
  subagentsDir: string,
  description: string,
): SubagentEntry | null {
  const matches = listSubagents(subagentsDir).filter(
    // `?.` guards against a meta file that parsed to a non-object.
    (e) => e.meta?.description === description,
  );
  if (matches.length === 0) return null;
  if (matches.length === 1) return matches[0];

  // Descriptions are namespaced per iteration+run (see run.ts), so duplicates
  // here mean a retry within the same run. Prefer the most-recently-written
  // transcript; readdir order is not chronological.
  //
  // Each file is stat'ed exactly once. A transcript that cannot be stat'ed
  // ranks as oldest, so it only wins when no candidate can be stat'ed.
  const mtimeOf = (path: string): number => {
    try {
      return statSync(path).mtimeMs;
    } catch {
      return Number.NEGATIVE_INFINITY;
    }
  };

  let newest = matches[0];
  let newestMtime = mtimeOf(newest.jsonlPath);
  for (const candidate of matches.slice(1)) {
    const candidateMtime = mtimeOf(candidate.jsonlPath);
    if (candidateMtime > newestMtime) {
      newest = candidate;
      newestMtime = candidateMtime;
    }
  }
  return newest;
}
@@ -0,0 +1,188 @@
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import {
3
+ existsSync,
4
+ mkdirSync,
5
+ readFileSync,
6
+ rmSync,
7
+ writeFileSync,
8
+ } from "node:fs";
9
+ import { tmpdir } from "node:os";
10
+ import { join } from "node:path";
11
+
12
+ const FIXTURE_ROOT = join( // scratch dir under the OS temp dir, suffixed with this process's pid; created in beforeAll and removed in afterAll
13
+ tmpdir(),
14
+ `slow-powers-aggregate-test-${process.pid}`,
15
+ );
16
+ const AGGREGATE_TS = join(import.meta.dir, "aggregate.ts"); // script spawned by the tests below, resolved relative to Bun's import.meta.dir
17
+
18
// Create the shared scratch directory once, before any test in this file.
beforeAll(function createFixtureRoot() {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

// Remove the scratch directory and every workspace staged beneath it.
// `force` keeps the cleanup from failing if the directory is already gone.
afterAll(function removeFixtureRoot() {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});
25
+
26
/** Writes `value` to `path` as 2-space-indented JSON followed by a newline. */
function writeJson(path: string, value: unknown) {
  const pretty = JSON.stringify(value, null, 2);
  writeFileSync(path, `${pretty}\n`);
}
29
+
30
describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
  /**
   * Stages a skill directory and an `iteration-1` workspace under
   * `FIXTURE_ROOT/<caseName>` and writes the iteration's conditions.json.
   */
  function stageWorkspace(caseName: string) {
    const root = join(FIXTURE_ROOT, caseName);

    // Skill dir + skill-under-test (detectRunContext validates SKILL.md exists)
    const skillDir = join(root, "skill-dir");
    const skillSub = join(skillDir, "mr-review");
    mkdirSync(skillSub, { recursive: true });
    writeFileSync(
      join(skillSub, "SKILL.md"),
      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
    );

    // Working dir that holds the workspace (mirrors stageRoot/workspaceRoot = CWD)
    const cwd = join(root, "work");
    const iterationDir = join(
      cwd,
      "skills-workspace",
      "mr-review",
      "iteration-1",
    );
    mkdirSync(iterationDir, { recursive: true });
    writeJson(join(iterationDir, "conditions.json"), {
      mode: "new-skill",
      conditions: [
        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
        { name: "without_skill", skill_path: null },
      ],
      timestamp: new Date().toISOString(),
      harness: "claude-code",
    });

    return { skillDir, cwd, iterationDir };
  }

  /** Writes grading.json and timing.json for eval `e1` under one condition. */
  function writeGradedRun(
    iterationDir: string,
    condition: string,
    passRate: number,
    totalTokens: number,
    durationMs: number,
  ) {
    const condDir = join(iterationDir, "eval-e1", condition);
    mkdirSync(condDir, { recursive: true });
    writeJson(join(condDir, "grading.json"), {
      assertion_results: [],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: passRate },
    });
    writeJson(join(condDir, "timing.json"), {
      total_tokens: totalTokens,
      duration_ms: durationMs,
    });
  }

  /** Runs aggregate.ts for skill `mr-review`, iteration 1, with `cwd` as working directory. */
  function runAggregate(skillDir: string, cwd: string) {
    const argv = [
      "bun",
      "run",
      AGGREGATE_TS,
      "--skill-dir",
      skillDir,
      "--skill",
      "mr-review",
      "--iteration",
      "1",
    ];
    return Bun.spawnSync(argv, { cwd, stdout: "pipe", stderr: "pipe" });
  }

  test("computes benchmark.json from a hand-built graded workspace under CWD", () => {
    const { skillDir, cwd, iterationDir } = stageWorkspace("agg-basic");
    writeGradedRun(iterationDir, "with_skill", 1, 5000, 1000);
    writeGradedRun(iterationDir, "without_skill", 0, 3000, 1000);

    const run = runAggregate(skillDir, cwd);
    expect(run.exitCode).toBe(0);

    const benchmarkPath = join(iterationDir, "benchmark.json");
    expect(existsSync(benchmarkPath)).toBe(true);
    const benchmark = JSON.parse(readFileSync(benchmarkPath, "utf8")) as {
      delta: { pass_rate: number; total_tokens: number };
      run_summary: Record<string, { pass_rate: { mean: number } }>;
    };
    const { run_summary: summary, delta } = benchmark;
    expect(summary.with_skill.pass_rate.mean).toBe(1);
    expect(summary.without_skill.pass_rate.mean).toBe(0);
    expect(delta.pass_rate).toBe(1);
    expect(delta.total_tokens).toBe(2000);
  });

  test("surfaces stray-writes violations as validity_warnings", () => {
    const { skillDir, cwd, iterationDir } = stageWorkspace("agg-stray");
    for (const condition of ["with_skill", "without_skill"]) {
      writeGradedRun(iterationDir, condition, 1, 100, 1);
    }

    // One recorded violation for e1/with_skill.
    writeJson(join(iterationDir, "stray-writes.json"), {
      generated: new Date().toISOString(),
      iteration: 1,
      totals: { violations: 1, warnings: 0 },
      runs: [
        {
          eval_id: "e1",
          condition: "with_skill",
          violations: [
            {
              tool: "Write",
              path: "/repo/runner/run.ts",
              ordinal: 3,
              reason: "x",
            },
          ],
          warnings: [],
        },
      ],
    });

    const run = runAggregate(skillDir, cwd);
    expect(run.exitCode).toBe(0);

    const benchmark = JSON.parse(
      readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
    ) as { validity_warnings: string[] };
    const mentionsStrayWrite = benchmark.validity_warnings.some(
      (warning) =>
        warning.includes("e1/with_skill") && warning.includes("outside"),
    );
    expect(mentionsStrayWrite).toBe(true);
  });
});