@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,485 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import { mkdirSync, rmSync, utimesSync, writeFileSync } from "node:fs";
3
- import { tmpdir } from "node:os";
4
- import { join } from "node:path";
5
- import {
6
- findByDescription,
7
- listSubagents,
8
- parseTranscript,
9
- parseTranscriptFull,
10
- } from "./claude-code-transcript";
11
-
12
- const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`);
13
-
14
- function jsonl(lines: object[]): string {
15
- return `${lines.map((l) => JSON.stringify(l)).join("\n")}\n`;
16
- }
17
-
18
- beforeAll(() => {
19
- mkdirSync(FIXTURE_ROOT, { recursive: true });
20
- });
21
-
22
- afterAll(() => {
23
- rmSync(FIXTURE_ROOT, { recursive: true, force: true });
24
- });
25
-
26
- describe("parseTranscript", () => {
27
- test("extracts tool_use blocks from assistant messages with ordinal and args", () => {
28
- const path = join(FIXTURE_ROOT, "simple.jsonl");
29
- writeFileSync(
30
- path,
31
- jsonl([
32
- {
33
- type: "user",
34
- message: { role: "user", content: "Run the tests" },
35
- },
36
- {
37
- type: "assistant",
38
- message: {
39
- role: "assistant",
40
- content: [
41
- { type: "text", text: "Running tests now." },
42
- {
43
- type: "tool_use",
44
- id: "toolu_001",
45
- name: "Bash",
46
- input: { command: "bun test" },
47
- },
48
- ],
49
- },
50
- },
51
- {
52
- type: "user",
53
- message: {
54
- role: "user",
55
- content: [
56
- {
57
- type: "tool_result",
58
- tool_use_id: "toolu_001",
59
- content: "2 pass\n0 fail",
60
- },
61
- ],
62
- },
63
- },
64
- {
65
- type: "assistant",
66
- message: {
67
- role: "assistant",
68
- content: [
69
- {
70
- type: "tool_use",
71
- id: "toolu_002",
72
- name: "Read",
73
- input: { file_path: "/tmp/x.txt" },
74
- },
75
- ],
76
- },
77
- },
78
- ]),
79
- );
80
-
81
- const result = parseTranscript(path);
82
- expect(result).toHaveLength(2);
83
- expect(result[0]).toMatchObject({
84
- name: "Bash",
85
- ordinal: 0,
86
- args: { command: "bun test" },
87
- result: "2 pass\n0 fail",
88
- });
89
- expect(result[1]).toMatchObject({
90
- name: "Read",
91
- ordinal: 1,
92
- args: { file_path: "/tmp/x.txt" },
93
- });
94
- expect(result[1].result).toBeUndefined();
95
- });
96
-
97
- test("returns empty array when no tool_use blocks present", () => {
98
- const path = join(FIXTURE_ROOT, "no-tools.jsonl");
99
- writeFileSync(
100
- path,
101
- jsonl([
102
- { type: "user", message: { role: "user", content: "hi" } },
103
- {
104
- type: "assistant",
105
- message: {
106
- role: "assistant",
107
- content: [{ type: "text", text: "hello" }],
108
- },
109
- },
110
- ]),
111
- );
112
- expect(parseTranscript(path)).toEqual([]);
113
- });
114
-
115
- test("skips malformed JSONL lines without throwing", () => {
116
- const path = join(FIXTURE_ROOT, "malformed.jsonl");
117
- writeFileSync(
118
- path,
119
- [
120
- JSON.stringify({
121
- type: "assistant",
122
- message: {
123
- role: "assistant",
124
- content: [
125
- {
126
- type: "tool_use",
127
- id: "toolu_a",
128
- name: "Bash",
129
- input: { command: "ls" },
130
- },
131
- ],
132
- },
133
- }),
134
- "not valid json",
135
- JSON.stringify({
136
- type: "assistant",
137
- message: {
138
- role: "assistant",
139
- content: [
140
- {
141
- type: "tool_use",
142
- id: "toolu_b",
143
- name: "Read",
144
- input: { file_path: "/tmp" },
145
- },
146
- ],
147
- },
148
- }),
149
- "",
150
- ].join("\n"),
151
- );
152
- const result = parseTranscript(path);
153
- expect(result).toHaveLength(2);
154
- expect(result.map((r) => r.name)).toEqual(["Bash", "Read"]);
155
- });
156
-
157
- test("handles tool_result with array content", () => {
158
- const path = join(FIXTURE_ROOT, "array-result.jsonl");
159
- writeFileSync(
160
- path,
161
- jsonl([
162
- {
163
- type: "assistant",
164
- message: {
165
- role: "assistant",
166
- content: [
167
- {
168
- type: "tool_use",
169
- id: "toolu_x",
170
- name: "Bash",
171
- input: { command: "echo hi" },
172
- },
173
- ],
174
- },
175
- },
176
- {
177
- type: "user",
178
- message: {
179
- role: "user",
180
- content: [
181
- {
182
- type: "tool_result",
183
- tool_use_id: "toolu_x",
184
- content: [{ type: "text", text: "hi" }],
185
- },
186
- ],
187
- },
188
- },
189
- ]),
190
- );
191
- const result = parseTranscript(path);
192
- expect(result).toHaveLength(1);
193
- expect(result[0].result).toBe("hi");
194
- });
195
- });
196
-
197
- describe("parseTranscriptFull", () => {
198
- const usage = (output: number) => ({
199
- input_tokens: 100,
200
- cache_creation_input_tokens: 50,
201
- cache_read_input_tokens: 200,
202
- output_tokens: output,
203
- });
204
-
205
- test("sums usage across unique message ids, deduping repeated ids", () => {
206
- // One API response spans multiple jsonl lines (one per content block) and
207
- // repeats the same message.id + usage on each — it must be counted once.
208
- const path = join(FIXTURE_ROOT, "full-dedup.jsonl");
209
- writeFileSync(
210
- path,
211
- jsonl([
212
- {
213
- type: "user",
214
- timestamp: "2026-06-04T10:00:00.000Z",
215
- message: { role: "user", content: "go" },
216
- },
217
- {
218
- type: "assistant",
219
- timestamp: "2026-06-04T10:00:05.000Z",
220
- message: {
221
- id: "msg_aaa",
222
- role: "assistant",
223
- usage: usage(10),
224
- content: [{ type: "text", text: "first block" }],
225
- },
226
- },
227
- {
228
- type: "assistant",
229
- timestamp: "2026-06-04T10:00:06.000Z",
230
- message: {
231
- id: "msg_aaa",
232
- role: "assistant",
233
- usage: usage(10),
234
- content: [
235
- {
236
- type: "tool_use",
237
- id: "toolu_1",
238
- name: "Bash",
239
- input: { command: "ls" },
240
- },
241
- ],
242
- },
243
- },
244
- {
245
- type: "assistant",
246
- timestamp: "2026-06-04T10:01:00.000Z",
247
- message: {
248
- id: "msg_bbb",
249
- role: "assistant",
250
- usage: usage(40),
251
- content: [{ type: "text", text: "done" }],
252
- },
253
- },
254
- ]),
255
- );
256
-
257
- const full = parseTranscriptFull(path);
258
- // msg_aaa counted once (100+50+200+10) + msg_bbb (100+50+200+40) = 750
259
- expect(full.total_tokens).toBe(750);
260
- });
261
-
262
- test("returns null total_tokens when no usage objects present", () => {
263
- const path = join(FIXTURE_ROOT, "full-no-usage.jsonl");
264
- writeFileSync(
265
- path,
266
- jsonl([
267
- {
268
- type: "assistant",
269
- message: {
270
- role: "assistant",
271
- content: [{ type: "text", text: "hi" }],
272
- },
273
- },
274
- ]),
275
- );
276
- expect(parseTranscriptFull(path).total_tokens).toBeNull();
277
- });
278
-
279
- test("derives duration_ms from first and last line timestamps", () => {
280
- const path = join(FIXTURE_ROOT, "full-duration.jsonl");
281
- writeFileSync(
282
- path,
283
- jsonl([
284
- {
285
- type: "user",
286
- timestamp: "2026-06-04T10:00:00.000Z",
287
- message: { role: "user", content: "go" },
288
- },
289
- {
290
- type: "assistant",
291
- timestamp: "2026-06-04T10:02:30.500Z",
292
- message: {
293
- id: "msg_x",
294
- role: "assistant",
295
- content: [{ type: "text", text: "done" }],
296
- },
297
- },
298
- ]),
299
- );
300
- expect(parseTranscriptFull(path).duration_ms).toBe(150_500);
301
- });
302
-
303
- test("returns null duration_ms with fewer than two timestamps", () => {
304
- const path = join(FIXTURE_ROOT, "full-one-ts.jsonl");
305
- writeFileSync(
306
- path,
307
- jsonl([
308
- {
309
- type: "assistant",
310
- timestamp: "2026-06-04T10:00:00.000Z",
311
- message: { role: "assistant", content: [] },
312
- },
313
- { type: "assistant", message: { role: "assistant", content: [] } },
314
- ]),
315
- );
316
- expect(parseTranscriptFull(path).duration_ms).toBeNull();
317
- });
318
-
319
- test("final_text is the concatenated text of the last assistant message", () => {
320
- const path = join(FIXTURE_ROOT, "full-final-text.jsonl");
321
- writeFileSync(
322
- path,
323
- jsonl([
324
- {
325
- type: "assistant",
326
- message: {
327
- id: "msg_1",
328
- role: "assistant",
329
- content: [{ type: "text", text: "intermediate" }],
330
- },
331
- },
332
- {
333
- type: "assistant",
334
- message: {
335
- id: "msg_2",
336
- role: "assistant",
337
- content: [
338
- { type: "text", text: "All tests pass." },
339
- {
340
- type: "tool_use",
341
- id: "toolu_z",
342
- name: "Bash",
343
- input: { command: "true" },
344
- },
345
- { type: "text", text: "Wrapping up." },
346
- ],
347
- },
348
- },
349
- {
350
- type: "user",
351
- message: {
352
- role: "user",
353
- content: [
354
- { type: "tool_result", tool_use_id: "toolu_z", content: "ok" },
355
- ],
356
- },
357
- },
358
- ]),
359
- );
360
- expect(parseTranscriptFull(path).final_text).toBe(
361
- "All tests pass.\nWrapping up.",
362
- );
363
- });
364
-
365
- test("final_text is null when no assistant text exists", () => {
366
- const path = join(FIXTURE_ROOT, "full-no-text.jsonl");
367
- writeFileSync(
368
- path,
369
- jsonl([{ type: "user", message: { role: "user", content: "hi" } }]),
370
- );
371
- expect(parseTranscriptFull(path).final_text).toBeNull();
372
- });
373
-
374
- test("tool_invocations matches parseTranscript output", () => {
375
- const path = join(FIXTURE_ROOT, "full-invocations.jsonl");
376
- writeFileSync(
377
- path,
378
- jsonl([
379
- {
380
- type: "assistant",
381
- timestamp: "2026-06-04T10:00:00.000Z",
382
- message: {
383
- id: "msg_1",
384
- role: "assistant",
385
- usage: usage(5),
386
- content: [
387
- {
388
- type: "tool_use",
389
- id: "toolu_q",
390
- name: "Read",
391
- input: { file_path: "/tmp/a" },
392
- },
393
- ],
394
- },
395
- },
396
- {
397
- type: "user",
398
- timestamp: "2026-06-04T10:00:02.000Z",
399
- message: {
400
- role: "user",
401
- content: [
402
- {
403
- type: "tool_result",
404
- tool_use_id: "toolu_q",
405
- content: "contents",
406
- },
407
- ],
408
- },
409
- },
410
- ]),
411
- );
412
- expect(parseTranscriptFull(path).tool_invocations).toEqual(
413
- parseTranscript(path),
414
- );
415
- });
416
- });
417
-
418
- describe("listSubagents / findByDescription", () => {
419
- test("matches subagents by meta description", () => {
420
- const dir = join(FIXTURE_ROOT, "subagents");
421
- mkdirSync(dir, { recursive: true });
422
-
423
- writeFileSync(
424
- join(dir, "agent-aaa111.meta.json"),
425
- JSON.stringify({
426
- agentType: "general-purpose",
427
- description: "claim-without-running:with_skill",
428
- toolUseId: "toolu_p1",
429
- }),
430
- );
431
- writeFileSync(join(dir, "agent-aaa111.jsonl"), "");
432
-
433
- writeFileSync(
434
- join(dir, "agent-bbb222.meta.json"),
435
- JSON.stringify({
436
- agentType: "general-purpose",
437
- description: "claim-without-running:without_skill",
438
- toolUseId: "toolu_p2",
439
- }),
440
- );
441
- writeFileSync(join(dir, "agent-bbb222.jsonl"), "");
442
-
443
- expect(listSubagents(dir)).toHaveLength(2);
444
-
445
- const match = findByDescription(dir, "claim-without-running:with_skill");
446
- expect(match).not.toBeNull();
447
- expect(match?.meta.toolUseId).toBe("toolu_p1");
448
-
449
- const miss = findByDescription(dir, "no-such-eval:with_skill");
450
- expect(miss).toBeNull();
451
- });
452
-
453
- test("returns null when subagents dir does not exist", () => {
454
- expect(listSubagents(join(FIXTURE_ROOT, "does-not-exist"))).toEqual([]);
455
- expect(
456
- findByDescription(join(FIXTURE_ROOT, "does-not-exist"), "x"),
457
- ).toBeNull();
458
- });
459
-
460
- test("on duplicate descriptions, returns the most-recently-written transcript", () => {
461
- const dir = join(FIXTURE_ROOT, "dup-subagents");
462
- mkdirSync(dir, { recursive: true });
463
-
464
- // Older agent for this description.
465
- writeFileSync(
466
- join(dir, "agent-old.meta.json"),
467
- JSON.stringify({ description: "dup:with_skill", toolUseId: "toolu_old" }),
468
- );
469
- writeFileSync(join(dir, "agent-old.jsonl"), "");
470
- const old = new Date(Date.now() - 60_000);
471
- utimesSync(join(dir, "agent-old.jsonl"), old, old);
472
-
473
- // Newer agent with the same description (e.g. a retry within the same run).
474
- writeFileSync(
475
- join(dir, "agent-new.meta.json"),
476
- JSON.stringify({ description: "dup:with_skill", toolUseId: "toolu_new" }),
477
- );
478
- writeFileSync(join(dir, "agent-new.jsonl"), "");
479
- const recent = new Date();
480
- utimesSync(join(dir, "agent-new.jsonl"), recent, recent);
481
-
482
- const match = findByDescription(dir, "dup:with_skill");
483
- expect(match?.meta.toolUseId).toBe("toolu_new");
484
- });
485
- });