@slowdini/slow-powers-opencode 0.1.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/README.md +32 -13
  2. package/package.json +5 -1
  3. package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
  4. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  5. package/skills/evaluating-skills/SKILL.md +22 -20
  6. package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
  7. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  8. package/skills/evaluating-skills/harness-parity.md +155 -0
  9. package/skills/evaluating-skills/pressure-scenarios.md +1 -1
  10. package/skills/evaluating-skills/runner/README.md +28 -19
  11. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  12. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  13. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  14. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  15. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  16. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  17. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  18. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  19. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  20. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  21. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  22. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  23. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  24. package/skills/evaluating-skills/runner/run.ts +376 -17
  25. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  26. package/skills/evaluating-skills/runner/types.ts +9 -0
  27. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  28. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  29. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  30. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  31. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  32. package/skills/hardening-plans/SKILL.md +1 -1
  33. package/skills/systematic-debugging/SKILL.md +4 -0
  34. package/skills/test-driven-development/SKILL.md +2 -0
  35. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  36. package/skills/verifying-development-work/SKILL.md +99 -0
  37. package/skills/verifying-development-work/code-review.md +68 -0
  38. package/skills/verifying-development-work/comment-review.md +85 -0
  39. package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
  40. package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
  41. package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  43. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  44. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  45. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  46. package/skills/verifying-development-work/evals/evals.json +178 -0
  47. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  48. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  49. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
  50. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
  51. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
  52. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
  53. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
  54. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
  55. package/skills/working-in-isolation/SKILL.md +2 -2
  56. package/skills/writing-skills/SKILL.md +2 -3
  57. package/skills/finishing-a-development-branch/SKILL.md +0 -96
  58. package/skills/finishing-a-development-branch/evals/evals.json +0 -41
  59. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
  60. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
  61. package/skills/verification-before-completion/SKILL.md +0 -65
  62. package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
  63. package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
  64. package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
  65. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  66. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  67. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  68. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  69. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  70. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  71. package/skills/verification-before-completion/evals/evals.json +0 -77
  72. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
  73. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
  74. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
  75. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
  76. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
@@ -22,6 +22,7 @@ Other flags:
22
22
  - `--workspace-dir <path>` (optional) — where iteration artifacts are written. Defaults to `<CWD>/skills-workspace`.
23
23
  - `--harness claude-code` (optional, default `claude-code`; the only supported harness).
24
24
  - `--no-stage`, `--dry-run`, `--iteration <N>`, `--mode <new-skill|revision>`, `--baseline <label>`, `--label <label>` — as before.
25
+ - `--ref <git-ref>` (optional, `snapshot` only) — snapshot the skill (SKILL.md + sibling assets, excluding `evals/`) as it existed at a git ref, read straight from git without touching the working tree. Use it for the common edit-first Mode B order: edit the skill, *then* snapshot the old version with `--ref HEAD` (or any commit/tag/branch) as the baseline. Without `--ref`, `snapshot` reads the working tree as before.
25
26
  - `--only <id,id,...>` / `--skip <id,id,...>` (optional) — run only, or all-but, the named eval ids from `evals.json`. The two are mutually exclusive, and every named id must exist (the run aborts with the available ids listed otherwise). Use this for a cost-conscious reduced-set run instead of temporarily editing `evals.json` down. The pre-flight summary and the `N evals × 2 conditions` count reflect the filtered set.
26
27
  - `--plan-mode` (optional, Claude Code) — inject the harness's verbatim plan-mode procedure as an operating-context layer. When set, the runner reads `profiles/<harness>/plan-mode.md` and emits it (via the session adapter's `renderPlanModeContext`) as a `<system-reminder>` block in every dispatch, after the available-skills block and before the user request. It is identical across the with/without-skill arms and recorded as `plan_mode` in `dispatch.json`. This is issue #142's highest-fidelity in-runner approximation of a real plan mode — still text the agent reads, so a pass is necessary-not-sufficient; see *Seeding conversation context (and its ceiling)* in `../SKILL.md`. Opt-in, and meant only for plan-mode-relevant skills; a harness with no profile aborts the run, leaving the portable dispatch contract unchanged.
27
28
 
@@ -29,7 +30,7 @@ Staging is written under the current working directory: `<CWD>/.claude/skills/`.
29
30
 
30
31
  ## Driving the loop
31
32
 
32
- Every run produces both a `dispatch-manifest.md` (human-readable) and a `dispatch.json` (machine-readable). An agent in a session reads `dispatch.json`, dispatches each task itself, and writes the run/timing records to the paths in each task.
33
+ Every run produces both a `dispatch-manifest.md` (human-readable) and a `dispatch.json` (machine-readable). An agent in a session reads `dispatch.json` and dispatches each task itself. On Claude Code the rest is two fixed-order commands around the judge dispatches — `ingest` (record-runs → fill-transcripts → detect-stray-writes → grade) and `finalize` (grade --finalize → aggregate) — so the whole loop is three runner calls and two dispatch batches. On harnesses without persisted transcripts, the agent writes the records to the paths in each task by hand and runs the chained steps individually (the portable path).
33
34
 
34
35
  ## Quickstart (internal / repo use)
35
36
 
@@ -44,20 +45,20 @@ Maintainers run from the repo root; the npm scripts supply `--skill-dir ./skills
44
45
  bun run evals -- --skill <name> --mode new-skill
45
46
 
46
47
  # 3. Read skills-workspace/<name>/iteration-1/dispatch.json and dispatch each
47
- # task as a fresh general-purpose subagent, writing run.json + timing.json
48
- # to the paths in each task.
48
+ # task as a fresh general-purpose subagent (each writes its own
49
+ # outputs/final-message.md).
49
50
 
50
- # 4. Fill tool_invocations from subagent transcripts:
51
- bun run evals:fill-transcripts -- --skill <name> --iteration 1 \
51
+ # 4. Ingest — record-runs → fill-transcripts → detect-stray-writes → grade,
52
+ # in fixed order (assembles run.json + timing.json from dispatch.json,
53
+ # final-message.md, and the persisted transcripts, then emits judge tasks):
54
+ bun run evals:ingest -- --skill <name> --iteration 1 \
52
55
  --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/
53
56
 
54
- # 5. Grade:
55
- bun run evals:grade -- --skill <name> --iteration 1
56
- # (After judge subagents complete and their responses are written, finalize:)
57
- bun run evals:grade -- --skill <name> --iteration 1 --finalize
57
+ # 5. Dispatch each judge task ingest listed, writing responses to their
58
+ # response_path.
58
59
 
59
- # 6. Aggregate:
60
- bun run evals:aggregate -- --skill <name> --iteration 1
60
+ # 6. Finalize — grade --finalize → aggregate:
61
+ bun run evals:finalize -- --skill <name> --iteration 1
61
62
 
62
63
  # 7. Read skills-workspace/<name>/iteration-1/benchmark.json.
63
64
 
@@ -68,18 +69,24 @@ bun run evals:promote-baseline -- --skill <name> --iteration 1
68
69
 
69
70
  ### Mode B — Evaluate a language change to an existing skill
70
71
 
72
+ The common case is edit-first: you've already changed the skill, then decide to eval.
73
+ Snapshot the *old* version from git — no working-tree dance:
74
+
71
75
  ```bash
72
- # 1. Snapshot current SKILL.md before editing.
73
- bun run evals:snapshot -- --skill <name> --label baseline-2026-05-24
76
+ # 1. Edit skills/<name>/SKILL.md (the "new" version is now in the working tree).
74
77
 
75
- # 2. Edit skills/<name>/SKILL.md.
78
+ # 2. Snapshot the old version straight from git as the baseline.
79
+ bun run evals:snapshot -- --skill <name> --label baseline-2026-05-24 --ref HEAD
76
80
 
77
- # 3. Build the iteration-N workspace, comparing snapshot vs current.
81
+ # 3. Build the iteration-N workspace, comparing baseline (old) vs current (new).
78
82
  bun run evals -- --skill <name> --mode revision --baseline baseline-2026-05-24
79
83
 
80
84
  # 4-7. Same as Mode A.
81
85
  ```
82
86
 
87
+ If you snapshot *before* editing instead, drop `--ref HEAD` from step 2 (it reads the
88
+ working tree) and run it before step 1.
89
+
83
90
  ### Dry run (workspace prep only)
84
91
 
85
92
  ```bash
@@ -102,12 +109,14 @@ If you have the slow-powers plugin installed and a personal skill, you do **not*
102
109
  ## Layout
103
110
 
104
111
  - `context.ts` — `detectRunContext(argv)` builds the `RunContext` every command shares: resolves `--skill-dir`/`--skill`, enumerates sibling skills, resolves `--bootstrap`/`--workspace-dir`, and derives `stageRoot` (CWD) and `workspaceRoot`.
105
- - `run.ts` — orchestrator; builds workspace tree, snapshots SKILL.md, emits dispatch manifest. On Claude Code (default), also stages each condition's snapshot at `<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md` so the subagent can discover and invoke it via the Skill tool, stages every *other* skill found in `--skill-dir` at its natural name so cross-references resolve, and builds the `<session-start-context>` block (see *Environment parity* below). Pass `--no-stage` to opt out and fall back to inlining the SKILL.md into the dispatch prompt. Also handles the `snapshot` subcommand.
112
+ - `run.ts` — orchestrator; builds workspace tree, snapshots SKILL.md, emits dispatch manifest. On Claude Code (default), also stages each condition's snapshot at `<stageRoot>/.claude/skills/slow-powers-eval-<iteration>-<condition>__<skillName>/SKILL.md` so the subagent can discover and invoke it via the Skill tool, stages every *other* skill found in `--skill-dir` at its natural name so cross-references resolve, and builds the `<session-start-context>` block (see *Environment parity* below). Pass `--no-stage` to opt out and fall back to inlining the SKILL.md into the dispatch prompt. Pass `--stage-name <name>` to stage under a verbatim name instead of the eval slug (issue #144 name-confound experiments; single-staging-condition modes only, refuses to clobber an existing dir, registered for next-run cleanup). Also handles the `snapshot`, `ingest`/`finalize` (fixed-order post-dispatch chains over the sibling commands), and `teardown` subcommands.
106
113
  - `grade.ts` — evaluates `transcript_check` assertions directly (regex against `tool_invocations`), emits judge-task files for `llm_judge` assertions, then finalizes by merging judge responses into per-run `grading.json`. The `__skill_invoked` meta-check is code-based on Claude Code when the staged-skill slug is known and `tool_invocations` is populated (deterministic scan for a `Skill` tool call with matching slug); it falls back to an LLM judge looking for behavioral fingerprints when either signal is missing.
107
114
  - `aggregate.ts` — reads grading.json + timing.json from an iteration, writes `benchmark.json` with pass-rate / duration / token stats keyed by condition name.
108
115
  - `promote-baseline.ts` — copies the durable subset of an iteration (`benchmark.json` + each run's `grading.json` + a `BASELINE.md` provenance file) into the skill's version-controlled `evals/baseline/`. Flags: `--skill-dir`/`--skill` (as everywhere), `--iteration <N>` (required), `--label <tag>` (optional, recorded in provenance). Everything else in the workspace stays gitignored.
109
- - `fill-transcripts.ts` — walks the iteration tree, matches each `(eval, condition)` to a subagent transcript by description, parses the transcript with the appropriate adapter, populates `tool_invocations` in `run.json`.
110
- - `adapters/claude-code-transcript.ts` — reads a Claude Code subagent JSONL and returns `ToolInvocation[]`. Also exposes `listSubagents` / `findByDescription` for the fill-transcripts CLI.
116
+ - `record-runs.ts` — assembles a schema-valid `run.json` and backfills `timing.json` for every task in a runner-built iteration, from `dispatch.json` (carry-over fields) + `outputs/final-message.md` (`final_message`, transcript fallback) + the persisted transcript (`tool_invocations`, tokens, duration). Never clobbers existing records without `--overwrite`; transcript-derived timing carries `"source": "transcript"`. Claude-Code-tier, like `fill-transcripts` — transcript-less harnesses keep authoring records manually (the portable path).
117
+ - `fill-transcripts.ts` — walks the iteration tree, matches each `(eval, condition)` to a subagent transcript by description, parses the transcript with the appropriate adapter, populates `tool_invocations` in `run.json`. Subsumed by `record-runs` for runner-built iterations; still the tool for filling a pre-existing (hand- or agent-written) `run.json`.
118
+ - `detect-stray-writes.ts` — scans each run's `tool_invocations` for sandbox breaches and writes `stray-writes.json`: write tools targeting paths outside the run's outputs dir (violations), mutating Bash heuristics (warnings), and **live-source reads** — a read tool or Bash command accessing the live skill-under-test directory instead of its staged copy, the signature of the staged-slug resolution race (skills staged mid-session aren't guaranteed resolvable by the Skill tool, whose registry is built at session start; an agent that hits "Unknown skill" improvises and reads the live source, contaminating its arm). `aggregate` lifts all three into `benchmark.json`'s `validity_warnings`.
119
+ - `adapters/claude-code-transcript.ts` — reads a Claude Code subagent JSONL and returns `ToolInvocation[]` (`parseTranscript`), or the full summary with usage tokens deduped by message id, wall-clock duration, and the last assistant text (`parseTranscriptFull`). Also exposes `listSubagents` / `findByDescription` for transcript correlation.
111
120
  - `types.ts` — shared TypeScript types matching `../schema/*.json`.
112
121
  - `validate.ts` / `validate-all.ts` — validator for `evals.json` against the JSON Schema rules. `validate-all.ts` takes `--skill-dir` and validates every skill's `evals.json` in it.
113
122
 
@@ -137,7 +146,7 @@ For the **`without_skill` / baseline condition** in this realistic environment,
137
146
  - **General fallback.** Harnesses without project-local discovery should keep using `--no-stage`; the inline `<skill>` block in the dispatch prompt is the only skill the subagent sees. Bootstrap is omitted in this mode because its references to other skills would mislead the agent.
138
147
  - **Plan-mode profiles (`--plan-mode`).** The plan-mode operating-context layer is also a harness-specific surface. The procedure text lives in `profiles/<harness>/plan-mode.md` and is wrapped by a `renderPlanModeContext` in that harness's session adapter (`adapters/<harness>-session.ts`), exactly mirroring how `renderAvailableSkillsBlock` is harness-specific. Only `profiles/claude-code/plan-mode.md` exists today; a harness that wants this fidelity layer adds its own profile file (its native plan/research mode procedure) plus a renderer alongside the Claude ones. A harness with no profile simply has no `--plan-mode`, and the portable dispatch contract is unchanged.
139
148
 
140
- The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `harness-parity-check.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
149
+ The committed per-skill baselines (`skills/<skill>/evals/baseline/`) plus the `transcript_check` assertions in the baseline eval suite give other harnesses a concrete target to reproduce: a harness whose adapter populates `tool_invocations` faithfully should be able to re-run a skill's eval and land close to the committed `benchmark.json` delta. See `../harness-parity.md` — the transcript adapter is a parity target, and evals are not production functionality, so a harness can aim high here without risking user-facing behavior.
141
150
 
142
151
  **Operational notes.** Do not run two `run.ts` invocations concurrently against the same CWD — they race on `<stageRoot>/.claude/skills/` and the manifest.
143
152
 
@@ -6,7 +6,7 @@
6
6
  // bullets. Other harnesses (Codex, OpenCode) surface their skills differently,
7
7
  // so this rendering lives in an adapter rather than inline in the harness-
8
8
  // agnostic orchestrator. A new harness adds its own renderer alongside this one
9
- // (see harness-parity-check.md).
9
+ // (see ../../harness-parity.md).
10
10
 
11
11
  import type { AvailableSkill } from "../types";
12
12
 
@@ -33,7 +33,7 @@ export function renderAvailableSkillsBlock(skills: AvailableSkill[]): string {
33
33
  * operating under, not prose it merely reads. The profile text (the verbatim
34
34
  * plan-mode procedure) lives in `../profiles/claude-code/plan-mode.md`; this
35
35
  * adapter owns only the harness-native framing, so a new harness adds its own
36
- * renderer + profile alongside this one (see harness-parity-check.md). Returns
36
+ * renderer + profile alongside this one (see ../../harness-parity.md). Returns
37
37
  * an empty string for empty input so the caller can omit the section entirely.
38
38
  */
39
39
  export function renderPlanModeContext(profileText: string): string {
@@ -6,6 +6,7 @@ import {
6
6
  findByDescription,
7
7
  listSubagents,
8
8
  parseTranscript,
9
+ parseTranscriptFull,
9
10
  } from "./claude-code-transcript";
10
11
 
11
12
  const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`);
@@ -193,6 +194,227 @@ describe("parseTranscript", () => {
193
194
  });
194
195
  });
195
196
 
197
+ describe("parseTranscriptFull", () => {
198
+ const usage = (output: number) => ({
199
+ input_tokens: 100,
200
+ cache_creation_input_tokens: 50,
201
+ cache_read_input_tokens: 200,
202
+ output_tokens: output,
203
+ });
204
+
205
+ test("sums usage across unique message ids, deduping repeated ids", () => {
206
+ // One API response spans multiple jsonl lines (one per content block) and
207
+ // repeats the same message.id + usage on each — it must be counted once.
208
+ const path = join(FIXTURE_ROOT, "full-dedup.jsonl");
209
+ writeFileSync(
210
+ path,
211
+ jsonl([
212
+ {
213
+ type: "user",
214
+ timestamp: "2026-06-04T10:00:00.000Z",
215
+ message: { role: "user", content: "go" },
216
+ },
217
+ {
218
+ type: "assistant",
219
+ timestamp: "2026-06-04T10:00:05.000Z",
220
+ message: {
221
+ id: "msg_aaa",
222
+ role: "assistant",
223
+ usage: usage(10),
224
+ content: [{ type: "text", text: "first block" }],
225
+ },
226
+ },
227
+ {
228
+ type: "assistant",
229
+ timestamp: "2026-06-04T10:00:06.000Z",
230
+ message: {
231
+ id: "msg_aaa",
232
+ role: "assistant",
233
+ usage: usage(10),
234
+ content: [
235
+ {
236
+ type: "tool_use",
237
+ id: "toolu_1",
238
+ name: "Bash",
239
+ input: { command: "ls" },
240
+ },
241
+ ],
242
+ },
243
+ },
244
+ {
245
+ type: "assistant",
246
+ timestamp: "2026-06-04T10:01:00.000Z",
247
+ message: {
248
+ id: "msg_bbb",
249
+ role: "assistant",
250
+ usage: usage(40),
251
+ content: [{ type: "text", text: "done" }],
252
+ },
253
+ },
254
+ ]),
255
+ );
256
+
257
+ const full = parseTranscriptFull(path);
258
+ // msg_aaa counted once (100+50+200+10) + msg_bbb (100+50+200+40) = 750
259
+ expect(full.total_tokens).toBe(750);
260
+ });
261
+
262
+ test("returns null total_tokens when no usage objects present", () => {
263
+ const path = join(FIXTURE_ROOT, "full-no-usage.jsonl");
264
+ writeFileSync(
265
+ path,
266
+ jsonl([
267
+ {
268
+ type: "assistant",
269
+ message: {
270
+ role: "assistant",
271
+ content: [{ type: "text", text: "hi" }],
272
+ },
273
+ },
274
+ ]),
275
+ );
276
+ expect(parseTranscriptFull(path).total_tokens).toBeNull();
277
+ });
278
+
279
+ test("derives duration_ms from first and last line timestamps", () => {
280
+ const path = join(FIXTURE_ROOT, "full-duration.jsonl");
281
+ writeFileSync(
282
+ path,
283
+ jsonl([
284
+ {
285
+ type: "user",
286
+ timestamp: "2026-06-04T10:00:00.000Z",
287
+ message: { role: "user", content: "go" },
288
+ },
289
+ {
290
+ type: "assistant",
291
+ timestamp: "2026-06-04T10:02:30.500Z",
292
+ message: {
293
+ id: "msg_x",
294
+ role: "assistant",
295
+ content: [{ type: "text", text: "done" }],
296
+ },
297
+ },
298
+ ]),
299
+ );
300
+ expect(parseTranscriptFull(path).duration_ms).toBe(150_500);
301
+ });
302
+
303
+ test("returns null duration_ms with fewer than two timestamps", () => {
304
+ const path = join(FIXTURE_ROOT, "full-one-ts.jsonl");
305
+ writeFileSync(
306
+ path,
307
+ jsonl([
308
+ {
309
+ type: "assistant",
310
+ timestamp: "2026-06-04T10:00:00.000Z",
311
+ message: { role: "assistant", content: [] },
312
+ },
313
+ { type: "assistant", message: { role: "assistant", content: [] } },
314
+ ]),
315
+ );
316
+ expect(parseTranscriptFull(path).duration_ms).toBeNull();
317
+ });
318
+
319
+ test("final_text is the concatenated text of the last assistant message", () => {
320
+ const path = join(FIXTURE_ROOT, "full-final-text.jsonl");
321
+ writeFileSync(
322
+ path,
323
+ jsonl([
324
+ {
325
+ type: "assistant",
326
+ message: {
327
+ id: "msg_1",
328
+ role: "assistant",
329
+ content: [{ type: "text", text: "intermediate" }],
330
+ },
331
+ },
332
+ {
333
+ type: "assistant",
334
+ message: {
335
+ id: "msg_2",
336
+ role: "assistant",
337
+ content: [
338
+ { type: "text", text: "All tests pass." },
339
+ {
340
+ type: "tool_use",
341
+ id: "toolu_z",
342
+ name: "Bash",
343
+ input: { command: "true" },
344
+ },
345
+ { type: "text", text: "Wrapping up." },
346
+ ],
347
+ },
348
+ },
349
+ {
350
+ type: "user",
351
+ message: {
352
+ role: "user",
353
+ content: [
354
+ { type: "tool_result", tool_use_id: "toolu_z", content: "ok" },
355
+ ],
356
+ },
357
+ },
358
+ ]),
359
+ );
360
+ expect(parseTranscriptFull(path).final_text).toBe(
361
+ "All tests pass.\nWrapping up.",
362
+ );
363
+ });
364
+
365
+ test("final_text is null when no assistant text exists", () => {
366
+ const path = join(FIXTURE_ROOT, "full-no-text.jsonl");
367
+ writeFileSync(
368
+ path,
369
+ jsonl([{ type: "user", message: { role: "user", content: "hi" } }]),
370
+ );
371
+ expect(parseTranscriptFull(path).final_text).toBeNull();
372
+ });
373
+
374
+ test("tool_invocations matches parseTranscript output", () => {
375
+ const path = join(FIXTURE_ROOT, "full-invocations.jsonl");
376
+ writeFileSync(
377
+ path,
378
+ jsonl([
379
+ {
380
+ type: "assistant",
381
+ timestamp: "2026-06-04T10:00:00.000Z",
382
+ message: {
383
+ id: "msg_1",
384
+ role: "assistant",
385
+ usage: usage(5),
386
+ content: [
387
+ {
388
+ type: "tool_use",
389
+ id: "toolu_q",
390
+ name: "Read",
391
+ input: { file_path: "/tmp/a" },
392
+ },
393
+ ],
394
+ },
395
+ },
396
+ {
397
+ type: "user",
398
+ timestamp: "2026-06-04T10:00:02.000Z",
399
+ message: {
400
+ role: "user",
401
+ content: [
402
+ {
403
+ type: "tool_result",
404
+ tool_use_id: "toolu_q",
405
+ content: "contents",
406
+ },
407
+ ],
408
+ },
409
+ },
410
+ ]),
411
+ );
412
+ expect(parseTranscriptFull(path).tool_invocations).toEqual(
413
+ parseTranscript(path),
414
+ );
415
+ });
416
+ });
417
+
196
418
  describe("listSubagents / findByDescription", () => {
197
419
  test("matches subagents by meta description", () => {
198
420
  const dir = join(FIXTURE_ROOT, "subagents");
@@ -15,12 +15,31 @@ type ToolResultBlock = {
15
15
  content: string | unknown[];
16
16
  };
17
17
 
18
- type ContentBlock = ToolUseBlock | ToolResultBlock | { type: string };
18
+ type TextBlock = {
19
+ type: "text";
20
+ text: string;
21
+ };
22
+
23
+ type ContentBlock =
24
+ | ToolUseBlock
25
+ | ToolResultBlock
26
+ | TextBlock
27
+ | { type: string };
28
+
29
+ type UsageRecord = {
30
+ input_tokens?: number;
31
+ output_tokens?: number;
32
+ cache_creation_input_tokens?: number;
33
+ cache_read_input_tokens?: number;
34
+ };
19
35
 
20
36
  type TranscriptRecord = {
21
37
  type: "user" | "assistant" | string;
38
+ timestamp?: string;
22
39
  message?: {
40
+ id?: string;
23
41
  role?: string;
42
+ usage?: UsageRecord;
24
43
  content?: string | ContentBlock[];
25
44
  };
26
45
  };
@@ -47,21 +66,25 @@ function stringifyResult(content: ToolResultBlock["content"]): string {
47
66
  return JSON.stringify(content);
48
67
  }
49
68
 
50
- export function parseTranscript(jsonlPath: string): ToolInvocation[] {
69
+ function readRecords(jsonlPath: string): TranscriptRecord[] {
51
70
  const raw = readFileSync(jsonlPath, "utf8");
52
- const lines = raw.split("\n").filter((l) => l.length > 0);
53
-
54
- const invocations: ToolInvocation[] = [];
55
- const indexById = new Map<string, number>();
56
-
57
- for (const line of lines) {
58
- let record: TranscriptRecord;
71
+ const records: TranscriptRecord[] = [];
72
+ for (const line of raw.split("\n")) {
73
+ if (line.length === 0) continue;
59
74
  try {
60
- record = JSON.parse(line) as TranscriptRecord;
75
+ records.push(JSON.parse(line) as TranscriptRecord);
61
76
  } catch {
62
- continue;
77
+ // skip malformed lines
63
78
  }
79
+ }
80
+ return records;
81
+ }
82
+
83
+ function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
84
+ const invocations: ToolInvocation[] = [];
85
+ const indexById = new Map<string, number>();
64
86
 
87
+ for (const record of records) {
65
88
  const blocks = flattenContent(record.message?.content);
66
89
 
67
90
  if (record.type === "assistant") {
@@ -93,6 +116,79 @@ export function parseTranscript(jsonlPath: string): ToolInvocation[] {
93
116
  return invocations;
94
117
  }
95
118
 
119
+ export function parseTranscript(jsonlPath: string): ToolInvocation[] {
120
+ return extractInvocations(readRecords(jsonlPath));
121
+ }
122
+
123
+ export type TranscriptSummary = {
124
+ tool_invocations: ToolInvocation[];
125
+ /**
126
+ * Sum of usage across unique API responses. One response spans multiple
127
+ * jsonl lines (one per content block) and repeats the same `message.id` +
128
+ * `usage` on each, so totals are deduped by `message.id`. Includes cache
129
+ * creation/read tokens — a different accounting than the harness's task
130
+ * completion event.
131
+ */
132
+ total_tokens: number | null;
133
+ /** Wall clock between the first and last line timestamps. */
134
+ duration_ms: number | null;
135
+ /** Concatenated text blocks of the last assistant message. */
136
+ final_text: string | null;
137
+ };
138
+
139
+ export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
140
+ const records = readRecords(jsonlPath);
141
+
142
+ const usageById = new Map<string, UsageRecord>();
143
+ let firstTs: number | null = null;
144
+ let lastTs: number | null = null;
145
+ let timestampCount = 0;
146
+ let finalText: string | null = null;
147
+
148
+ for (const record of records) {
149
+ if (record.timestamp) {
150
+ const ts = Date.parse(record.timestamp);
151
+ if (!Number.isNaN(ts)) {
152
+ if (firstTs === null) firstTs = ts;
153
+ lastTs = ts;
154
+ timestampCount++;
155
+ }
156
+ }
157
+
158
+ if (record.type !== "assistant") continue;
159
+
160
+ const { id, usage } = record.message ?? {};
161
+ if (id && usage) usageById.set(id, usage);
162
+
163
+ const texts = flattenContent(record.message?.content)
164
+ .filter((b): b is TextBlock => b.type === "text")
165
+ .map((b) => b.text);
166
+ if (texts.length > 0) finalText = texts.join("\n");
167
+ }
168
+
169
+ let totalTokens: number | null = null;
170
+ if (usageById.size > 0) {
171
+ totalTokens = 0;
172
+ for (const usage of usageById.values()) {
173
+ totalTokens +=
174
+ (usage.input_tokens ?? 0) +
175
+ (usage.output_tokens ?? 0) +
176
+ (usage.cache_creation_input_tokens ?? 0) +
177
+ (usage.cache_read_input_tokens ?? 0);
178
+ }
179
+ }
180
+
181
+ return {
182
+ tool_invocations: extractInvocations(records),
183
+ total_tokens: totalTokens,
184
+ duration_ms:
185
+ timestampCount >= 2 && firstTs !== null && lastTs !== null
186
+ ? lastTs - firstTs
187
+ : null,
188
+ final_text: finalText,
189
+ };
190
+ }
191
+
96
192
  export type SubagentMeta = {
97
193
  agentType?: string;
98
194
  description?: string;