@os-eco/overstory-cli 0.7.2 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +21 -9
  2. package/agents/builder.md +6 -0
  3. package/agents/coordinator.md +2 -2
  4. package/agents/lead.md +4 -1
  5. package/agents/merger.md +3 -2
  6. package/agents/monitor.md +1 -1
  7. package/agents/reviewer.md +1 -0
  8. package/agents/scout.md +1 -0
  9. package/package.json +2 -2
  10. package/src/agents/hooks-deployer.test.ts +6 -5
  11. package/src/agents/identity.test.ts +3 -2
  12. package/src/agents/manifest.test.ts +4 -3
  13. package/src/agents/overlay.test.ts +3 -2
  14. package/src/commands/agents.test.ts +5 -4
  15. package/src/commands/agents.ts +18 -8
  16. package/src/commands/completions.test.ts +8 -5
  17. package/src/commands/completions.ts +37 -1
  18. package/src/commands/costs.test.ts +4 -3
  19. package/src/commands/dashboard.test.ts +265 -6
  20. package/src/commands/dashboard.ts +367 -64
  21. package/src/commands/doctor.test.ts +3 -2
  22. package/src/commands/errors.test.ts +3 -2
  23. package/src/commands/feed.test.ts +3 -2
  24. package/src/commands/feed.ts +2 -29
  25. package/src/commands/inspect.test.ts +3 -2
  26. package/src/commands/log.test.ts +248 -8
  27. package/src/commands/log.ts +193 -110
  28. package/src/commands/logs.test.ts +3 -2
  29. package/src/commands/mail.test.ts +3 -2
  30. package/src/commands/metrics.test.ts +4 -3
  31. package/src/commands/nudge.test.ts +3 -2
  32. package/src/commands/prime.test.ts +3 -2
  33. package/src/commands/prime.ts +1 -16
  34. package/src/commands/replay.test.ts +3 -2
  35. package/src/commands/run.test.ts +2 -1
  36. package/src/commands/sling.test.ts +127 -0
  37. package/src/commands/sling.ts +101 -3
  38. package/src/commands/status.test.ts +8 -8
  39. package/src/commands/trace.test.ts +3 -2
  40. package/src/commands/watch.test.ts +3 -2
  41. package/src/config.test.ts +3 -3
  42. package/src/doctor/agents.test.ts +3 -2
  43. package/src/doctor/logs.test.ts +3 -2
  44. package/src/doctor/structure.test.ts +3 -2
  45. package/src/index.ts +3 -1
  46. package/src/logging/color.ts +1 -1
  47. package/src/logging/format.test.ts +110 -0
  48. package/src/logging/format.ts +42 -1
  49. package/src/logging/logger.test.ts +3 -2
  50. package/src/mail/client.test.ts +3 -2
  51. package/src/mail/store.test.ts +3 -2
  52. package/src/merge/queue.test.ts +3 -2
  53. package/src/merge/resolver.test.ts +39 -0
  54. package/src/merge/resolver.ts +1 -1
  55. package/src/metrics/pricing.ts +80 -0
  56. package/src/metrics/transcript.test.ts +58 -1
  57. package/src/metrics/transcript.ts +9 -68
  58. package/src/mulch/client.test.ts +63 -2
  59. package/src/mulch/client.ts +62 -1
  60. package/src/runtimes/claude.test.ts +4 -3
  61. package/src/runtimes/pi-guards.test.ts +55 -2
  62. package/src/runtimes/pi-guards.ts +26 -9
  63. package/src/schema-consistency.test.ts +4 -2
  64. package/src/sessions/compat.test.ts +3 -2
  65. package/src/sessions/store.test.ts +3 -2
  66. package/src/test-helpers.ts +20 -1
  67. package/src/tracker/beads.test.ts +454 -0
  68. package/src/tracker/seeds.test.ts +461 -0
  69. package/src/watchdog/daemon.test.ts +4 -3
  70. package/src/watchdog/triage.test.ts +3 -2
@@ -203,6 +203,9 @@ function createMockMulchClient(
203
203
  action: "analyze",
204
204
  };
205
205
  },
206
+ async appendOutcome() {
207
+ // No-op stub: resolver tests don't exercise outcome appending
208
+ },
206
209
  };
207
210
  }
208
211
 
@@ -1440,6 +1443,42 @@ describe("createMergeResolver", () => {
1440
1443
  });
1441
1444
  });
1442
1445
 
1446
+ describe("queryConflictHistory uses sortByScore", () => {
1447
+ test("passes sortByScore: true to mulch search when querying conflict history", async () => {
1448
+ const repoDir = await createTempGitRepo();
1449
+ try {
1450
+ const defaultBranch = await getDefaultBranch(repoDir);
1451
+ await setupContentConflict(repoDir, defaultBranch);
1452
+
1453
+ const entry = makeTestEntry({
1454
+ branchName: "feature-branch",
1455
+ filesModified: ["src/test.ts"],
1456
+ });
1457
+
1458
+ // Capture search call options
1459
+ let capturedSearchOptions: unknown;
1460
+ const mockMulchClient = createMockMulchClient();
1461
+ mockMulchClient.search = async (_query, options) => {
1462
+ capturedSearchOptions = options;
1463
+ return "";
1464
+ };
1465
+
1466
+ const resolver = createMergeResolver({
1467
+ aiResolveEnabled: false,
1468
+ reimagineEnabled: false,
1469
+ mulchClient: mockMulchClient,
1470
+ });
1471
+
1472
+ await resolver.resolve(entry, defaultBranch, repoDir);
1473
+
1474
+ // Verify sortByScore was passed to search
1475
+ expect(capturedSearchOptions).toMatchObject({ sortByScore: true });
1476
+ } finally {
1477
+ await cleanupTempDir(repoDir);
1478
+ }
1479
+ });
1480
+ });
1481
+
1443
1482
  describe("AI-resolve with history context", () => {
1444
1483
  test("includes historical context in AI prompt when available", async () => {
1445
1484
  const repoDir = await createTempGitRepo();
@@ -514,7 +514,7 @@ async function queryConflictHistory(
514
514
  entry: MergeEntry,
515
515
  ): Promise<ConflictHistory> {
516
516
  try {
517
- const searchOutput = await mulchClient.search("merge-conflict");
517
+ const searchOutput = await mulchClient.search("merge-conflict", { sortByScore: true });
518
518
  const patterns = parseConflictPatterns(searchOutput);
519
519
  return buildConflictHistory(patterns, entry.filesModified);
520
520
  } catch {
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Runtime-agnostic pricing and cost estimation for AI models.
3
+ *
4
+ * Extracted from transcript.ts so any runtime can use cost estimation
5
+ * without pulling in Claude Code-specific JSONL parsing logic.
6
+ *
7
+ * To add support for a new provider model, add an entry to MODEL_PRICING
8
+ * using a lowercase substring that uniquely identifies the model tier
9
+ * (e.g. "opus", "sonnet", "haiku").
10
+ */
11
+
12
+ /** Canonical token usage representation shared across all runtimes. */
13
+ export interface TokenUsage {
14
+ inputTokens: number;
15
+ outputTokens: number;
16
+ cacheReadTokens: number;
17
+ cacheCreationTokens: number;
18
+ modelUsed: string | null;
19
+ }
20
+
21
+ /** Pricing per million tokens (USD). */
22
+ export interface ModelPricing {
23
+ inputPerMTok: number;
24
+ outputPerMTok: number;
25
+ cacheReadPerMTok: number;
26
+ cacheCreationPerMTok: number;
27
+ }
28
+
29
+ /** Hardcoded pricing for known Claude models. */
30
+ const MODEL_PRICING: Record<string, ModelPricing> = {
31
+ opus: {
32
+ inputPerMTok: 15,
33
+ outputPerMTok: 75,
34
+ cacheReadPerMTok: 1.5, // 10% of input
35
+ cacheCreationPerMTok: 3.75, // 25% of input
36
+ },
37
+ sonnet: {
38
+ inputPerMTok: 3,
39
+ outputPerMTok: 15,
40
+ cacheReadPerMTok: 0.3, // 10% of input
41
+ cacheCreationPerMTok: 0.75, // 25% of input
42
+ },
43
+ haiku: {
44
+ inputPerMTok: 0.8,
45
+ outputPerMTok: 4,
46
+ cacheReadPerMTok: 0.08, // 10% of input
47
+ cacheCreationPerMTok: 0.2, // 25% of input
48
+ },
49
+ };
50
+
51
+ /**
52
+ * Determine the pricing tier for a given model string.
53
+ * Matches on substring: "opus" -> opus pricing, "sonnet" -> sonnet, "haiku" -> haiku.
54
+ * Returns null if unrecognized.
55
+ */
56
+ export function getPricingForModel(model: string): ModelPricing | null {
57
+ const lower = model.toLowerCase();
58
+ if (lower.includes("opus")) return MODEL_PRICING.opus ?? null;
59
+ if (lower.includes("sonnet")) return MODEL_PRICING.sonnet ?? null;
60
+ if (lower.includes("haiku")) return MODEL_PRICING.haiku ?? null;
61
+ return null;
62
+ }
63
+
64
+ /**
65
+ * Calculate the estimated cost in USD for a given usage and model.
66
+ * Returns null if the model is unrecognized.
67
+ */
68
+ export function estimateCost(usage: TokenUsage): number | null {
69
+ if (usage.modelUsed === null) return null;
70
+
71
+ const pricing = getPricingForModel(usage.modelUsed);
72
+ if (pricing === null) return null;
73
+
74
+ const inputCost = (usage.inputTokens / 1_000_000) * pricing.inputPerMTok;
75
+ const outputCost = (usage.outputTokens / 1_000_000) * pricing.outputPerMTok;
76
+ const cacheReadCost = (usage.cacheReadTokens / 1_000_000) * pricing.cacheReadPerMTok;
77
+ const cacheCreationCost = (usage.cacheCreationTokens / 1_000_000) * pricing.cacheCreationPerMTok;
78
+
79
+ return inputCost + outputCost + cacheReadCost + cacheCreationCost;
80
+ }
@@ -1,8 +1,13 @@
1
1
  /**
2
- * Tests for Claude Code transcript JSONL parser.
2
+ * Tests for Claude Code transcript JSONL parser and pricing.ts module.
3
3
  *
4
4
  * Uses temp files with real-format JSONL data. No mocks.
5
5
  * Philosophy: "never mock what you can use for real" (mx-252b16).
6
+ *
7
+ * Coverage:
8
+ * - parseTranscriptUsage (transcript.ts)
9
+ * - estimateCost re-export (transcript.ts -> pricing.ts)
10
+ * - getPricingForModel (pricing.ts)
6
11
  */
7
12
 
8
13
  import { afterEach, beforeEach, describe, expect, test } from "bun:test";
@@ -10,6 +15,7 @@ import { mkdtemp } from "node:fs/promises";
10
15
  import { tmpdir } from "node:os";
11
16
  import { join } from "node:path";
12
17
  import { cleanupTempDir } from "../test-helpers.ts";
18
+ import { getPricingForModel, estimateCost as pricingEstimateCost } from "./pricing.ts";
13
19
  import { estimateCost, parseTranscriptUsage } from "./transcript.ts";
14
20
 
15
21
  let tempDir: string;
@@ -354,3 +360,54 @@ describe("estimateCost", () => {
354
360
  }
355
361
  });
356
362
  });
363
+
364
+ // === getPricingForModel (pricing.ts) ===
365
+
366
+ describe("getPricingForModel", () => {
367
+ test("matches opus substring", () => {
368
+ const pricing = getPricingForModel("claude-opus-4-6");
369
+ expect(pricing).not.toBeNull();
370
+ if (pricing !== null) {
371
+ expect(pricing.inputPerMTok).toBe(15);
372
+ expect(pricing.outputPerMTok).toBe(75);
373
+ }
374
+ });
375
+
376
+ test("matches sonnet substring", () => {
377
+ const pricing = getPricingForModel("claude-sonnet-4-20250514");
378
+ expect(pricing).not.toBeNull();
379
+ if (pricing !== null) {
380
+ expect(pricing.inputPerMTok).toBe(3);
381
+ expect(pricing.outputPerMTok).toBe(15);
382
+ }
383
+ });
384
+
385
+ test("matches haiku substring", () => {
386
+ const pricing = getPricingForModel("claude-haiku-3-5-20241022");
387
+ expect(pricing).not.toBeNull();
388
+ if (pricing !== null) {
389
+ expect(pricing.inputPerMTok).toBe(0.8);
390
+ expect(pricing.outputPerMTok).toBe(4);
391
+ }
392
+ });
393
+
394
+ test("returns null for unknown model", () => {
395
+ const pricing = getPricingForModel("gpt-4o");
396
+ expect(pricing).toBeNull();
397
+ });
398
+ });
399
+
400
+ // === re-export parity ===
401
+
402
+ describe("estimateCost re-export parity", () => {
403
+ test("transcript.estimateCost and pricing.estimateCost produce same result", () => {
404
+ const usage = {
405
+ inputTokens: 1_000_000,
406
+ outputTokens: 1_000_000,
407
+ cacheReadTokens: 1_000_000,
408
+ cacheCreationTokens: 1_000_000,
409
+ modelUsed: "claude-opus-4-6",
410
+ };
411
+ expect(estimateCost(usage)).toBe(pricingEstimateCost(usage));
412
+ });
413
+ });
@@ -1,8 +1,12 @@
1
1
  /**
2
2
  * Parser for Claude Code transcript JSONL files.
3
3
  *
4
- * Extracts token usage data from assistant-type entries in transcript files
5
- * at ~/.claude/projects/{project-slug}/{session-id}.jsonl.
4
+ * This is a Claude Code-specific JSONL parser that extracts token usage data
5
+ * from assistant-type entries in transcript files at
6
+ * ~/.claude/projects/{project-slug}/{session-id}.jsonl.
7
+ *
8
+ * Runtime-agnostic pricing logic lives in ./pricing.ts. Other runtimes
9
+ * implement their own transcript parsing via AgentRuntime.parseTranscript().
6
10
  *
7
11
  * Each assistant entry contains per-turn usage:
8
12
  * {
@@ -19,74 +23,11 @@
19
23
  * }
20
24
  */
21
25
 
22
- export interface TranscriptUsage {
23
- inputTokens: number;
24
- outputTokens: number;
25
- cacheReadTokens: number;
26
- cacheCreationTokens: number;
27
- modelUsed: string | null;
28
- }
29
-
30
- /** Pricing per million tokens (USD). */
31
- interface ModelPricing {
32
- inputPerMTok: number;
33
- outputPerMTok: number;
34
- cacheReadPerMTok: number;
35
- cacheCreationPerMTok: number;
36
- }
37
-
38
- /** Hardcoded pricing for known Claude models. */
39
- const MODEL_PRICING: Record<string, ModelPricing> = {
40
- opus: {
41
- inputPerMTok: 15,
42
- outputPerMTok: 75,
43
- cacheReadPerMTok: 1.5, // 10% of input
44
- cacheCreationPerMTok: 3.75, // 25% of input
45
- },
46
- sonnet: {
47
- inputPerMTok: 3,
48
- outputPerMTok: 15,
49
- cacheReadPerMTok: 0.3, // 10% of input
50
- cacheCreationPerMTok: 0.75, // 25% of input
51
- },
52
- haiku: {
53
- inputPerMTok: 0.8,
54
- outputPerMTok: 4,
55
- cacheReadPerMTok: 0.08, // 10% of input
56
- cacheCreationPerMTok: 0.2, // 25% of input
57
- },
58
- };
59
-
60
- /**
61
- * Determine the pricing tier for a given model string.
62
- * Matches on substring: "opus" -> opus pricing, "sonnet" -> sonnet, "haiku" -> haiku.
63
- * Returns null if unrecognized.
64
- */
65
- function getPricingForModel(model: string): ModelPricing | null {
66
- const lower = model.toLowerCase();
67
- if (lower.includes("opus")) return MODEL_PRICING.opus ?? null;
68
- if (lower.includes("sonnet")) return MODEL_PRICING.sonnet ?? null;
69
- if (lower.includes("haiku")) return MODEL_PRICING.haiku ?? null;
70
- return null;
71
- }
72
-
73
- /**
74
- * Calculate the estimated cost in USD for a given usage and model.
75
- * Returns null if the model is unrecognized.
76
- */
77
- export function estimateCost(usage: TranscriptUsage): number | null {
78
- if (usage.modelUsed === null) return null;
26
+ import type { TokenUsage } from "./pricing.ts";
79
27
 
80
- const pricing = getPricingForModel(usage.modelUsed);
81
- if (pricing === null) return null;
28
+ export type TranscriptUsage = TokenUsage;
82
29
 
83
- const inputCost = (usage.inputTokens / 1_000_000) * pricing.inputPerMTok;
84
- const outputCost = (usage.outputTokens / 1_000_000) * pricing.outputPerMTok;
85
- const cacheReadCost = (usage.cacheReadTokens / 1_000_000) * pricing.cacheReadPerMTok;
86
- const cacheCreationCost = (usage.cacheCreationTokens / 1_000_000) * pricing.cacheCreationPerMTok;
87
-
88
- return inputCost + outputCost + cacheReadCost + cacheCreationCost;
89
- }
30
+ export { estimateCost } from "./pricing.ts";
90
31
 
91
32
  /**
92
33
  * Narrow an unknown value to determine if it looks like a transcript assistant entry.
@@ -6,10 +6,11 @@
6
6
  */
7
7
 
8
8
  import { afterEach, beforeEach, describe, expect, test } from "bun:test";
9
- import { mkdtemp, rm } from "node:fs/promises";
9
+ import { mkdtemp } from "node:fs/promises";
10
10
  import { tmpdir } from "node:os";
11
11
  import { join } from "node:path";
12
12
  import { AgentError } from "../errors.ts";
13
+ import { cleanupTempDir } from "../test-helpers.ts";
13
14
  import { createMulchClient } from "./client.ts";
14
15
 
15
16
  // Check if mulch is available
@@ -30,7 +31,7 @@ describe("createMulchClient", () => {
30
31
  });
31
32
 
32
33
  afterEach(async () => {
33
- await rm(tempDir, { recursive: true, force: true });
34
+ await cleanupTempDir(tempDir);
34
35
  });
35
36
 
36
37
  /**
@@ -162,6 +163,33 @@ describe("createMulchClient", () => {
162
163
  });
163
164
  expect(typeof result).toBe("string");
164
165
  });
166
+
167
+ test.skipIf(!hasMulch)("passes --sort-by-score flag in prime options", async () => {
168
+ await initMulch();
169
+ const client = createMulchClient(tempDir);
170
+ // mulch prime --sort-by-score may not be supported in older mulch versions;
171
+ // the interface and impl are forward-looking — test accepts both outcomes.
172
+ try {
173
+ const result = await client.prime([], "markdown", { sortByScore: true });
174
+ expect(typeof result).toBe("string");
175
+ } catch (error) {
176
+ expect(error).toBeInstanceOf(AgentError);
177
+ }
178
+ });
179
+
180
+ test.skipIf(!hasMulch)("passes --sort-by-score with --files together", async () => {
181
+ await initMulch();
182
+ const client = createMulchClient(tempDir);
183
+ try {
184
+ const result = await client.prime([], "markdown", {
185
+ files: ["src/config.ts"],
186
+ sortByScore: true,
187
+ });
188
+ expect(typeof result).toBe("string");
189
+ } catch (error) {
190
+ expect(error).toBeInstanceOf(AgentError);
191
+ }
192
+ });
165
193
  });
166
194
 
167
195
  describe("status", () => {
@@ -452,6 +480,39 @@ describe("createMulchClient", () => {
452
480
  expect(typeof result).toBe("string");
453
481
  });
454
482
 
483
+ test.skipIf(!hasMulch)("passes --classification flag when provided", async () => {
484
+ await initMulch();
485
+ const client = createMulchClient(tempDir);
486
+ const result = await client.search("test", { classification: "foundational" });
487
+ expect(typeof result).toBe("string");
488
+ });
489
+
490
+ test.skipIf(!hasMulch)("passes --outcome-status flag when provided (success)", async () => {
491
+ await initMulch();
492
+ const client = createMulchClient(tempDir);
493
+ const result = await client.search("test", { outcomeStatus: "success" });
494
+ expect(typeof result).toBe("string");
495
+ });
496
+
497
+ test.skipIf(!hasMulch)("passes --outcome-status flag when provided (failure)", async () => {
498
+ await initMulch();
499
+ const client = createMulchClient(tempDir);
500
+ const result = await client.search("test", { outcomeStatus: "failure" });
501
+ expect(typeof result).toBe("string");
502
+ });
503
+
504
+ test.skipIf(!hasMulch)("passes all search filters together", async () => {
505
+ await initMulch();
506
+ const client = createMulchClient(tempDir);
507
+ const result = await client.search("test", {
508
+ classification: "tactical",
509
+ outcomeStatus: "success",
510
+ sortByScore: true,
511
+ file: "src/config.ts",
512
+ });
513
+ expect(typeof result).toBe("string");
514
+ });
515
+
455
516
  test.skipIf(!hasMulch)("roundtrip: record via API then search and find it", async () => {
456
517
  await initMulch();
457
518
  const addProc = Bun.spawn(["ml", "add", "roundtrip"], {
@@ -28,9 +28,22 @@ export interface MulchClient {
28
28
  options?: {
29
29
  files?: string[];
30
30
  excludeDomain?: string[];
31
+ sortByScore?: boolean;
31
32
  },
32
33
  ): Promise<string>;
33
34
 
35
+ /** Append an outcome entry to an existing record by ID in the given domain. */
36
+ appendOutcome(
37
+ domain: string,
38
+ id: string,
39
+ outcome: {
40
+ status: "success" | "failure" | "partial";
41
+ agent?: string;
42
+ notes?: string;
43
+ duration?: number;
44
+ },
45
+ ): Promise<void>;
46
+
34
47
  /** Show domain statistics. */
35
48
  status(): Promise<MulchStatus>;
36
49
 
@@ -58,7 +71,15 @@ export interface MulchClient {
58
71
  query(domain?: string): Promise<string>;
59
72
 
60
73
  /** Search records across all domains. */
61
- search(query: string, options?: { file?: string; sortByScore?: boolean }): Promise<string>;
74
+ search(
75
+ query: string,
76
+ options?: {
77
+ file?: string;
78
+ sortByScore?: boolean;
79
+ classification?: string;
80
+ outcomeStatus?: "success" | "failure";
81
+ },
82
+ ): Promise<string>;
62
83
 
63
84
  /** Show expertise record changes since a git ref. */
64
85
  diff(options?: { since?: string }): Promise<MulchDiffResult>;
@@ -214,6 +235,8 @@ interface MulchProgrammaticApi {
214
235
  type?: string;
215
236
  tag?: string;
216
237
  classification?: string;
238
+ outcomeStatus?: "success" | "failure";
239
+ sortByScore?: boolean;
217
240
  file?: string;
218
241
  cwd?: string;
219
242
  },
@@ -222,6 +245,22 @@ interface MulchProgrammaticApi {
222
245
  domain: string,
223
246
  options?: { type?: string; classification?: string; file?: string; cwd?: string },
224
247
  ): Promise<MulchExpertiseRecord[]>;
248
+ appendOutcome(
249
+ domain: string,
250
+ id: string,
251
+ outcome: {
252
+ status: "success" | "failure" | "partial";
253
+ agent?: string;
254
+ notes?: string;
255
+ duration?: number;
256
+ recorded_at?: string;
257
+ },
258
+ options?: { cwd?: string },
259
+ ): Promise<{
260
+ record: MulchExpertiseRecord;
261
+ outcome: { status: string; agent?: string; notes?: string; recorded_at?: string };
262
+ total_outcomes: number;
263
+ }>;
225
264
  }
226
265
 
227
266
  const MULCH_PKG = "@os-eco/mulch-cli";
@@ -406,6 +445,9 @@ export function createMulchClient(cwd: string): MulchClient {
406
445
  if (options?.excludeDomain && options.excludeDomain.length > 0) {
407
446
  args.push("--exclude-domain", ...options.excludeDomain);
408
447
  }
448
+ if (options?.sortByScore) {
449
+ args.push("--sort-by-score");
450
+ }
409
451
  const { stdout } = await runMulch(args, "prime");
410
452
  return stdout;
411
453
  },
@@ -472,6 +514,9 @@ export function createMulchClient(cwd: string): MulchClient {
472
514
  const api = await loadMulchApi();
473
515
  const results = await api.searchExpertise(query, {
474
516
  file: options?.file,
517
+ classification: options?.classification,
518
+ outcomeStatus: options?.outcomeStatus,
519
+ sortByScore: options?.sortByScore,
475
520
  cwd,
476
521
  });
477
522
  return formatSearchResults(results);
@@ -595,5 +640,21 @@ export function createMulchClient(cwd: string): MulchClient {
595
640
  throw new AgentError(`Failed to parse JSON from mulch compact: ${trimmed.slice(0, 200)}`);
596
641
  }
597
642
  },
643
+
644
+ async appendOutcome(domain, id, outcome) {
645
+ const api = await loadMulchApi();
646
+ try {
647
+ await api.appendOutcome(
648
+ domain,
649
+ id,
650
+ { ...outcome, recorded_at: new Date().toISOString() },
651
+ { cwd },
652
+ );
653
+ } catch (error) {
654
+ throw new AgentError(
655
+ `mulch appendOutcome ${domain}/${id} failed: ${error instanceof Error ? error.message : String(error)}`,
656
+ );
657
+ }
658
+ },
598
659
  };
599
660
  }
@@ -1,7 +1,8 @@
1
1
  import { afterEach, beforeEach, describe, expect, test } from "bun:test";
2
- import { mkdtemp, rm } from "node:fs/promises";
2
+ import { mkdtemp } from "node:fs/promises";
3
3
  import { tmpdir } from "node:os";
4
4
  import { join } from "node:path";
5
+ import { cleanupTempDir } from "../test-helpers.ts";
5
6
  import type { ResolvedModel } from "../types.ts";
6
7
  import { ClaudeRuntime } from "./claude.ts";
7
8
  import type { SpawnOpts } from "./types.ts";
@@ -239,7 +240,7 @@ describe("ClaudeRuntime", () => {
239
240
  });
240
241
 
241
242
  afterEach(async () => {
242
- await rm(tempDir, { recursive: true, force: true });
243
+ await cleanupTempDir(tempDir);
243
244
  });
244
245
 
245
246
  test("writes overlay to .claude/CLAUDE.md when overlay is provided", async () => {
@@ -373,7 +374,7 @@ describe("ClaudeRuntime", () => {
373
374
  });
374
375
 
375
376
  afterEach(async () => {
376
- await rm(tempDir, { recursive: true, force: true });
377
+ await cleanupTempDir(tempDir);
377
378
  });
378
379
 
379
380
  test("returns null for non-existent file", async () => {
@@ -349,7 +349,9 @@ describe("generatePiGuardExtension", () => {
349
349
 
350
350
  test("generated code contains pi.exec ov log tool-start in tool_call handler", () => {
351
351
  const generated = generatePiGuardExtension(builderHooks());
352
- expect(generated).toContain('pi.exec("ov", ["log", "tool-start", "--agent", AGENT_NAME])');
352
+ expect(generated).toContain(
353
+ 'pi.exec("ov", ["log", "tool-start", "--agent", AGENT_NAME, "--tool-name", event.toolName])',
354
+ );
353
355
  });
354
356
 
355
357
  test('generated code contains pi.on("tool_execution_end", ...)', () => {
@@ -359,7 +361,9 @@ describe("generatePiGuardExtension", () => {
359
361
 
360
362
  test("generated code contains pi.exec ov log tool-end in tool_execution_end handler", () => {
361
363
  const generated = generatePiGuardExtension(builderHooks());
362
- expect(generated).toContain('pi.exec("ov", ["log", "tool-end", "--agent", AGENT_NAME])');
364
+ expect(generated).toContain(
365
+ 'pi.exec("ov", ["log", "tool-end", "--agent", AGENT_NAME, "--tool-name", event.toolName])',
366
+ );
363
367
  });
364
368
 
365
369
  test('generated code contains pi.on("session_shutdown", ...)', () => {
@@ -373,6 +377,55 @@ describe("generatePiGuardExtension", () => {
373
377
  'await pi.exec("ov", ["log", "session-end", "--agent", AGENT_NAME])',
374
378
  );
375
379
  });
380
+
381
+ test("tool_call handler passes --tool-name event.toolName to tool-start", () => {
382
+ const generated = generatePiGuardExtension(builderHooks());
383
+ expect(generated).toContain(
384
+ 'pi.exec("ov", ["log", "tool-start", "--agent", AGENT_NAME, "--tool-name", event.toolName])',
385
+ );
386
+ });
387
+
388
+ test("tool_execution_end handler passes --tool-name event.toolName to tool-end", () => {
389
+ const generated = generatePiGuardExtension(builderHooks());
390
+ expect(generated).toContain(
391
+ 'pi.exec("ov", ["log", "tool-end", "--agent", AGENT_NAME, "--tool-name", event.toolName])',
392
+ );
393
+ });
394
+
395
+ test("tool_execution_end handler uses named event parameter (not _event)", () => {
396
+ const generated = generatePiGuardExtension(builderHooks());
397
+ expect(generated).toContain('pi.on("tool_execution_end", async (event) => {');
398
+ expect(generated).not.toContain('pi.on("tool_execution_end", async (_event) => {');
399
+ });
400
+
401
+ test('generated code contains pi.on("agent_end", ...)', () => {
402
+ const generated = generatePiGuardExtension(builderHooks());
403
+ expect(generated).toContain('pi.on("agent_end",');
404
+ });
405
+
406
+ test("generated code awaits pi.exec ov log session-end in agent_end handler", () => {
407
+ const generated = generatePiGuardExtension(builderHooks());
408
+ // agent_end handler must await (not fire-and-forget) so it completes
409
+ // before Pi moves on, ensuring the SessionStore is updated.
410
+ const agentEndIdx = generated.indexOf('pi.on("agent_end"');
411
+ const sessionShutdownIdx = generated.indexOf('pi.on("session_shutdown"');
412
+ expect(agentEndIdx).toBeGreaterThan(-1);
413
+ expect(sessionShutdownIdx).toBeGreaterThan(-1);
414
+ // agent_end must come before session_shutdown
415
+ expect(agentEndIdx).toBeLessThan(sessionShutdownIdx);
416
+ // Extract the agent_end handler body
417
+ const handlerBody = generated.slice(agentEndIdx, sessionShutdownIdx);
418
+ expect(handlerBody).toContain(
419
+ 'await pi.exec("ov", ["log", "session-end", "--agent", AGENT_NAME])',
420
+ );
421
+ });
422
+
423
+ test("agent_end handler is present for all capabilities", () => {
424
+ for (const hooks of [builderHooks(), scoutHooks(), coordinatorHooks()]) {
425
+ const generated = generatePiGuardExtension(hooks);
426
+ expect(generated).toContain('pi.on("agent_end",');
427
+ }
428
+ });
376
429
  });
377
430
 
378
431
  describe("PiRuntime integration", () => {