@poncho-ai/harness 0.52.1 → 0.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @poncho-ai/harness@0.52.1 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
2
+ > @poncho-ai/harness@0.53.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
3
3
  > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
4
4
 
5
5
  [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
8
8
  CLI tsup v8.5.1
9
9
  CLI Target: es2022
10
10
  ESM Build start
11
- ESM dist/index.js 536.24 KB
12
11
  ESM dist/isolate-F2PPSUL6.js 53.82 KB
13
- ESM ⚡️ Build success in 235ms
12
+ ESM dist/index.js 541.42 KB
13
+ ESM ⚡️ Build success in 239ms
14
14
  DTS Build start
15
- DTS ⚡️ Build success in 8126ms
16
- DTS dist/index.d.ts 92.40 KB
15
+ DTS ⚡️ Build success in 7708ms
16
+ DTS dist/index.d.ts 94.04 KB
package/CHANGELOG.md CHANGED
@@ -1,5 +1,111 @@
1
1
  # @poncho-ai/harness
2
2
 
3
+ ## 0.53.0
4
+
5
+ ### Minor Changes
6
+
7
+ - [#145](https://github.com/cesr/poncho-ai/pull/145) [`bfa4976`](https://github.com/cesr/poncho-ai/commit/bfa4976ac8b05a300e22271e23c3bae4aadae2a8) Thanks [@cesr](https://github.com/cesr)! - events: add stable identity so streaming clients match instead of guess
8
+
9
+ Additive fields that let a streaming client reconstruct view-state by
10
+ identity rather than inferring structure from event order (the source of a
11
+ class of reconnect/subagent rendering bugs):
12
+ - `tool:started` / `tool:completed` / `tool:error` now carry `toolCallId`
13
+ (already in scope as `call.id` / `result.callId`). Clients match tool
14
+ pills by id instead of by tool name.
15
+ - `subagent:spawned|completed|error|stopped` now carry `parentToolCallId`
16
+ (the `spawn_subagent` tool call's id) and `task`; `completed`/`error`
17
+ also carry `resultText`. Clients attach subagent state to the spawning
18
+ tool's pill and render the result inline — no header-regex or
19
+ sequential-cursor pairing needed.
20
+ - `ToolContext` gains `toolCallId` so the `spawn_subagent` handler can
21
+ record which call produced the subagent (plumbed: tool-dispatcher →
22
+ spawn handler → `SubagentSpawnOptions.parentToolCallId` →
23
+ `subagentMeta.parentToolCallId` → the events above).
24
+ - `run:started` gains an optional `cause` field in the type
25
+ (`user|continuation|subagent_callback|approval_resume`); emission is
26
+ deferred to a later pass.
27
+
28
+ All fields are additive; older clients ignore them.
29
+
30
+ ### Patch Changes
31
+
32
+ - Updated dependencies [[`bfa4976`](https://github.com/cesr/poncho-ai/commit/bfa4976ac8b05a300e22271e23c3bae4aadae2a8)]:
33
+ - @poncho-ai/sdk@1.15.0
34
+
35
+ ## 0.52.2
36
+
37
+ ### Patch Changes
38
+
39
+ - [#124](https://github.com/cesr/poncho-ai/pull/124) [`4ae26e0`](https://github.com/cesr/poncho-ai/commit/4ae26e0d8d2788f57411f9c17e10766769514f9b) Thanks [@cesr](https://github.com/cesr)! - harness: postgres retry covers exec/transaction + 3 attempts + tighter idle
40
+
41
+ Follow-up to the previous `idle_timeout`/`max_lifetime`/retry patch.
42
+ Live testing on Railway showed the previous values weren't tight
43
+ enough — `write CONNECTION_ENDED postgres.railway.internal:5432`
44
+ still surfaced both during user-facing chat turns and during
45
+ subagent auto-callback reruns, despite the new config and the
46
+ one-shot retry.
47
+
48
+ Two failure modes the previous version didn't cover:
49
+ 1. The retry only wrapped `private query()` (executor.run/get/all),
50
+ but `executor.exec` (`sql.unsafe`) and `executor.transaction`
51
+ (`sql.begin`) called the postgres.js client directly. A pg drop
52
+ inside a transaction or migration write threw straight through.
53
+ 2. After an idle period the pool can have multiple stale sockets;
54
+ a single retry can checkout a second stale socket from the pool
55
+ and fail again. One-shot retry exhausted into an error visible
56
+ to the caller.
57
+
58
+ Fixes:
59
+ - All three executor paths (`run/get/all`, `exec`, `transaction`)
60
+ now go through the same `runWithRetry` wrapper. Transactions
61
+ only retry the connection-level `CONNECTION_ENDED` reject from
62
+ the postgres.js client — actual SQL errors mid-transaction
63
+ surface as a different error class and bypass the retry,
64
+ preserving atomic semantics.
65
+ - Three attempts with light exponential backoff (0, 50ms, 200ms).
66
+ Enough to ride out a typical staleness wave; if all three fail
67
+ the network is genuinely broken.
68
+ - `CONNECT_TIMEOUT` and `ECONNRESET` added to the retry-eligible
69
+ error codes.
70
+
71
+ Config knobs tightened:
72
+ - `idle_timeout: 5` (was 20). Empirically Railway's pg drops
73
+ sockets well before 20s; 5s wins the race in practice while
74
+ staying long enough for bursty workloads to reuse connections.
75
+ - `max_lifetime: 300` (was 600). Same reasoning — recycle more
76
+ aggressively.
77
+ - `connect_timeout: 10` (was 30 default). Faster failure during
78
+ incidents lets callers shed load instead of stacking up.
79
+
80
+ - [#144](https://github.com/cesr/poncho-ai/pull/144) [`28d640b`](https://github.com/cesr/poncho-ai/commit/28d640b2f82ea780f8e0be90965972d9903c01d7) Thanks [@cesr](https://github.com/cesr)! - orchestrator: make subagent result delivery reliable
81
+
82
+ Subagent results could silently never reach the parent agent. Several
83
+ plumbing bugs in `runSubagent` / `runSubagentContinuation`:
84
+ - **Emit-before-persist race.** `subagent:completed` / `subagent:error`
85
+ were emitted to the parent's event stream _before_ the result was
86
+ written to the store, so a consumer reacting to the event (the parent
87
+ callback, the streaming client) could race the write. Now the result
88
+ is persisted first, then the event is emitted.
89
+ - **Silently swallowed writes.** Two `appendSubagentResult(...).catch(() => {})`
90
+ call sites (the error path and the continuation-error path) dropped the
91
+ result with no trace on a transient store failure. Replaced with a
92
+ shared `appendSubagentResultReliable` helper that retries once and then
93
+ logs loudly — a dropped result is the worst failure mode (the parent
94
+ waits forever on a subagent it thinks is still running).
95
+ - **Un-awaited eventSink.** The subagent-callback run path was the lone
96
+ `this.eventSink(...)` call site that didn't `await` (every other site
97
+ does), so callback-turn events could interleave out of order. Now awaited.
98
+ - **Spawn rejections went to a bare `console.error`.** A background
99
+ `runSubagent` that rejected outside its own try/catch left the parent
100
+ hanging. Both fire-and-forget spawn paths now route to a
101
+ `handleSpawnFailure` that marks the child errored and hands the parent
102
+ an error result so the turn can resume.
103
+ - **`recoverStaleSubagents` now also drains undelivered results.** It
104
+ previously only rescued children stuck in `running`; it now also
105
+ re-triggers the parent callback for any parent that has results sitting
106
+ in the store with no active run (e.g. a result persisted just before a
107
+ process restart, whose in-memory callback trigger was lost).
108
+
3
109
  ## 0.52.1
4
110
 
5
111
  ### Patch Changes
package/dist/index.d.ts CHANGED
@@ -199,6 +199,9 @@ interface Conversation {
199
199
  * subagent's runs emit no telemetry (e.g. spawned from an incognito
200
200
  * turn). Read by the orchestrator's runSubagent / continuation. */
201
201
  suppressTelemetry?: boolean;
202
+ /** The parent's `spawn_subagent` tool call id — echoed onto subagent:*
203
+ * events so a client can attach subagent state to that tool's pill. */
204
+ parentToolCallId?: string;
202
205
  };
203
206
  channelMeta?: {
204
207
  platform: string;
@@ -1190,6 +1193,10 @@ interface SubagentManager {
1190
1193
  /** Inherit the parent run's telemetry choice — when true, the subagent
1191
1194
  * run (and its re-runs) emit no telemetry. */
1192
1195
  suppressTelemetry?: boolean;
1196
+ /** The id of the `spawn_subagent` tool call that produced this subagent,
1197
+ * so its events can carry `parentToolCallId` and a client can attach
1198
+ * subagent state to the spawning tool's pill. */
1199
+ parentToolCallId?: string;
1193
1200
  }): Promise<SubagentSpawnResult>;
1194
1201
  sendMessage(subagentId: string, message: string): Promise<SubagentSpawnResult>;
1195
1202
  stop(subagentId: string): Promise<void>;
@@ -1802,22 +1809,27 @@ declare class PostgresEngine extends SqlStorageEngine {
1802
1809
  private patchVfs;
1803
1810
  private query;
1804
1811
  /**
1805
- * Single retry on a transient connection-layer failure. The
1806
- * `idle_timeout` / `max_lifetime` config above prevents *most*
1807
- * stale-connection cases, but a query can still race a
1808
- * provider-initiated drop in flight — the postgres.js client
1809
- * rejects with `code: "CONNECTION_ENDED"` and the next attempt
1810
- * checks out a fresh connection from the pool. One retry is
1811
- * enough; if it fails again the host-side network is genuinely
1812
- * broken and the caller should see the error.
1812
+ * Retry on transient connection-layer failures. Three attempts
1813
+ * with exponential-ish backoff (0, 50ms, 200ms) — the pool may
1814
+ * have multiple stale sockets accumulated during an idle period
1815
+ * (especially on managed Postgres after boot when no traffic
1816
+ * has flowed for a while), so a single retry can land on a
1817
+ * second stale socket and still fail. Three attempts virtually
1818
+ * always exhausts the staleness wave; if all three throw, the
1819
+ * failure is real and the caller should see it.
1813
1820
  *
1814
- * Only retries reads + the standard exec/run paths in `query`;
1815
- * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
1816
- * `sql.begin(...)` transactions are unwrapped those are
1817
- * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
1818
- * atomically scoped (transactions roll back cleanly), and adding
1819
- * a retry around them would complicate the transaction
1820
- * semantics.
1821
+ * Applied to every pg path the executor exposes:
1822
+ * - `query()` (run/get/all) — natural retry: queries are
1823
+ * idempotent at the connection-failure boundary because the
1824
+ * server-side rollback runs cleanly on socket close.
1825
+ * - `exec(sql)` for DDL — `CREATE TABLE IF NOT EXISTS` and
1826
+ * friends are idempotent by construction.
1827
+ * - `transaction(fn)` — only retried when the
1828
+ * CONNECTION_ENDED reject arrives *before* the transaction
1829
+ * body started executing on the connection; if it errors
1830
+ * mid-transaction, the postgres.js client surfaces a
1831
+ * different error class (the inner SQL error) and bypasses
1832
+ * this retry, preserving the all-or-nothing semantics.
1821
1833
  */
1822
1834
  private runWithRetry;
1823
1835
  private addToPathCache;
@@ -2145,6 +2157,22 @@ declare class AgentOrchestrator {
2145
2157
  processSubagentCallback(conversationId: string, skipLockCheck?: boolean): Promise<void>;
2146
2158
  runSubagentContinuation(conversationId: string, conversation: Conversation, continuationMessages: Message[]): AsyncGenerator<AgentEvent>;
2147
2159
  createSubagentManager(): SubagentManager;
2160
+ /**
2161
+ * Append a subagent result to its parent, retrying once on a transient
2162
+ * store failure before giving up loudly. A silently dropped result is the
2163
+ * worst subagent failure mode — the parent waits forever on a subagent it
2164
+ * thinks is still running — so this never swallows the error the way the
2165
+ * old `.catch(() => {})` call sites did. Returns whether the result landed.
2166
+ */
2167
+ private appendSubagentResultReliable;
2168
+ /**
2169
+ * A subagent's fire-and-forget background run rejected outside its own
2170
+ * error handling (e.g. it threw before entering its try block, or the
2171
+ * catch block itself threw). Without this the parent is left waiting on a
2172
+ * subagent that will never report back. Record the failure on the child
2173
+ * and hand the parent an error result so the turn can resume.
2174
+ */
2175
+ private handleSpawnFailure;
2148
2176
  recoverStaleSubagents(): Promise<void>;
2149
2177
  }
2150
2178
 
package/dist/index.js CHANGED
@@ -4415,12 +4415,12 @@ var PostgresEngine = class extends SqlStorageEngine {
4415
4415
  return rows;
4416
4416
  },
4417
4417
  exec: async (sql) => {
4418
- await this.sql.unsafe(sql);
4418
+ await this.runWithRetry(() => this.sql.unsafe(sql));
4419
4419
  },
4420
4420
  transaction: async (fn) => {
4421
- await this.sql.begin(async () => {
4421
+ await this.runWithRetry(() => this.sql.begin(async () => {
4422
4422
  await fn();
4423
- });
4423
+ }));
4424
4424
  }
4425
4425
  };
4426
4426
  }
@@ -4438,25 +4438,34 @@ var PostgresEngine = class extends SqlStorageEngine {
4438
4438
  prepare: false,
4439
4439
  // Connection-pool resilience. Managed Postgres providers
4440
4440
  // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
4441
- // connections server-side after a few minutes. Without these
4442
- // knobs, porsager/postgres keeps stale sockets in the pool;
4443
- // the next query on one rejects with
4444
- // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
4445
- // as a hard failure to the caller. Two complementary settings:
4441
+ // connections server-side after a few minutes — and on
4442
+ // Railway in particular, mid-stream drops within a few
4443
+ // seconds of inactivity are common. Without these knobs,
4444
+ // porsager/postgres keeps stale sockets in the pool; the
4445
+ // next query on one rejects with
4446
+ // `write CONNECTION_ENDED <host>:5432` at `durMs=0`,
4447
+ // surfacing as a hard failure to the caller.
4446
4448
  //
4447
- // - `idle_timeout: 20` closes idle connections client-side
4448
- // after 20s, before any reasonable provider-side timer
4449
- // fires. Fresh connection on next checkout = no stale
4450
- // socket race.
4451
- // - `max_lifetime: 600` (10 min) recycles long-lived
4452
- // connections defensively even if they've stayed busy,
4453
- // which sidesteps a separate class of provider-side
4454
- // "max connection age" limits.
4449
+ // - `idle_timeout: 5` closes idle connections client-side
4450
+ // aggressively. Empirically Railway's pg drops sockets
4451
+ // well before the 20s value that managed-provider docs
4452
+ // suggest; 5s is short enough to win the race in
4453
+ // practice while staying long enough that bursty
4454
+ // workloads still get connection reuse.
4455
+ // - `max_lifetime: 300` (5 min) recycles long-lived
4456
+ // connections defensively. Even with idle_timeout, a
4457
+ // connection that's been actively serving small queries
4458
+ // for an hour can hit provider-side max-age limits.
4459
+ // - `connect_timeout: 10` — slightly less patient on
4460
+ // initial connect than the 30s default. Combined with
4461
+ // the retry below, "connection refused" surfaces faster
4462
+ // during incidents and the caller can shed load instead
4463
+ // of stacking up.
4455
4464
  //
4456
- // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
4457
- // pool size + initial connect behavior unchanged.
4458
- idle_timeout: 20,
4459
- max_lifetime: 60 * 10
4465
+ // Pool size (`max: 10`) unchanged.
4466
+ idle_timeout: 5,
4467
+ max_lifetime: 60 * 5,
4468
+ connect_timeout: 10
4460
4469
  });
4461
4470
  }
4462
4471
  async initialize() {
@@ -4505,33 +4514,47 @@ var PostgresEngine = class extends SqlStorageEngine {
4505
4514
  );
4506
4515
  }
4507
4516
  /**
4508
- * Single retry on a transient connection-layer failure. The
4509
- * `idle_timeout` / `max_lifetime` config above prevents *most*
4510
- * stale-connection cases, but a query can still race a
4511
- * provider-initiated drop in flight — the postgres.js client
4512
- * rejects with `code: "CONNECTION_ENDED"` and the next attempt
4513
- * checks out a fresh connection from the pool. One retry is
4514
- * enough; if it fails again the host-side network is genuinely
4515
- * broken and the caller should see the error.
4517
+ * Retry on transient connection-layer failures. Three attempts
4518
+ * with exponential-ish backoff (0, 50ms, 200ms) — the pool may
4519
+ * have multiple stale sockets accumulated during an idle period
4520
+ * (especially on managed Postgres after boot when no traffic
4521
+ * has flowed for a while), so a single retry can land on a
4522
+ * second stale socket and still fail. Three attempts virtually
4523
+ * always exhausts the staleness wave; if all three throw, the
4524
+ * failure is real and the caller should see it.
4516
4525
  *
4517
- * Only retries reads + the standard exec/run paths in `query`;
4518
- * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
4519
- * `sql.begin(...)` transactions are unwrapped — those are
4520
- * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
4521
- * atomically scoped (transactions roll back cleanly), and adding
4522
- * a retry around them would complicate the transaction
4523
- * semantics.
4526
+ * Applied to every pg path the executor exposes:
4527
+ * - `query()` (run/get/all) — natural retry: queries are
4528
+ * idempotent at the connection-failure boundary because the
4529
+ * server-side rollback runs cleanly on socket close.
4530
+ * - `exec(sql)` for DDL — `CREATE TABLE IF NOT EXISTS` and
4531
+ * friends are idempotent by construction.
4532
+ * - `transaction(fn)` — only retried when the
4533
+ * CONNECTION_ENDED reject arrives *before* the transaction
4534
+ * body started executing on the connection; if it errors
4535
+ * mid-transaction, the postgres.js client surfaces a
4536
+ * different error class (the inner SQL error) and bypasses
4537
+ * this retry, preserving the all-or-nothing semantics.
4524
4538
  */
4525
4539
  async runWithRetry(fn) {
4526
- try {
4527
- return await fn();
4528
- } catch (err) {
4529
- const code = err?.code;
4530
- if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
4540
+ const backoffs = [0, 50, 200];
4541
+ let lastErr;
4542
+ for (let attempt = 0; attempt < backoffs.length; attempt++) {
4543
+ if (backoffs[attempt] > 0) {
4544
+ await new Promise((r) => setTimeout(r, backoffs[attempt]));
4545
+ }
4546
+ try {
4531
4547
  return await fn();
4548
+ } catch (err) {
4549
+ lastErr = err;
4550
+ const code = err?.code;
4551
+ if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED" || code === "CONNECT_TIMEOUT" || code === "ECONNRESET") {
4552
+ continue;
4553
+ }
4554
+ throw err;
4532
4555
  }
4533
- throw err;
4534
4556
  }
4557
+ throw lastErr;
4535
4558
  }
4536
4559
  addToPathCache(tenantId, path) {
4537
4560
  const paths = this.pathCache.get(tenantId);
@@ -8300,7 +8323,8 @@ var createSubagentTools = (manager) => [
8300
8323
  parentConversationId: conversationId,
8301
8324
  ownerId,
8302
8325
  tenantId: context.tenantId,
8303
- suppressTelemetry: context.suppressTelemetry
8326
+ suppressTelemetry: context.suppressTelemetry,
8327
+ parentToolCallId: context.toolCallId
8304
8328
  });
8305
8329
  return { subagentId, status: "running" };
8306
8330
  }
@@ -8562,7 +8586,7 @@ var ToolDispatcher = class {
8562
8586
  };
8563
8587
  }
8564
8588
  try {
8565
- const output = await definition.handler(call.input, context);
8589
+ const output = await definition.handler(call.input, { ...context, toolCallId: call.id });
8566
8590
  if (context.abortSignal?.aborted) {
8567
8591
  return {
8568
8592
  callId: call.id,
@@ -11147,7 +11171,7 @@ ${textContent}` };
11147
11171
  return;
11148
11172
  }
11149
11173
  const runtimeToolName = exposedToolNames.get(call.name) ?? call.name;
11150
- yield pushEvent({ type: "tool:started", tool: runtimeToolName, input: call.input });
11174
+ yield pushEvent({ type: "tool:started", tool: runtimeToolName, toolCallId: call.id, input: call.input });
11151
11175
  if (this.requiresApprovalForToolCall(runtimeToolName, call.input)) {
11152
11176
  approvalNeeded.push({
11153
11177
  approvalId: `approval_${randomUUID5()}`,
@@ -11340,6 +11364,7 @@ ${textContent}` };
11340
11364
  yield pushEvent({
11341
11365
  type: "tool:error",
11342
11366
  tool: result2.tool,
11367
+ toolCallId: result2.callId,
11343
11368
  error: result2.error,
11344
11369
  recoverable: true
11345
11370
  });
@@ -11381,6 +11406,7 @@ ${textContent}` };
11381
11406
  yield pushEvent({
11382
11407
  type: "tool:completed",
11383
11408
  tool: result2.tool,
11409
+ toolCallId: result2.callId,
11384
11410
  input: callInputMap.get(result2.callId),
11385
11411
  output: result2.output,
11386
11412
  duration: now() - batchStart,
@@ -12803,11 +12829,14 @@ var AgentOrchestrator = class {
12803
12829
  result: { status: "completed", response: responseText, steps: 0, tokens: { input: 0, output: 0, cached: 0 }, duration: 0 },
12804
12830
  timestamp: Date.now()
12805
12831
  };
12806
- await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
12832
+ await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
12807
12833
  await this.eventSink(conv.parentConversationId, {
12808
12834
  type: "subagent:completed",
12809
12835
  subagentId,
12810
- conversationId: subagentId
12836
+ conversationId: subagentId,
12837
+ task: conv.subagentMeta?.task ?? conv.title,
12838
+ parentToolCallId: conv.subagentMeta?.parentToolCallId,
12839
+ resultText: responseText
12811
12840
  });
12812
12841
  await this.triggerParentCallback(conv.parentConversationId);
12813
12842
  }
@@ -12883,9 +12912,11 @@ var AgentOrchestrator = class {
12883
12912
  let latestRunId = "";
12884
12913
  let runResult;
12885
12914
  let runError;
12915
+ let parentToolCallId;
12886
12916
  try {
12887
12917
  const conversation = await this.conversationStore.getWithArchive(childConversationId);
12888
12918
  if (!conversation) throw new Error("Subagent conversation not found");
12919
+ parentToolCallId = conversation.subagentMeta?.parentToolCallId;
12889
12920
  if (conversation.subagentMeta?.status === "stopped") return;
12890
12921
  conversation.lastActivityAt = Date.now();
12891
12922
  await this.conversationStore.update(conversation);
@@ -13077,12 +13108,6 @@ var AgentOrchestrator = class {
13077
13108
  };
13078
13109
  await this.conversationStore.update(conv);
13079
13110
  }
13080
- this.hooks?.onStreamEnd?.(childConversationId);
13081
- await this.eventSink(parentConversationId, {
13082
- type: "subagent:completed",
13083
- subagentId: childConversationId,
13084
- conversationId: childConversationId
13085
- });
13086
13111
  let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
13087
13112
  if (!gathered) {
13088
13113
  const freshSubConv = await this.conversationStore.get(childConversationId);
@@ -13104,7 +13129,16 @@ var AgentOrchestrator = class {
13104
13129
  ...abnormal ? { error: { code: runError?.code ?? "SUBAGENT_INCOMPLETE", message: runError?.message ?? "subagent ended without a result" } } : {},
13105
13130
  timestamp: Date.now()
13106
13131
  };
13107
- await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult);
13132
+ await this.appendSubagentResultReliable(parentConversationId, pendingResult);
13133
+ this.hooks?.onStreamEnd?.(childConversationId);
13134
+ await this.eventSink(parentConversationId, {
13135
+ type: "subagent:completed",
13136
+ subagentId: childConversationId,
13137
+ conversationId: childConversationId,
13138
+ task,
13139
+ parentToolCallId,
13140
+ resultText: subagentResponse
13141
+ });
13108
13142
  this.triggerParentCallback(parentConversationId).catch(
13109
13143
  (err) => console.error(`[poncho][subagent] Parent callback failed:`, err instanceof Error ? err.message : err)
13110
13144
  );
@@ -13121,13 +13155,6 @@ var AgentOrchestrator = class {
13121
13155
  conv.updatedAt = Date.now();
13122
13156
  await this.conversationStore.update(conv);
13123
13157
  }
13124
- this.hooks?.onStreamEnd?.(childConversationId);
13125
- await this.eventSink(parentConversationId, {
13126
- type: "subagent:error",
13127
- subagentId: childConversationId,
13128
- conversationId: childConversationId,
13129
- error: errMsg
13130
- });
13131
13158
  const pendingResult = {
13132
13159
  subagentId: childConversationId,
13133
13160
  task,
@@ -13135,7 +13162,15 @@ var AgentOrchestrator = class {
13135
13162
  error: { code: "SUBAGENT_ERROR", message: errMsg },
13136
13163
  timestamp: Date.now()
13137
13164
  };
13138
- await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult).catch(() => {
13165
+ await this.appendSubagentResultReliable(parentConversationId, pendingResult);
13166
+ this.hooks?.onStreamEnd?.(childConversationId);
13167
+ await this.eventSink(parentConversationId, {
13168
+ type: "subagent:error",
13169
+ subagentId: childConversationId,
13170
+ conversationId: childConversationId,
13171
+ error: errMsg,
13172
+ task,
13173
+ parentToolCallId
13139
13174
  });
13140
13175
  this.triggerParentCallback(parentConversationId).catch(
13141
13176
  (err2) => console.error(`[poncho][subagent] Parent callback failed:`, err2 instanceof Error ? err2.message : err2)
@@ -13251,12 +13286,12 @@ ${resultBody}`,
13251
13286
  },
13252
13287
  initialContextTokens: conversation.contextTokens ?? 0,
13253
13288
  initialContextWindow: conversation.contextWindow ?? 0,
13254
- onEvent: (event) => {
13289
+ onEvent: async (event) => {
13255
13290
  if (event.type === "run:started") {
13256
13291
  const active = this.activeConversationRuns.get(conversationId);
13257
13292
  if (active) active.runId = event.runId;
13258
13293
  }
13259
- this.eventSink(conversationId, event);
13294
+ await this.eventSink(conversationId, event);
13260
13295
  }
13261
13296
  });
13262
13297
  flushTurnDraft(execution.draft);
@@ -13442,11 +13477,6 @@ ${resultBody}`,
13442
13477
  await this.conversationStore.update(conv);
13443
13478
  }
13444
13479
  this.activeSubagentRuns.delete(conversationId);
13445
- await this.eventSink(parentConversationId, {
13446
- type: "subagent:completed",
13447
- subagentId: conversationId,
13448
- conversationId
13449
- });
13450
13480
  let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
13451
13481
  if (!gathered) {
13452
13482
  const freshSubConv = await this.conversationStore.get(conversationId);
@@ -13464,7 +13494,17 @@ ${resultBody}`,
13464
13494
  ...abnormal ? { error: { code: runError?.code ?? "SUBAGENT_INCOMPLETE", message: runError?.message ?? "subagent ended without a result" } } : {},
13465
13495
  timestamp: Date.now()
13466
13496
  };
13467
- await this.conversationStore.appendSubagentResult(parentConversationId, result);
13497
+ await this.appendSubagentResultReliable(parentConversationId, result);
13498
+ }
13499
+ await this.eventSink(parentConversationId, {
13500
+ type: "subagent:completed",
13501
+ subagentId: conversationId,
13502
+ conversationId,
13503
+ task,
13504
+ parentToolCallId: conversation.subagentMeta?.parentToolCallId,
13505
+ resultText: subagentResponse
13506
+ });
13507
+ if (parentConv) {
13468
13508
  if (this.isServerless) {
13469
13509
  this.hooks.dispatchBackground("subagent-callback", parentConversationId);
13470
13510
  } else {
@@ -13492,11 +13532,6 @@ ${resultBody}`,
13492
13532
  conv.updatedAt = Date.now();
13493
13533
  await this.conversationStore.update(conv);
13494
13534
  }
13495
- await this.eventSink(conversation.parentConversationId, {
13496
- type: "subagent:completed",
13497
- subagentId: conversationId,
13498
- conversationId
13499
- });
13500
13535
  const parentConv = await this.conversationStore.get(conversation.parentConversationId);
13501
13536
  if (parentConv) {
13502
13537
  const result = {
@@ -13506,12 +13541,22 @@ ${resultBody}`,
13506
13541
  error: { code: "CONTINUATION_ERROR", message: err instanceof Error ? err.message : String(err) },
13507
13542
  timestamp: Date.now()
13508
13543
  };
13509
- await this.conversationStore.appendSubagentResult(conversation.parentConversationId, result);
13544
+ await this.appendSubagentResultReliable(conversation.parentConversationId, result);
13545
+ }
13546
+ await this.eventSink(conversation.parentConversationId, {
13547
+ type: "subagent:completed",
13548
+ subagentId: conversationId,
13549
+ conversationId,
13550
+ task,
13551
+ parentToolCallId: conversation.subagentMeta?.parentToolCallId
13552
+ });
13553
+ if (parentConv) {
13510
13554
  if (this.isServerless) {
13511
13555
  this.hooks.dispatchBackground("subagent-callback", conversation.parentConversationId);
13512
13556
  } else {
13513
- this.processSubagentCallback(conversation.parentConversationId).catch(() => {
13514
- });
13557
+ this.processSubagentCallback(conversation.parentConversationId).catch(
13558
+ (err2) => console.error(`[poncho][subagent] Continuation-error callback failed:`, err2 instanceof Error ? err2.message : err2)
13559
+ );
13515
13560
  }
13516
13561
  }
13517
13562
  }
@@ -13533,7 +13578,7 @@ ${resultBody}`,
13533
13578
  opts.tenantId ?? null,
13534
13579
  {
13535
13580
  parentConversationId: opts.parentConversationId,
13536
- subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry },
13581
+ subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry, parentToolCallId: opts.parentToolCallId },
13537
13582
  messages: [{ role: "user", content: opts.task }]
13538
13583
  }
13539
13584
  );
@@ -13545,7 +13590,8 @@ ${resultBody}`,
13545
13590
  type: "subagent:spawned",
13546
13591
  subagentId: conversation.conversationId,
13547
13592
  conversationId: conversation.conversationId,
13548
- task: opts.task
13593
+ task: opts.task,
13594
+ parentToolCallId: opts.parentToolCallId
13549
13595
  });
13550
13596
  if (this.isServerless) {
13551
13597
  this.hooks.dispatchBackground("subagent-run", conversation.conversationId);
@@ -13555,7 +13601,7 @@ ${resultBody}`,
13555
13601
  opts.parentConversationId,
13556
13602
  opts.task,
13557
13603
  opts.ownerId
13558
- ).catch((err) => console.error(`[poncho][subagent] Background spawn failed:`, err instanceof Error ? err.message : err));
13604
+ ).catch((err) => this.handleSpawnFailure(conversation.conversationId, opts.parentConversationId, opts.task, err));
13559
13605
  }
13560
13606
  return { subagentId: conversation.conversationId };
13561
13607
  },
@@ -13588,7 +13634,7 @@ ${resultBody}`,
13588
13634
  conversation.parentConversationId,
13589
13635
  message,
13590
13636
  conversation.ownerId
13591
- ).catch((err) => console.error(`[poncho][subagent] Background sendMessage failed:`, err instanceof Error ? err.message : err));
13637
+ ).catch((err) => this.handleSpawnFailure(subagentId, conversation.parentConversationId, message, err));
13592
13638
  }
13593
13639
  return { subagentId };
13594
13640
  },
@@ -13667,6 +13713,67 @@ ${resultBody}`,
13667
13713
  };
13668
13714
  }
13669
13715
  // ── Stale subagent recovery ──
13716
+ /**
13717
+ * Append a subagent result to its parent, retrying once on a transient
13718
+ * store failure before giving up loudly. A silently dropped result is the
13719
+ * worst subagent failure mode — the parent waits forever on a subagent it
13720
+ * thinks is still running — so this never swallows the error the way the
13721
+ * old `.catch(() => {})` call sites did. Returns whether the result landed.
13722
+ */
13723
+ async appendSubagentResultReliable(parentConversationId, result) {
13724
+ try {
13725
+ await this.conversationStore.appendSubagentResult(parentConversationId, result);
13726
+ return true;
13727
+ } catch (firstErr) {
13728
+ try {
13729
+ await this.conversationStore.appendSubagentResult(parentConversationId, result);
13730
+ return true;
13731
+ } catch (secondErr) {
13732
+ console.error(
13733
+ `[poncho][subagent] FAILED to persist result for subagent ${result.subagentId} to parent ${parentConversationId} after 2 attempts \u2014 the parent will not see this result:`,
13734
+ secondErr instanceof Error ? secondErr.message : secondErr,
13735
+ `(first attempt: ${firstErr instanceof Error ? firstErr.message : firstErr})`
13736
+ );
13737
+ return false;
13738
+ }
13739
+ }
13740
+ }
13741
+ /**
13742
+ * A subagent's fire-and-forget background run rejected outside its own
13743
+ * error handling (e.g. it threw before entering its try block, or the
13744
+ * catch block itself threw). Without this the parent is left waiting on a
13745
+ * subagent that will never report back. Record the failure on the child
13746
+ * and hand the parent an error result so the turn can resume.
13747
+ */
13748
+ async handleSpawnFailure(childConversationId, parentConversationId, task, err) {
13749
+ const message = err instanceof Error ? err.message : String(err);
13750
+ console.error(`[poncho][subagent] Background run failed for ${childConversationId}:`, message);
13751
+ try {
13752
+ const conv = await this.conversationStore.get(childConversationId);
13753
+ if (conv?.subagentMeta && conv.subagentMeta.status === "running") {
13754
+ conv.subagentMeta = {
13755
+ ...conv.subagentMeta,
13756
+ status: "error",
13757
+ error: { code: "SUBAGENT_SPAWN_FAILED", message }
13758
+ };
13759
+ conv.updatedAt = Date.now();
13760
+ await this.conversationStore.update(conv);
13761
+ }
13762
+ } catch {
13763
+ }
13764
+ const appended = await this.appendSubagentResultReliable(parentConversationId, {
13765
+ subagentId: childConversationId,
13766
+ task,
13767
+ status: "error",
13768
+ error: { code: "SUBAGENT_SPAWN_FAILED", message },
13769
+ timestamp: Date.now()
13770
+ });
13771
+ if (appended) {
13772
+ this.triggerParentCallback(parentConversationId).catch(
13773
+ (e) => console.error(`[poncho][subagent] Parent callback failed after spawn failure:`, e instanceof Error ? e.message : e)
13774
+ );
13775
+ }
13776
+ }
13670
13777
  async recoverStaleSubagents() {
13671
13778
  const allSummaries = await this.conversationStore.listSummaries();
13672
13779
  const subagentSummaries = allSummaries.filter((s) => s.parentConversationId);
@@ -13692,11 +13799,20 @@ ${resultBody}`,
13692
13799
  error: conv.subagentMeta.error,
13693
13800
  timestamp: Date.now()
13694
13801
  };
13695
- await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
13802
+ await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
13696
13803
  parentsToCallback.add(conv.parentConversationId);
13697
13804
  }
13698
13805
  }
13699
13806
  }
13807
+ const parentIds = new Set(
13808
+ subagentSummaries.map((s) => s.parentConversationId).filter((id) => !!id)
13809
+ );
13810
+ for (const parentId of parentIds) {
13811
+ if (parentsToCallback.has(parentId)) continue;
13812
+ if (this.activeConversationRuns.has(parentId)) continue;
13813
+ const parent = await this.conversationStore.get(parentId);
13814
+ if (parent?.pendingSubagentResults?.length) parentsToCallback.add(parentId);
13815
+ }
13700
13816
  for (const parentId of parentsToCallback) {
13701
13817
  this.processSubagentCallback(parentId).catch(
13702
13818
  (err) => console.error(`[poncho][subagent] Recovery callback failed for ${parentId}:`, err instanceof Error ? err.message : err)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@poncho-ai/harness",
3
- "version": "0.52.1",
3
+ "version": "0.53.0",
4
4
  "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
5
5
  "repository": {
6
6
  "type": "git",
@@ -34,7 +34,7 @@
34
34
  "mustache": "^4.2.0",
35
35
  "yaml": "^2.4.0",
36
36
  "zod": "^3.22.0",
37
- "@poncho-ai/sdk": "1.14.0"
37
+ "@poncho-ai/sdk": "1.15.0"
38
38
  },
39
39
  "peerDependencies": {
40
40
  "esbuild": ">=0.17.0",
package/src/harness.ts CHANGED
@@ -3344,7 +3344,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
3344
3344
  return;
3345
3345
  }
3346
3346
  const runtimeToolName = exposedToolNames.get(call.name) ?? call.name;
3347
- yield pushEvent({ type: "tool:started", tool: runtimeToolName, input: call.input });
3347
+ yield pushEvent({ type: "tool:started", tool: runtimeToolName, toolCallId: call.id, input: call.input });
3348
3348
  if (this.requiresApprovalForToolCall(runtimeToolName, call.input)) {
3349
3349
  approvalNeeded.push({
3350
3350
  approvalId: `approval_${randomUUID()}`,
@@ -3563,6 +3563,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
3563
3563
  yield pushEvent({
3564
3564
  type: "tool:error",
3565
3565
  tool: result.tool,
3566
+ toolCallId: result.callId,
3566
3567
  error: result.error,
3567
3568
  recoverable: true,
3568
3569
  });
@@ -3604,6 +3605,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
3604
3605
  yield pushEvent({
3605
3606
  type: "tool:completed",
3606
3607
  tool: result.tool,
3608
+ toolCallId: result.callId,
3607
3609
  input: callInputMap.get(result.callId),
3608
3610
  output: result.output,
3609
3611
  duration: now() - batchStart,
@@ -693,12 +693,15 @@ export class AgentOrchestrator {
693
693
  result: { status: "completed", response: responseText, steps: 0, tokens: { input: 0, output: 0, cached: 0 }, duration: 0 },
694
694
  timestamp: Date.now(),
695
695
  };
696
- await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
696
+ await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
697
697
 
698
698
  await this.eventSink(conv.parentConversationId, {
699
699
  type: "subagent:completed",
700
700
  subagentId,
701
701
  conversationId: subagentId,
702
+ task: conv.subagentMeta?.task ?? conv.title,
703
+ parentToolCallId: conv.subagentMeta?.parentToolCallId,
704
+ resultText: responseText,
702
705
  });
703
706
 
704
707
  await this.triggerParentCallback(conv.parentConversationId);
@@ -796,10 +799,14 @@ export class AgentOrchestrator {
796
799
  let latestRunId = "";
797
800
  let runResult: { status: "completed" | "error" | "cancelled"; response?: string; steps: number; duration: number; continuation?: boolean; continuationMessages?: Message[] } | undefined;
798
801
  let runError: { code?: string; message?: string } | undefined;
802
+ // The spawning tool call's id — echoed onto subagent:* events so the
803
+ // client can attach subagent state to that tool's pill.
804
+ let parentToolCallId: string | undefined;
799
805
 
800
806
  try {
801
807
  const conversation = await this.conversationStore.getWithArchive(childConversationId);
802
808
  if (!conversation) throw new Error("Subagent conversation not found");
809
+ parentToolCallId = conversation.subagentMeta?.parentToolCallId;
803
810
 
804
811
  if (conversation.subagentMeta?.status === "stopped") return;
805
812
 
@@ -1012,13 +1019,6 @@ export class AgentOrchestrator {
1012
1019
  await this.conversationStore.update(conv);
1013
1020
  }
1014
1021
 
1015
- this.hooks?.onStreamEnd?.(childConversationId);
1016
- await this.eventSink(parentConversationId, {
1017
- type: "subagent:completed",
1018
- subagentId: childConversationId,
1019
- conversationId: childConversationId,
1020
- });
1021
-
1022
1022
  // Recover the subagent's real output: prefer the run response, then the
1023
1023
  // streamed draft, then walk the transcript — discarding the synthetic
1024
1024
  // "[Error: ...]" placeholder at each step.
@@ -1051,7 +1051,21 @@ export class AgentOrchestrator {
1051
1051
  : {}),
1052
1052
  timestamp: Date.now(),
1053
1053
  };
1054
- await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult);
1054
+ // Persist the result BEFORE emitting subagent:completed: a consumer
1055
+ // reacting to the event (the parent callback, the streaming client)
1056
+ // must find the result already durable in the store, not race its write.
1057
+ await this.appendSubagentResultReliable(parentConversationId, pendingResult);
1058
+
1059
+ this.hooks?.onStreamEnd?.(childConversationId);
1060
+ await this.eventSink(parentConversationId, {
1061
+ type: "subagent:completed",
1062
+ subagentId: childConversationId,
1063
+ conversationId: childConversationId,
1064
+ task,
1065
+ parentToolCallId,
1066
+ resultText: subagentResponse,
1067
+ });
1068
+
1055
1069
  this.triggerParentCallback(parentConversationId).catch(err =>
1056
1070
  console.error(`[poncho][subagent] Parent callback failed:`, err instanceof Error ? err.message : err),
1057
1071
  );
@@ -1070,22 +1084,26 @@ export class AgentOrchestrator {
1070
1084
  await this.conversationStore.update(conv);
1071
1085
  }
1072
1086
 
1087
+ const pendingResult: PendingSubagentResult = {
1088
+ subagentId: childConversationId,
1089
+ task,
1090
+ status: "error",
1091
+ error: { code: "SUBAGENT_ERROR", message: errMsg },
1092
+ timestamp: Date.now(),
1093
+ };
1094
+ // Persist before emitting (see the success path); never swallow.
1095
+ await this.appendSubagentResultReliable(parentConversationId, pendingResult);
1096
+
1073
1097
  this.hooks?.onStreamEnd?.(childConversationId);
1074
1098
  await this.eventSink(parentConversationId, {
1075
1099
  type: "subagent:error",
1076
1100
  subagentId: childConversationId,
1077
1101
  conversationId: childConversationId,
1078
1102
  error: errMsg,
1103
+ task,
1104
+ parentToolCallId,
1079
1105
  });
1080
1106
 
1081
- const pendingResult: PendingSubagentResult = {
1082
- subagentId: childConversationId,
1083
- task,
1084
- status: "error",
1085
- error: { code: "SUBAGENT_ERROR", message: errMsg },
1086
- timestamp: Date.now(),
1087
- };
1088
- await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult).catch(() => {});
1089
1107
  this.triggerParentCallback(parentConversationId).catch(err2 =>
1090
1108
  console.error(`[poncho][subagent] Parent callback failed:`, err2 instanceof Error ? err2.message : err2),
1091
1109
  );
@@ -1221,12 +1239,15 @@ export class AgentOrchestrator {
1221
1239
  },
1222
1240
  initialContextTokens: conversation.contextTokens ?? 0,
1223
1241
  initialContextWindow: conversation.contextWindow ?? 0,
1224
- onEvent: (event) => {
1242
+ onEvent: async (event) => {
1225
1243
  if (event.type === "run:started") {
1226
1244
  const active = this.activeConversationRuns.get(conversationId);
1227
1245
  if (active) active.runId = event.runId;
1228
1246
  }
1229
- this.eventSink(conversationId, event);
1247
+ // Await so the event is fully sunk before the next step's events,
1248
+ // matching every other eventSink call site (the callback run path
1249
+ // was the lone fire-and-forget exception).
1250
+ await this.eventSink(conversationId, event);
1230
1251
  },
1231
1252
  });
1232
1253
  flushTurnDraft(execution.draft);
@@ -1436,11 +1457,6 @@ export class AgentOrchestrator {
1436
1457
  }
1437
1458
 
1438
1459
  this.activeSubagentRuns.delete(conversationId);
1439
- await this.eventSink(parentConversationId, {
1440
- type: "subagent:completed",
1441
- subagentId: conversationId,
1442
- conversationId,
1443
- });
1444
1460
 
1445
1461
  let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
1446
1462
  if (!gathered) {
@@ -1464,8 +1480,20 @@ export class AgentOrchestrator {
1464
1480
  : {}),
1465
1481
  timestamp: Date.now(),
1466
1482
  };
1467
- await this.conversationStore.appendSubagentResult(parentConversationId, result);
1483
+ // Persist before emitting completion (see runSubagent).
1484
+ await this.appendSubagentResultReliable(parentConversationId, result);
1485
+ }
1468
1486
 
1487
+ await this.eventSink(parentConversationId, {
1488
+ type: "subagent:completed",
1489
+ subagentId: conversationId,
1490
+ conversationId,
1491
+ task,
1492
+ parentToolCallId: conversation.subagentMeta?.parentToolCallId,
1493
+ resultText: subagentResponse,
1494
+ });
1495
+
1496
+ if (parentConv) {
1469
1497
  if (this.isServerless) {
1470
1498
  this.hooks!.dispatchBackground!("subagent-callback", parentConversationId);
1471
1499
  } else {
@@ -1490,12 +1518,6 @@ export class AgentOrchestrator {
1490
1518
  await this.conversationStore.update(conv);
1491
1519
  }
1492
1520
 
1493
- await this.eventSink(conversation.parentConversationId!, {
1494
- type: "subagent:completed",
1495
- subagentId: conversationId,
1496
- conversationId,
1497
- });
1498
-
1499
1521
  const parentConv = await this.conversationStore.get(conversation.parentConversationId!);
1500
1522
  if (parentConv) {
1501
1523
  const result: PendingSubagentResult = {
@@ -1505,11 +1527,25 @@ export class AgentOrchestrator {
1505
1527
  error: { code: "CONTINUATION_ERROR", message: err instanceof Error ? err.message : String(err) },
1506
1528
  timestamp: Date.now(),
1507
1529
  };
1508
- await this.conversationStore.appendSubagentResult(conversation.parentConversationId!, result);
1530
+ // Persist before emitting; never swallow (was `.catch(() => {})`).
1531
+ await this.appendSubagentResultReliable(conversation.parentConversationId!, result);
1532
+ }
1533
+
1534
+ await this.eventSink(conversation.parentConversationId!, {
1535
+ type: "subagent:completed",
1536
+ subagentId: conversationId,
1537
+ conversationId,
1538
+ task,
1539
+ parentToolCallId: conversation.subagentMeta?.parentToolCallId,
1540
+ });
1541
+
1542
+ if (parentConv) {
1509
1543
  if (this.isServerless) {
1510
1544
  this.hooks!.dispatchBackground!("subagent-callback", conversation.parentConversationId!);
1511
1545
  } else {
1512
- this.processSubagentCallback(conversation.parentConversationId!).catch(() => {});
1546
+ this.processSubagentCallback(conversation.parentConversationId!).catch(err2 =>
1547
+ console.error(`[poncho][subagent] Continuation-error callback failed:`, err2 instanceof Error ? err2.message : err2),
1548
+ );
1513
1549
  }
1514
1550
  }
1515
1551
  }
@@ -1534,7 +1570,7 @@ export class AgentOrchestrator {
1534
1570
  opts.tenantId ?? null,
1535
1571
  {
1536
1572
  parentConversationId: opts.parentConversationId,
1537
- subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry },
1573
+ subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry, parentToolCallId: opts.parentToolCallId },
1538
1574
  messages: [{ role: "user", content: opts.task }],
1539
1575
  },
1540
1576
  );
@@ -1549,6 +1585,7 @@ export class AgentOrchestrator {
1549
1585
  subagentId: conversation.conversationId,
1550
1586
  conversationId: conversation.conversationId,
1551
1587
  task: opts.task,
1588
+ parentToolCallId: opts.parentToolCallId,
1552
1589
  });
1553
1590
 
1554
1591
  if (this.isServerless) {
@@ -1559,7 +1596,7 @@ export class AgentOrchestrator {
1559
1596
  opts.parentConversationId,
1560
1597
  opts.task,
1561
1598
  opts.ownerId,
1562
- ).catch(err => console.error(`[poncho][subagent] Background spawn failed:`, err instanceof Error ? err.message : err));
1599
+ ).catch(err => this.handleSpawnFailure(conversation.conversationId, opts.parentConversationId, opts.task, err));
1563
1600
  }
1564
1601
 
1565
1602
  return { subagentId: conversation.conversationId };
@@ -1596,7 +1633,7 @@ export class AgentOrchestrator {
1596
1633
  conversation.parentConversationId,
1597
1634
  message,
1598
1635
  conversation.ownerId,
1599
- ).catch(err => console.error(`[poncho][subagent] Background sendMessage failed:`, err instanceof Error ? err.message : err));
1636
+ ).catch(err => this.handleSpawnFailure(subagentId, conversation.parentConversationId!, message, err));
1600
1637
  }
1601
1638
 
1602
1639
  return { subagentId };
@@ -1684,6 +1721,79 @@ export class AgentOrchestrator {
1684
1721
 
1685
1722
  // ── Stale subagent recovery ──
1686
1723
 
1724
+ /**
1725
+ * Append a subagent result to its parent, retrying once on a transient
1726
+ * store failure before giving up loudly. A silently dropped result is the
1727
+ * worst subagent failure mode — the parent waits forever on a subagent it
1728
+ * thinks is still running — so this never swallows the error the way the
1729
+ * old `.catch(() => {})` call sites did. Returns whether the result landed.
1730
+ */
1731
+ private async appendSubagentResultReliable(
1732
+ parentConversationId: string,
1733
+ result: PendingSubagentResult,
1734
+ ): Promise<boolean> {
1735
+ try {
1736
+ await this.conversationStore.appendSubagentResult(parentConversationId, result);
1737
+ return true;
1738
+ } catch (firstErr) {
1739
+ try {
1740
+ await this.conversationStore.appendSubagentResult(parentConversationId, result);
1741
+ return true;
1742
+ } catch (secondErr) {
1743
+ console.error(
1744
+ `[poncho][subagent] FAILED to persist result for subagent ${result.subagentId} ` +
1745
+ `to parent ${parentConversationId} after 2 attempts — the parent will not see this result:`,
1746
+ secondErr instanceof Error ? secondErr.message : secondErr,
1747
+ `(first attempt: ${firstErr instanceof Error ? firstErr.message : firstErr})`,
1748
+ );
1749
+ return false;
1750
+ }
1751
+ }
1752
+ }
1753
+
1754
+ /**
1755
+ * A subagent's fire-and-forget background run rejected outside its own
1756
+ * error handling (e.g. it threw before entering its try block, or the
1757
+ * catch block itself threw). Without this the parent is left waiting on a
1758
+ * subagent that will never report back. Record the failure on the child
1759
+ * and hand the parent an error result so the turn can resume.
1760
+ */
1761
+ private async handleSpawnFailure(
1762
+ childConversationId: string,
1763
+ parentConversationId: string,
1764
+ task: string,
1765
+ err: unknown,
1766
+ ): Promise<void> {
1767
+ const message = err instanceof Error ? err.message : String(err);
1768
+ console.error(`[poncho][subagent] Background run failed for ${childConversationId}:`, message);
1769
+ try {
1770
+ const conv = await this.conversationStore.get(childConversationId);
1771
+ if (conv?.subagentMeta && conv.subagentMeta.status === "running") {
1772
+ conv.subagentMeta = {
1773
+ ...conv.subagentMeta,
1774
+ status: "error",
1775
+ error: { code: "SUBAGENT_SPAWN_FAILED", message },
1776
+ };
1777
+ conv.updatedAt = Date.now();
1778
+ await this.conversationStore.update(conv);
1779
+ }
1780
+ } catch {
1781
+ // best-effort: the result append below is what the parent actually needs
1782
+ }
1783
+ const appended = await this.appendSubagentResultReliable(parentConversationId, {
1784
+ subagentId: childConversationId,
1785
+ task,
1786
+ status: "error",
1787
+ error: { code: "SUBAGENT_SPAWN_FAILED", message },
1788
+ timestamp: Date.now(),
1789
+ });
1790
+ if (appended) {
1791
+ this.triggerParentCallback(parentConversationId).catch(e =>
1792
+ console.error(`[poncho][subagent] Parent callback failed after spawn failure:`, e instanceof Error ? e.message : e),
1793
+ );
1794
+ }
1795
+ }
1796
+
1687
1797
  async recoverStaleSubagents(): Promise<void> {
1688
1798
  const allSummaries = await this.conversationStore.listSummaries();
1689
1799
  const subagentSummaries = allSummaries.filter((s) => s.parentConversationId);
@@ -1711,11 +1821,26 @@ export class AgentOrchestrator {
1711
1821
  error: conv.subagentMeta.error,
1712
1822
  timestamp: Date.now(),
1713
1823
  };
1714
- await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
1824
+ await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
1715
1825
  parentsToCallback.add(conv.parentConversationId);
1716
1826
  }
1717
1827
  }
1718
1828
  }
1829
+
1830
+ // Also drain parents that already have results sitting in the store but
1831
+ // no active run to deliver them — e.g. a result persisted just before a
1832
+ // process restart, whose in-memory callback trigger was lost. Without
1833
+ // this the parent stays stuck even though its result landed durably.
1834
+ const parentIds = new Set(
1835
+ subagentSummaries.map(s => s.parentConversationId).filter((id): id is string => !!id),
1836
+ );
1837
+ for (const parentId of parentIds) {
1838
+ if (parentsToCallback.has(parentId)) continue;
1839
+ if (this.activeConversationRuns.has(parentId)) continue;
1840
+ const parent = await this.conversationStore.get(parentId);
1841
+ if (parent?.pendingSubagentResults?.length) parentsToCallback.add(parentId);
1842
+ }
1843
+
1719
1844
  for (const parentId of parentsToCallback) {
1720
1845
  this.processSubagentCallback(parentId).catch(err =>
1721
1846
  console.error(`[poncho][subagent] Recovery callback failed for ${parentId}:`, err instanceof Error ? err.message : err),
package/src/state.ts CHANGED
@@ -79,6 +79,9 @@ export interface Conversation {
79
79
  * subagent's runs emit no telemetry (e.g. spawned from an incognito
80
80
  * turn). Read by the orchestrator's runSubagent / continuation. */
81
81
  suppressTelemetry?: boolean;
82
+ /** The parent's `spawn_subagent` tool call id — echoed onto subagent:*
83
+ * events so a client can attach subagent state to that tool's pill. */
84
+ parentToolCallId?: string;
82
85
  };
83
86
  channelMeta?: {
84
87
  platform: string;
@@ -36,12 +36,25 @@ export class PostgresEngine extends SqlStorageEngine {
36
36
  return rows as T[];
37
37
  },
38
38
  exec: async (sql: string): Promise<void> => {
39
- await this.sql.unsafe(sql);
39
+ // DDL is idempotent in our migrations (`CREATE TABLE IF NOT
40
+ // EXISTS`, etc.), so retrying on a stale-socket drop is
41
+ // safe — same idempotency as `query()` reads/writes.
42
+ await this.runWithRetry(() => this.sql.unsafe(sql));
40
43
  },
41
44
  transaction: async (fn: () => Promise<void>): Promise<void> => {
42
- await this.sql.begin(async () => {
45
+ // Transactions are inherently retry-safe at the
46
+ // CONNECTION_ENDED boundary: if the connection dies before
47
+ // BEGIN takes effect server-side, no work was committed and
48
+ // re-running `fn` produces the correct end state. The retry
49
+ // only catches the connection-level reject from the
50
+ // postgres.js client; a partial-commit + drop scenario
51
+ // surfaces as a different error code and bypasses the
52
+ // retry, preserving the caller's expectation that a
53
+ // returned transaction either fully committed or fully
54
+ // rolled back.
55
+ await this.runWithRetry(() => this.sql.begin(async () => {
43
56
  await fn();
44
- });
57
+ }));
45
58
  },
46
59
  };
47
60
  }
@@ -59,25 +72,34 @@ export class PostgresEngine extends SqlStorageEngine {
59
72
  prepare: false,
60
73
  // Connection-pool resilience. Managed Postgres providers
61
74
  // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
62
- // connections server-side after a few minutes. Without these
63
- // knobs, porsager/postgres keeps stale sockets in the pool;
64
- // the next query on one rejects with
65
- // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
66
- // as a hard failure to the caller. Two complementary settings:
75
+ // connections server-side after a few minutes and on
76
+ // Railway in particular, mid-stream drops within a few
77
+ // seconds of inactivity are common. Without these knobs,
78
+ // porsager/postgres keeps stale sockets in the pool; the
79
+ // next query on one rejects with
80
+ // `write CONNECTION_ENDED <host>:5432` at `durMs=0`,
81
+ // surfacing as a hard failure to the caller.
67
82
  //
68
- // - `idle_timeout: 20` closes idle connections client-side
69
- // after 20s, before any reasonable provider-side timer
70
- // fires. Fresh connection on next checkout = no stale
71
- // socket race.
72
- // - `max_lifetime: 600` (10 min) recycles long-lived
73
- // connections defensively even if they've stayed busy,
74
- // which sidesteps a separate class of provider-side
75
- // "max connection age" limits.
83
+ // - `idle_timeout: 5` closes idle connections client-side
84
+ // aggressively. Empirically Railway's pg drops sockets
85
+ // well before the 20s value that managed-provider docs
86
+ // suggest; 5s is short enough to win the race in
87
+ // practice while staying long enough that bursty
88
+ // workloads still get connection reuse.
89
+ // - `max_lifetime: 300` (5 min) recycles long-lived
90
+ // connections defensively. Even with idle_timeout, a
91
+ // connection that's been actively serving small queries
92
+ // for an hour can hit provider-side max-age limits.
93
+ // - `connect_timeout: 10` — slightly less patient on
94
+ // initial connect than the 30s default. Combined with
95
+ // the retry below, "connection refused" surfaces faster
96
+ // during incidents and the caller can shed load instead
97
+ // of stacking up.
76
98
  //
77
- // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
78
- // pool size + initial connect behavior unchanged.
79
- idle_timeout: 20,
80
- max_lifetime: 60 * 10,
99
+ // Pool size (`max: 10`) unchanged.
100
+ idle_timeout: 5,
101
+ max_lifetime: 60 * 5,
102
+ connect_timeout: 10,
81
103
  });
82
104
  }
83
105
 
@@ -147,33 +169,53 @@ export class PostgresEngine extends SqlStorageEngine {
147
169
  }
148
170
 
149
171
  /**
150
- * Single retry on a transient connection-layer failure. The
151
- * `idle_timeout` / `max_lifetime` config above prevents *most*
152
- * stale-connection cases, but a query can still race a
153
- * provider-initiated drop in flight the postgres.js client
154
- * rejects with `code: "CONNECTION_ENDED"` and the next attempt
155
- * checks out a fresh connection from the pool. One retry is
156
- * enough; if it fails again the host-side network is genuinely
157
- * broken and the caller should see the error.
172
+ * Retry on transient connection-layer failures. Three attempts
173
+ * with exponential-ish backoff (0, 50ms, 200ms) — the pool may
174
+ * have multiple stale sockets accumulated during an idle period
175
+ * (especially on managed Postgres after boot when no traffic
176
+ * has flowed for a while), so a single retry can land on a
177
+ * second stale socket and still fail. Three attempts virtually
178
+ * always exhausts the staleness wave; if all three throw, the
179
+ * failure is real and the caller should see it.
158
180
  *
159
- * Only retries reads + the standard exec/run paths in `query`;
160
- * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
161
- * `sql.begin(...)` transactions are unwrapped those are
162
- * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
163
- * atomically scoped (transactions roll back cleanly), and adding
164
- * a retry around them would complicate the transaction
165
- * semantics.
181
+ * Applied to every pg path the executor exposes:
182
+ * - `query()` (run/get/all) — natural retry: queries are
183
+ * idempotent at the connection-failure boundary because the
184
+ * server-side rollback runs cleanly on socket close.
185
+ * - `exec(sql)` for DDL — `CREATE TABLE IF NOT EXISTS` and
186
+ * friends are idempotent by construction.
187
+ * - `transaction(fn)` — only retried when the
188
+ * CONNECTION_ENDED reject arrives *before* the transaction
189
+ * body started executing on the connection; if it errors
190
+ * mid-transaction, the postgres.js client surfaces a
191
+ * different error class (the inner SQL error) and bypasses
192
+ * this retry, preserving the all-or-nothing semantics.
166
193
  */
167
194
  private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
168
- try {
169
- return await fn();
170
- } catch (err) {
171
- const code = (err as { code?: string } | null | undefined)?.code;
172
- if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
195
+ const backoffs = [0, 50, 200];
196
+ let lastErr: unknown;
197
+ for (let attempt = 0; attempt < backoffs.length; attempt++) {
198
+ if (backoffs[attempt] > 0) {
199
+ await new Promise((r) => setTimeout(r, backoffs[attempt]));
200
+ }
201
+ try {
173
202
  return await fn();
203
+ } catch (err) {
204
+ lastErr = err;
205
+ const code = (err as { code?: string } | null | undefined)?.code;
206
+ if (
207
+ code === "CONNECTION_ENDED" ||
208
+ code === "CONNECTION_CLOSED" ||
209
+ code === "CONNECTION_DESTROYED" ||
210
+ code === "CONNECT_TIMEOUT" ||
211
+ code === "ECONNRESET"
212
+ ) {
213
+ continue;
214
+ }
215
+ throw err;
174
216
  }
175
- throw err;
176
217
  }
218
+ throw lastErr;
177
219
  }
178
220
 
179
221
  private addToPathCache(tenantId: string, path: string): void {
@@ -40,6 +40,10 @@ export interface SubagentManager {
40
40
  /** Inherit the parent run's telemetry choice — when true, the subagent
41
41
  * run (and its re-runs) emit no telemetry. */
42
42
  suppressTelemetry?: boolean;
43
+ /** The id of the `spawn_subagent` tool call that produced this subagent,
44
+ * so its events can carry `parentToolCallId` and a client can attach
45
+ * subagent state to the spawning tool's pill. */
46
+ parentToolCallId?: string;
43
47
  }): Promise<SubagentSpawnResult>;
44
48
 
45
49
  sendMessage(subagentId: string, message: string): Promise<SubagentSpawnResult>;
@@ -46,6 +46,7 @@ export const createSubagentTools = (
46
46
  ownerId,
47
47
  tenantId: context.tenantId,
48
48
  suppressTelemetry: context.suppressTelemetry,
49
+ parentToolCallId: context.toolCallId,
49
50
  });
50
51
  return { subagentId, status: "running" };
51
52
  },
@@ -62,7 +62,10 @@ export class ToolDispatcher {
62
62
  }
63
63
 
64
64
  try {
65
- const output = await definition.handler(call.input, context);
65
+ // Per-call context: stamp the current tool call's id so handlers that
66
+ // spawn further work (spawn_subagent) can record `parentToolCallId`.
67
+ // A fresh object — `context` is shared across a batch, don't mutate it.
68
+ const output = await definition.handler(call.input, { ...context, toolCallId: call.id });
66
69
  if (context.abortSignal?.aborted) {
67
70
  return {
68
71
  callId: call.id,