npm - @poncho-ai/harness - Versions diffs - 0.52.1 → 0.53.0 - Mend

@poncho-ai/harness 0.52.1 → 0.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/.turbo/turbo-build.log +5 -5
package/CHANGELOG.md +106 -0
package/dist/index.d.ts +43 -15
package/dist/index.js +198 -82
package/package.json +2 -2
package/src/harness.ts +3 -1
package/src/orchestrator/orchestrator.ts +162 -37
package/src/state.ts +3 -0
package/src/storage/postgres-engine.ts +83 -41
package/src/subagent-manager.ts +4 -0
package/src/subagent-tools.ts +1 -0
package/src/tool-dispatcher.ts +4 -1

package/.turbo/turbo-build.log CHANGED Viewed

@@ -1,5 +1,5 @@
-> @poncho-ai/harness@0.52.1 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
+> @poncho-ai/harness@0.53.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
 > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
 [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
 CLI tsup v8.5.1
 CLI Target: es2022
 ESM Build start
-ESM dist/index.js            536.24 KB
 ESM dist/isolate-F2PPSUL6.js 53.82 KB
-ESM ⚡️ Build success in 235ms
+ESM dist/index.js            541.42 KB
+ESM ⚡️ Build success in 239ms
 DTS Build start
-DTS ⚡️ Build success in 8126ms
-DTS dist/index.d.ts 92.40 KB
+DTS ⚡️ Build success in 7708ms
+DTS dist/index.d.ts 94.04 KB

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,111 @@
 # @poncho-ai/harness
+## 0.53.0
+### Minor Changes
+- [#145](https://github.com/cesr/poncho-ai/pull/145) [`bfa4976`](https://github.com/cesr/poncho-ai/commit/bfa4976ac8b05a300e22271e23c3bae4aadae2a8) Thanks [@cesr](https://github.com/cesr)! - events: add stable identity so streaming clients match instead of guess
+  Additive fields that let a streaming client reconstruct view-state by
+  identity rather than inferring structure from event order (the source of a
+  class of reconnect/subagent rendering bugs):
+  - `tool:started` / `tool:completed` / `tool:error` now carry `toolCallId`
+    (already in scope as `call.id` / `result.callId`). Clients match tool
+    pills by id instead of by tool name.
+  - `subagent:spawned|completed|error|stopped` now carry `parentToolCallId`
+    (the `spawn_subagent` tool call's id) and `task`; `completed`/`error`
+    also carry `resultText`. Clients attach subagent state to the spawning
+    tool's pill and render the result inline — no header-regex or
+    sequential-cursor pairing needed.
+  - `ToolContext` gains `toolCallId` so the `spawn_subagent` handler can
+    record which call produced the subagent (plumbed: tool-dispatcher →
+    spawn handler → `SubagentSpawnOptions.parentToolCallId` →
+    `subagentMeta.parentToolCallId` → the events above).
+  - `run:started` gains an optional `cause` field in the type
+    (`user|continuation|subagent_callback|approval_resume`); emission is
+    deferred to a later pass.
+  All fields are additive; older clients ignore them.
+### Patch Changes
+- Updated dependencies [[`bfa4976`](https://github.com/cesr/poncho-ai/commit/bfa4976ac8b05a300e22271e23c3bae4aadae2a8)]:
+  - @poncho-ai/sdk@1.15.0
+## 0.52.2
+### Patch Changes
+- [#124](https://github.com/cesr/poncho-ai/pull/124) [`4ae26e0`](https://github.com/cesr/poncho-ai/commit/4ae26e0d8d2788f57411f9c17e10766769514f9b) Thanks [@cesr](https://github.com/cesr)! - harness: postgres retry covers exec/transaction + 3 attempts + tighter idle
+  Follow-up to the previous `idle_timeout`/`max_lifetime`/retry patch.
+  Live testing on Railway showed the previous values weren't tight
+  enough — `write CONNECTION_ENDED postgres.railway.internal:5432`
+  still surfaced both during user-facing chat turns and during
+  subagent auto-callback reruns, despite the new config and the
+  one-shot retry.
+  Two failure modes the previous version didn't cover:
+  1. The retry only wrapped `private query()` (executor.run/get/all),
+     but `executor.exec` (`sql.unsafe`) and `executor.transaction`
+     (`sql.begin`) called the postgres.js client directly. A pg drop
+     inside a transaction or migration write threw straight through.
+  2. After an idle period the pool can hold multiple stale sockets;
+     a single retry can check out a second stale socket from the pool
+     and fail again. The one-shot retry was then exhausted, and the
+     error surfaced to the caller.
+  Fixes:
+  - All three executor paths (`run/get/all`, `exec`, `transaction`)
+    now go through the same `runWithRetry` wrapper. Transactions
+    only retry the connection-level `CONNECTION_ENDED` reject from
+    the postgres.js client — actual SQL errors mid-transaction
+    surface as a different error class and bypass the retry,
+    preserving atomic semantics.
+  - Three attempts with light exponential backoff (0, 50ms, 200ms).
+    Enough to ride out a typical staleness wave; if all three fail
+    the network is genuinely broken.
+  - `CONNECT_TIMEOUT` and `ECONNRESET` added to the retry-eligible
+    error codes.
+  Config knobs tightened:
+  - `idle_timeout: 5` (was 20). Empirically Railway's pg drops
+    sockets well before 20s; 5s wins the race in practice while
+    staying long enough for bursty workloads to reuse connections.
+  - `max_lifetime: 300` (was 600). Same reasoning — recycle more
+    aggressively.
+  - `connect_timeout: 10` (was 30 default). Faster failure during
+    incidents lets callers shed load instead of stacking up.
+- [#144](https://github.com/cesr/poncho-ai/pull/144) [`28d640b`](https://github.com/cesr/poncho-ai/commit/28d640b2f82ea780f8e0be90965972d9903c01d7) Thanks [@cesr](https://github.com/cesr)! - orchestrator: make subagent result delivery reliable
+  Subagent results could silently never reach the parent agent. Several
+  plumbing bugs in `runSubagent` / `runSubagentContinuation`:
+  - **Emit-before-persist race.** `subagent:completed` / `subagent:error`
+    were emitted to the parent's event stream _before_ the result was
+    written to the store, so a consumer reacting to the event (the parent
+    callback, the streaming client) could race the write. Now the result
+    is persisted first, then the event is emitted.
+  - **Silently swallowed writes.** Two `appendSubagentResult(...).catch(() => {})`
+    call sites (the error path and the continuation-error path) dropped the
+    result with no trace on a transient store failure. Replaced with a
+    shared `appendSubagentResultReliable` helper that retries once and then
+    logs loudly — a dropped result is the worst failure mode (the parent
+    waits forever on a subagent it thinks is still running).
+  - **Un-awaited eventSink.** The subagent-callback run path was the lone
+    `this.eventSink(...)` call site that didn't `await` (every other site
+    does), so callback-turn events could interleave out of order. Now awaited.
+  - **Spawn rejections went to a bare `console.error`.** A background
+    `runSubagent` that rejected outside its own try/catch left the parent
+    hanging. Both fire-and-forget spawn paths now route to a
+    `handleSpawnFailure` that marks the child errored and hands the parent
+    an error result so the turn can resume.
+  - **`recoverStaleSubagents` now also drains undelivered results.** It
+    previously only rescued children stuck in `running`; it now also
+    re-triggers the parent callback for any parent that has results sitting
+    in the store with no active run (e.g. a result persisted just before a
+    process restart, whose in-memory callback trigger was lost).
 ## 0.52.1
 ### Patch Changes

package/dist/index.d.ts CHANGED Viewed

@@ -199,6 +199,9 @@ interface Conversation {
          *  subagent's runs emit no telemetry (e.g. spawned from an incognito
          *  turn). Read by the orchestrator's runSubagent / continuation. */
         suppressTelemetry?: boolean;
+        /** The parent's `spawn_subagent` tool call id — echoed onto subagent:*
+         *  events so a client can attach subagent state to that tool's pill. */
+        parentToolCallId?: string;
     };
     channelMeta?: {
         platform: string;
@@ -1190,6 +1193,10 @@ interface SubagentManager {
         /** Inherit the parent run's telemetry choice — when true, the subagent
          *  run (and its re-runs) emit no telemetry. */
         suppressTelemetry?: boolean;
+        /** The id of the `spawn_subagent` tool call that produced this subagent,
+         *  so its events can carry `parentToolCallId` and a client can attach
+         *  subagent state to the spawning tool's pill. */
+        parentToolCallId?: string;
     }): Promise<SubagentSpawnResult>;
     sendMessage(subagentId: string, message: string): Promise<SubagentSpawnResult>;
     stop(subagentId: string): Promise<void>;
@@ -1802,22 +1809,27 @@ declare class PostgresEngine extends SqlStorageEngine {
     private patchVfs;
     private query;
     /**
-     * Single retry on a transient connection-layer failure. The
-     * `idle_timeout` / `max_lifetime` config above prevents *most*
-     * stale-connection cases, but a query can still race a
-     * provider-initiated drop in flight — the postgres.js client
-     * rejects with `code: "CONNECTION_ENDED"` and the next attempt
-     * checks out a fresh connection from the pool. One retry is
-     * enough; if it fails again the host-side network is genuinely
-     * broken and the caller should see the error.
+     * Retry on transient connection-layer failures. Three attempts
+     * with exponential-ish backoff (0, 50ms, 200ms) — the pool may
+     * have multiple stale sockets accumulated during an idle period
+     * (especially on managed Postgres after boot when no traffic
+     * has flowed for a while), so a single retry can land on a
+     * second stale socket and still fail. Three attempts virtually
+     * always exhaust the staleness wave; if all three throw, the
+     * failure is real and the caller should see it.
      *
-     * Only retries reads + the standard exec/run paths in `query`;
-     * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
-     * `sql.begin(...)` transactions are unwrapped — those are
-     * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
-     * atomically scoped (transactions roll back cleanly), and adding
-     * a retry around them would complicate the transaction
-     * semantics.
+     * Applied to every pg path the executor exposes:
+     *  - `query()` (run/get/all)  — natural retry: queries are
+     *    idempotent at the connection-failure boundary because the
+     *    server-side rollback runs cleanly on socket close.
+     *  - `exec(sql)` for DDL      — `CREATE TABLE IF NOT EXISTS` and
+     *    friends are idempotent by construction.
+     *  - `transaction(fn)`        — retried only when the reject
+     *    carries a connection-level code (e.g. CONNECTION_ENDED);
+     *    any other error, such as an inner SQL error, is rethrown
+     *    immediately. NOTE(review): the wrapper does not check
+     *    whether the body had already started, so a connection
+     *    drop mid-transaction would re-run `fn` — confirm intent.
      */
     private runWithRetry;
     private addToPathCache;
@@ -2145,6 +2157,22 @@ declare class AgentOrchestrator {
     processSubagentCallback(conversationId: string, skipLockCheck?: boolean): Promise<void>;
     runSubagentContinuation(conversationId: string, conversation: Conversation, continuationMessages: Message[]): AsyncGenerator<AgentEvent>;
     createSubagentManager(): SubagentManager;
+    /**
+     * Append a subagent result to its parent, retrying once on a transient
+     * store failure before giving up loudly. A silently dropped result is the
+     * worst subagent failure mode — the parent waits forever on a subagent it
+     * thinks is still running — so this never swallows the error the way the
+     * old `.catch(() => {})` call sites did. Returns whether the result landed.
+     */
+    private appendSubagentResultReliable;
+    /**
+     * A subagent's fire-and-forget background run rejected outside its own
+     * error handling (e.g. it threw before entering its try block, or the
+     * catch block itself threw). Without this the parent is left waiting on a
+     * subagent that will never report back. Record the failure on the child
+     * and hand the parent an error result so the turn can resume.
+     */
+    private handleSpawnFailure;
     recoverStaleSubagents(): Promise<void>;
 }

package/dist/index.js CHANGED Viewed

@@ -4415,12 +4415,12 @@ var PostgresEngine = class extends SqlStorageEngine {
         return rows;
       },
       exec: async (sql) => {
-        await this.sql.unsafe(sql);
+        await this.runWithRetry(() => this.sql.unsafe(sql));
       },
       transaction: async (fn) => {
-        await this.sql.begin(async () => {
+        await this.runWithRetry(() => this.sql.begin(async () => {
           await fn();
-        });
+        }));
       }
     };
   }
@@ -4438,25 +4438,34 @@ var PostgresEngine = class extends SqlStorageEngine {
       prepare: false,
       // Connection-pool resilience. Managed Postgres providers
       // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
-      // connections server-side after a few minutes. Without these
-      // knobs, porsager/postgres keeps stale sockets in the pool;
-      // the next query on one rejects with
-      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
-      // as a hard failure to the caller. Two complementary settings:
+      // connections server-side after a few minutes — and on
+      // Railway in particular, mid-stream drops within a few
+      // seconds of inactivity are common. Without these knobs,
+      // porsager/postgres keeps stale sockets in the pool; the
+      // next query on one rejects with
+      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`,
+      // surfacing as a hard failure to the caller.
       //
-      //   - `idle_timeout: 20` closes idle connections client-side
-      //     after 20s, before any reasonable provider-side timer
-      //     fires. Fresh connection on next checkout = no stale
-      //     socket race.
-      //   - `max_lifetime: 600` (10 min) recycles long-lived
-      //     connections defensively even if they've stayed busy,
-      //     which sidesteps a separate class of provider-side
-      //     "max connection age" limits.
+      //   - `idle_timeout: 5` closes idle connections client-side
+      //     aggressively. Empirically Railway's pg drops sockets
+      //     well before the 20s value that managed-provider docs
+      //     suggest; 5s is short enough to win the race in
+      //     practice while staying long enough that bursty
+      //     workloads still get connection reuse.
+      //   - `max_lifetime: 300` (5 min) recycles long-lived
+      //     connections defensively. Even with idle_timeout, a
+      //     connection that's been actively serving small queries
+      //     for an hour can hit provider-side max-age limits.
+      //   - `connect_timeout: 10` — slightly less patient on
+      //     initial connect than the 30s default. Combined with
+      //     the retry below, "connection refused" surfaces faster
+      //     during incidents and the caller can shed load instead
+      //     of stacking up.
       //
-      // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
-      // pool size + initial connect behavior unchanged.
-      idle_timeout: 20,
-      max_lifetime: 60 * 10
+      // Pool size (`max: 10`) unchanged.
+      idle_timeout: 5,
+      max_lifetime: 60 * 5,
+      connect_timeout: 10
     });
   }
   async initialize() {
@@ -4505,33 +4514,47 @@ var PostgresEngine = class extends SqlStorageEngine {
     );
   }
   /**
-   * Single retry on a transient connection-layer failure. The
-   * `idle_timeout` / `max_lifetime` config above prevents *most*
-   * stale-connection cases, but a query can still race a
-   * provider-initiated drop in flight — the postgres.js client
-   * rejects with `code: "CONNECTION_ENDED"` and the next attempt
-   * checks out a fresh connection from the pool. One retry is
-   * enough; if it fails again the host-side network is genuinely
-   * broken and the caller should see the error.
+   * Retry on transient connection-layer failures. Three attempts
+   * with exponential-ish backoff (0, 50ms, 200ms) — the pool may
+   * have multiple stale sockets accumulated during an idle period
+   * (especially on managed Postgres after boot when no traffic
+   * has flowed for a while), so a single retry can land on a
+   * second stale socket and still fail. Three attempts virtually
+   * always exhaust the staleness wave; if all three throw, the
+   * failure is real and the caller should see it.
    *
-   * Only retries reads + the standard exec/run paths in `query`;
-   * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
-   * `sql.begin(...)` transactions are unwrapped — those are
-   * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
-   * atomically scoped (transactions roll back cleanly), and adding
-   * a retry around them would complicate the transaction
-   * semantics.
+   * Applied to every pg path the executor exposes:
+   *  - `query()` (run/get/all)  — natural retry: queries are
+   *    idempotent at the connection-failure boundary because the
+   *    server-side rollback runs cleanly on socket close.
+   *  - `exec(sql)` for DDL      — `CREATE TABLE IF NOT EXISTS` and
+   *    friends are idempotent by construction.
+   *  - `transaction(fn)`        — retried only when the reject
+   *    carries a connection-level code (e.g. CONNECTION_ENDED);
+   *    any other error, such as an inner SQL error, is rethrown
+   *    immediately. NOTE(review): the wrapper does not check
+   *    whether the body had already started, so a connection
+   *    drop mid-transaction would re-run `fn` — confirm intent.
    */
   async runWithRetry(fn) {
-    try {
-      return await fn();
-    } catch (err) {
-      const code = err?.code;
-      if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
+    const backoffs = [0, 50, 200];
+    let lastErr;
+    for (let attempt = 0; attempt < backoffs.length; attempt++) {
+      if (backoffs[attempt] > 0) {
+        await new Promise((r) => setTimeout(r, backoffs[attempt]));
+      }
+      try {
         return await fn();
+      } catch (err) {
+        lastErr = err;
+        const code = err?.code;
+        if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED" || code === "CONNECT_TIMEOUT" || code === "ECONNRESET") {
+          continue;
+        }
+        throw err;
       }
-      throw err;
     }
+    throw lastErr;
   }
   addToPathCache(tenantId, path) {
     const paths = this.pathCache.get(tenantId);
@@ -8300,7 +8323,8 @@ var createSubagentTools = (manager) => [
         parentConversationId: conversationId,
         ownerId,
         tenantId: context.tenantId,
-        suppressTelemetry: context.suppressTelemetry
+        suppressTelemetry: context.suppressTelemetry,
+        parentToolCallId: context.toolCallId
       });
       return { subagentId, status: "running" };
     }
@@ -8562,7 +8586,7 @@ var ToolDispatcher = class {
       };
     }
     try {
-      const output = await definition.handler(call.input, context);
+      const output = await definition.handler(call.input, { ...context, toolCallId: call.id });
       if (context.abortSignal?.aborted) {
         return {
           callId: call.id,
@@ -11147,7 +11171,7 @@ ${textContent}` };
               return;
             }
             const runtimeToolName = exposedToolNames.get(call.name) ?? call.name;
-            yield pushEvent({ type: "tool:started", tool: runtimeToolName, input: call.input });
+            yield pushEvent({ type: "tool:started", tool: runtimeToolName, toolCallId: call.id, input: call.input });
             if (this.requiresApprovalForToolCall(runtimeToolName, call.input)) {
               approvalNeeded.push({
                 approvalId: `approval_${randomUUID5()}`,
@@ -11340,6 +11364,7 @@ ${textContent}` };
               yield pushEvent({
                 type: "tool:error",
                 tool: result2.tool,
+                toolCallId: result2.callId,
                 error: result2.error,
                 recoverable: true
               });
@@ -11381,6 +11406,7 @@ ${textContent}` };
               yield pushEvent({
                 type: "tool:completed",
                 tool: result2.tool,
+                toolCallId: result2.callId,
                 input: callInputMap.get(result2.callId),
                 output: result2.output,
                 duration: now() - batchStart,
@@ -12803,11 +12829,14 @@ var AgentOrchestrator = class {
       result: { status: "completed", response: responseText, steps: 0, tokens: { input: 0, output: 0, cached: 0 }, duration: 0 },
       timestamp: Date.now()
     };
-    await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
+    await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
     await this.eventSink(conv.parentConversationId, {
       type: "subagent:completed",
       subagentId,
-      conversationId: subagentId
+      conversationId: subagentId,
+      task: conv.subagentMeta?.task ?? conv.title,
+      parentToolCallId: conv.subagentMeta?.parentToolCallId,
+      resultText: responseText
     });
     await this.triggerParentCallback(conv.parentConversationId);
   }
@@ -12883,9 +12912,11 @@ var AgentOrchestrator = class {
     let latestRunId = "";
     let runResult;
     let runError;
+    let parentToolCallId;
     try {
       const conversation = await this.conversationStore.getWithArchive(childConversationId);
       if (!conversation) throw new Error("Subagent conversation not found");
+      parentToolCallId = conversation.subagentMeta?.parentToolCallId;
       if (conversation.subagentMeta?.status === "stopped") return;
       conversation.lastActivityAt = Date.now();
       await this.conversationStore.update(conversation);
@@ -13077,12 +13108,6 @@ var AgentOrchestrator = class {
         };
         await this.conversationStore.update(conv);
       }
-      this.hooks?.onStreamEnd?.(childConversationId);
-      await this.eventSink(parentConversationId, {
-        type: "subagent:completed",
-        subagentId: childConversationId,
-        conversationId: childConversationId
-      });
       let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
       if (!gathered) {
         const freshSubConv = await this.conversationStore.get(childConversationId);
@@ -13104,7 +13129,16 @@ var AgentOrchestrator = class {
         ...abnormal ? { error: { code: runError?.code ?? "SUBAGENT_INCOMPLETE", message: runError?.message ?? "subagent ended without a result" } } : {},
         timestamp: Date.now()
       };
-      await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult);
+      await this.appendSubagentResultReliable(parentConversationId, pendingResult);
+      this.hooks?.onStreamEnd?.(childConversationId);
+      await this.eventSink(parentConversationId, {
+        type: "subagent:completed",
+        subagentId: childConversationId,
+        conversationId: childConversationId,
+        task,
+        parentToolCallId,
+        resultText: subagentResponse
+      });
       this.triggerParentCallback(parentConversationId).catch(
         (err) => console.error(`[poncho][subagent] Parent callback failed:`, err instanceof Error ? err.message : err)
       );
@@ -13121,13 +13155,6 @@ var AgentOrchestrator = class {
         conv.updatedAt = Date.now();
         await this.conversationStore.update(conv);
       }
-      this.hooks?.onStreamEnd?.(childConversationId);
-      await this.eventSink(parentConversationId, {
-        type: "subagent:error",
-        subagentId: childConversationId,
-        conversationId: childConversationId,
-        error: errMsg
-      });
       const pendingResult = {
         subagentId: childConversationId,
         task,
@@ -13135,7 +13162,15 @@ var AgentOrchestrator = class {
         error: { code: "SUBAGENT_ERROR", message: errMsg },
         timestamp: Date.now()
       };
-      await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult).catch(() => {
+      await this.appendSubagentResultReliable(parentConversationId, pendingResult);
+      this.hooks?.onStreamEnd?.(childConversationId);
+      await this.eventSink(parentConversationId, {
+        type: "subagent:error",
+        subagentId: childConversationId,
+        conversationId: childConversationId,
+        error: errMsg,
+        task,
+        parentToolCallId
       });
       this.triggerParentCallback(parentConversationId).catch(
         (err2) => console.error(`[poncho][subagent] Parent callback failed:`, err2 instanceof Error ? err2.message : err2)
@@ -13251,12 +13286,12 @@ ${resultBody}`,
         },
         initialContextTokens: conversation.contextTokens ?? 0,
         initialContextWindow: conversation.contextWindow ?? 0,
-        onEvent: (event) => {
+        onEvent: async (event) => {
           if (event.type === "run:started") {
             const active = this.activeConversationRuns.get(conversationId);
             if (active) active.runId = event.runId;
           }
-          this.eventSink(conversationId, event);
+          await this.eventSink(conversationId, event);
         }
       });
       flushTurnDraft(execution.draft);
@@ -13442,11 +13477,6 @@ ${resultBody}`,
         await this.conversationStore.update(conv);
       }
       this.activeSubagentRuns.delete(conversationId);
-      await this.eventSink(parentConversationId, {
-        type: "subagent:completed",
-        subagentId: conversationId,
-        conversationId
-      });
       let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
       if (!gathered) {
         const freshSubConv = await this.conversationStore.get(conversationId);
@@ -13464,7 +13494,17 @@ ${resultBody}`,
           ...abnormal ? { error: { code: runError?.code ?? "SUBAGENT_INCOMPLETE", message: runError?.message ?? "subagent ended without a result" } } : {},
           timestamp: Date.now()
         };
-        await this.conversationStore.appendSubagentResult(parentConversationId, result);
+        await this.appendSubagentResultReliable(parentConversationId, result);
+      }
+      await this.eventSink(parentConversationId, {
+        type: "subagent:completed",
+        subagentId: conversationId,
+        conversationId,
+        task,
+        parentToolCallId: conversation.subagentMeta?.parentToolCallId,
+        resultText: subagentResponse
+      });
+      if (parentConv) {
         if (this.isServerless) {
           this.hooks.dispatchBackground("subagent-callback", parentConversationId);
         } else {
@@ -13492,11 +13532,6 @@ ${resultBody}`,
         conv.updatedAt = Date.now();
         await this.conversationStore.update(conv);
       }
-      await this.eventSink(conversation.parentConversationId, {
-        type: "subagent:completed",
-        subagentId: conversationId,
-        conversationId
-      });
       const parentConv = await this.conversationStore.get(conversation.parentConversationId);
       if (parentConv) {
         const result = {
@@ -13506,12 +13541,22 @@ ${resultBody}`,
           error: { code: "CONTINUATION_ERROR", message: err instanceof Error ? err.message : String(err) },
           timestamp: Date.now()
         };
-        await this.conversationStore.appendSubagentResult(conversation.parentConversationId, result);
+        await this.appendSubagentResultReliable(conversation.parentConversationId, result);
+      }
+      await this.eventSink(conversation.parentConversationId, {
+        type: "subagent:completed",
+        subagentId: conversationId,
+        conversationId,
+        task,
+        parentToolCallId: conversation.subagentMeta?.parentToolCallId
+      });
+      if (parentConv) {
         if (this.isServerless) {
           this.hooks.dispatchBackground("subagent-callback", conversation.parentConversationId);
         } else {
-          this.processSubagentCallback(conversation.parentConversationId).catch(() => {
-          });
+          this.processSubagentCallback(conversation.parentConversationId).catch(
+            (err2) => console.error(`[poncho][subagent] Continuation-error callback failed:`, err2 instanceof Error ? err2.message : err2)
+          );
         }
       }
     }
@@ -13533,7 +13578,7 @@ ${resultBody}`,
           opts.tenantId ?? null,
           {
             parentConversationId: opts.parentConversationId,
-            subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry },
+            subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry, parentToolCallId: opts.parentToolCallId },
             messages: [{ role: "user", content: opts.task }]
           }
         );
@@ -13545,7 +13590,8 @@ ${resultBody}`,
           type: "subagent:spawned",
           subagentId: conversation.conversationId,
           conversationId: conversation.conversationId,
-          task: opts.task
+          task: opts.task,
+          parentToolCallId: opts.parentToolCallId
         });
         if (this.isServerless) {
           this.hooks.dispatchBackground("subagent-run", conversation.conversationId);
@@ -13555,7 +13601,7 @@ ${resultBody}`,
             opts.parentConversationId,
             opts.task,
             opts.ownerId
-          ).catch((err) => console.error(`[poncho][subagent] Background spawn failed:`, err instanceof Error ? err.message : err));
+          ).catch((err) => this.handleSpawnFailure(conversation.conversationId, opts.parentConversationId, opts.task, err));
         }
         return { subagentId: conversation.conversationId };
       },
@@ -13588,7 +13634,7 @@ ${resultBody}`,
             conversation.parentConversationId,
             message,
             conversation.ownerId
-          ).catch((err) => console.error(`[poncho][subagent] Background sendMessage failed:`, err instanceof Error ? err.message : err));
+          ).catch((err) => this.handleSpawnFailure(subagentId, conversation.parentConversationId, message, err));
         }
         return { subagentId };
       },
@@ -13667,6 +13713,67 @@ ${resultBody}`,
     };
   }
   // ── Stale subagent recovery ──
+  /**
+   * Append a subagent result to its parent, retrying once on a transient
+   * store failure before giving up loudly. A silently dropped result is the
+   * worst subagent failure mode — the parent waits forever on a subagent it
+   * thinks is still running — so this never swallows the error the way the
+   * old `.catch(() => {})` call sites did. Returns whether the result landed.
+   *
+   * The retry is unconditional and immediate: any rejection from the first
+   * `appendSubagentResult` call triggers exactly one more call with the same
+   * arguments, with no delay and no inspection of the error. Store failures
+   * are never rethrown; after the second rejection both error messages are
+   * written to `console.error` and the method resolves to `false`.
+   *
+   * NOTE(review): if the store's append is not idempotent and the first
+   * attempt rejected after its write had already landed, the retry could
+   * record the result twice. Confirm against the store implementation.
+   *
+   * @param parentConversationId - conversation the result is appended to.
+   * @param result - pending result to persist; `result.subagentId` is only
+   *   read here for the failure log line.
+   * @returns `true` if either attempt resolved, `false` if both rejected.
+   */
+  async appendSubagentResultReliable(parentConversationId, result) {
+    try {
+      await this.conversationStore.appendSubagentResult(parentConversationId, result);
+      return true;
+    } catch (firstErr) { // first attempt rejected: try exactly once more, right away
+      try {
+        await this.conversationStore.appendSubagentResult(parentConversationId, result);
+        return true;
+      } catch (secondErr) { // both attempts rejected: log loudly and report failure
+        console.error(
+          `[poncho][subagent] FAILED to persist result for subagent ${result.subagentId} to parent ${parentConversationId} after 2 attempts \u2014 the parent will not see this result:`,
+          secondErr instanceof Error ? secondErr.message : secondErr,
+          `(first attempt: ${firstErr instanceof Error ? firstErr.message : firstErr})`
+        );
+        return false; // the caller decides what to do when the result did not land
+      }
+    }
+  }
+  /**
+   * A subagent's fire-and-forget background run rejected outside its own
+   * error handling (e.g. it threw before entering its try block, or the
+   * catch block itself threw). Without this the parent is left waiting on a
+   * subagent that will never report back. Record the failure on the child
+   * and hand the parent an error result so the turn can resume.
+   *
+   * Steps, in order:
+   *  1. Log the failure message to `console.error`.
+   *  2. Best-effort: load the child conversation and, only if its
+   *     `subagentMeta.status` is still `"running"`, mark it `"error"` with
+   *     code `SUBAGENT_SPAWN_FAILED` and bump `updatedAt`. Any error thrown
+   *     while doing so is ignored.
+   *  3. Append an error result (same code and message) to the parent via
+   *     `appendSubagentResultReliable`.
+   *  4. Only if that append landed, start `triggerParentCallback` without
+   *     awaiting it; a rejection there is logged, not rethrown.
+   *
+   * NOTE(review): this path does not call `eventSink`, so no `subagent:error`
+   * event is emitted for the failure. Confirm streaming clients do not rely
+   * on one.
+   *
+   * @param childConversationId - conversation id of the subagent whose run failed.
+   * @param parentConversationId - conversation that receives the error result.
+   * @param task - task text copied into the error result.
+   * @param err - rejection reason; `err.message` is used for `Error` values,
+   *   `String(err)` otherwise.
+   */
+  async handleSpawnFailure(childConversationId, parentConversationId, task, err) {
+    const message = err instanceof Error ? err.message : String(err);
+    console.error(`[poncho][subagent] Background run failed for ${childConversationId}:`, message);
+    try {
+      const conv = await this.conversationStore.get(childConversationId);
+      if (conv?.subagentMeta && conv.subagentMeta.status === "running") { // leave any other recorded status untouched
+        conv.subagentMeta = {
+          ...conv.subagentMeta,
+          status: "error",
+          error: { code: "SUBAGENT_SPAWN_FAILED", message }
+        };
+        conv.updatedAt = Date.now();
+        await this.conversationStore.update(conv);
+      }
+    } catch { // deliberately ignored: the child-side bookkeeping is best-effort, the append below still runs
+    }
+    const appended = await this.appendSubagentResultReliable(parentConversationId, {
+      subagentId: childConversationId,
+      task,
+      status: "error",
+      error: { code: "SUBAGENT_SPAWN_FAILED", message },
+      timestamp: Date.now()
+    });
+    if (appended) { // skip the callback when nothing was persisted for the parent
+      this.triggerParentCallback(parentConversationId).catch(
+        (e) => console.error(`[poncho][subagent] Parent callback failed after spawn failure:`, e instanceof Error ? e.message : e)
+      );
+    }
+  }
   async recoverStaleSubagents() {
     const allSummaries = await this.conversationStore.listSummaries();
     const subagentSummaries = allSummaries.filter((s) => s.parentConversationId);
@@ -13692,11 +13799,20 @@ ${resultBody}`,
             error: conv.subagentMeta.error,
             timestamp: Date.now()
           };
-          await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
+          await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
           parentsToCallback.add(conv.parentConversationId);
         }
       }
     }
+    const parentIds = new Set(
+      subagentSummaries.map((s) => s.parentConversationId).filter((id) => !!id)
+    );
+    for (const parentId of parentIds) {
+      if (parentsToCallback.has(parentId)) continue;
+      if (this.activeConversationRuns.has(parentId)) continue;
+      const parent = await this.conversationStore.get(parentId);
+      if (parent?.pendingSubagentResults?.length) parentsToCallback.add(parentId);
+    }
     for (const parentId of parentsToCallback) {
       this.processSubagentCallback(parentId).catch(
         (err) => console.error(`[poncho][subagent] Recovery callback failed for ${parentId}:`, err instanceof Error ? err.message : err)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@poncho-ai/harness",
-  "version": "0.52.1",
+  "version": "0.53.0",
   "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
   "repository": {
     "type": "git",
@@ -34,7 +34,7 @@
     "mustache": "^4.2.0",
     "yaml": "^2.4.0",
     "zod": "^3.22.0",
-    "@poncho-ai/sdk": "1.14.0"
+    "@poncho-ai/sdk": "1.15.0"
   },
   "peerDependencies": {
     "esbuild": ">=0.17.0",

package/src/harness.ts CHANGED Viewed

@@ -3344,7 +3344,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
           return;
         }
         const runtimeToolName = exposedToolNames.get(call.name) ?? call.name;
-        yield pushEvent({ type: "tool:started", tool: runtimeToolName, input: call.input });
+        yield pushEvent({ type: "tool:started", tool: runtimeToolName, toolCallId: call.id, input: call.input });
         if (this.requiresApprovalForToolCall(runtimeToolName, call.input)) {
           approvalNeeded.push({
             approvalId: `approval_${randomUUID()}`,
@@ -3563,6 +3563,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
           yield pushEvent({
             type: "tool:error",
             tool: result.tool,
+            toolCallId: result.callId,
             error: result.error,
             recoverable: true,
           });
@@ -3604,6 +3605,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
           yield pushEvent({
             type: "tool:completed",
             tool: result.tool,
+            toolCallId: result.callId,
             input: callInputMap.get(result.callId),
             output: result.output,
             duration: now() - batchStart,

package/src/orchestrator/orchestrator.ts CHANGED Viewed

@@ -693,12 +693,15 @@ export class AgentOrchestrator {
       result: { status: "completed", response: responseText, steps: 0, tokens: { input: 0, output: 0, cached: 0 }, duration: 0 },
       timestamp: Date.now(),
     };
-    await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
+    await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
     await this.eventSink(conv.parentConversationId, {
       type: "subagent:completed",
       subagentId,
       conversationId: subagentId,
+      task: conv.subagentMeta?.task ?? conv.title,
+      parentToolCallId: conv.subagentMeta?.parentToolCallId,
+      resultText: responseText,
     });
     await this.triggerParentCallback(conv.parentConversationId);
@@ -796,10 +799,14 @@ export class AgentOrchestrator {
     let latestRunId = "";
     let runResult: { status: "completed" | "error" | "cancelled"; response?: string; steps: number; duration: number; continuation?: boolean; continuationMessages?: Message[] } | undefined;
     let runError: { code?: string; message?: string } | undefined;
+    // The spawning tool call's id — echoed onto subagent:* events so the
+    // client can attach subagent state to that tool's pill.
+    let parentToolCallId: string | undefined;
     try {
       const conversation = await this.conversationStore.getWithArchive(childConversationId);
       if (!conversation) throw new Error("Subagent conversation not found");
+      parentToolCallId = conversation.subagentMeta?.parentToolCallId;
       if (conversation.subagentMeta?.status === "stopped") return;
@@ -1012,13 +1019,6 @@ export class AgentOrchestrator {
         await this.conversationStore.update(conv);
       }
-      this.hooks?.onStreamEnd?.(childConversationId);
-      await this.eventSink(parentConversationId, {
-        type: "subagent:completed",
-        subagentId: childConversationId,
-        conversationId: childConversationId,
-      });
       // Recover the subagent's real output: prefer the run response, then the
       // streamed draft, then walk the transcript — discarding the synthetic
       // "[Error: ...]" placeholder at each step.
@@ -1051,7 +1051,21 @@ export class AgentOrchestrator {
           : {}),
         timestamp: Date.now(),
       };
-      await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult);
+      // Persist the result BEFORE emitting subagent:completed: a consumer
+      // reacting to the event (the parent callback, the streaming client)
+      // must find the result already durable in the store, not race its write.
+      await this.appendSubagentResultReliable(parentConversationId, pendingResult);
+      this.hooks?.onStreamEnd?.(childConversationId);
+      await this.eventSink(parentConversationId, {
+        type: "subagent:completed",
+        subagentId: childConversationId,
+        conversationId: childConversationId,
+        task,
+        parentToolCallId,
+        resultText: subagentResponse,
+      });
       this.triggerParentCallback(parentConversationId).catch(err =>
         console.error(`[poncho][subagent] Parent callback failed:`, err instanceof Error ? err.message : err),
       );
@@ -1070,22 +1084,26 @@ export class AgentOrchestrator {
         await this.conversationStore.update(conv);
       }
+      const pendingResult: PendingSubagentResult = {
+        subagentId: childConversationId,
+        task,
+        status: "error",
+        error: { code: "SUBAGENT_ERROR", message: errMsg },
+        timestamp: Date.now(),
+      };
+      // Persist before emitting (see the success path); never swallow.
+      await this.appendSubagentResultReliable(parentConversationId, pendingResult);
       this.hooks?.onStreamEnd?.(childConversationId);
       await this.eventSink(parentConversationId, {
         type: "subagent:error",
         subagentId: childConversationId,
         conversationId: childConversationId,
         error: errMsg,
+        task,
+        parentToolCallId,
       });
-      const pendingResult: PendingSubagentResult = {
-        subagentId: childConversationId,
-        task,
-        status: "error",
-        error: { code: "SUBAGENT_ERROR", message: errMsg },
-        timestamp: Date.now(),
-      };
-      await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult).catch(() => {});
       this.triggerParentCallback(parentConversationId).catch(err2 =>
         console.error(`[poncho][subagent] Parent callback failed:`, err2 instanceof Error ? err2.message : err2),
       );
@@ -1221,12 +1239,15 @@ export class AgentOrchestrator {
         },
         initialContextTokens: conversation.contextTokens ?? 0,
         initialContextWindow: conversation.contextWindow ?? 0,
-        onEvent: (event) => {
+        onEvent: async (event) => {
           if (event.type === "run:started") {
             const active = this.activeConversationRuns.get(conversationId);
             if (active) active.runId = event.runId;
           }
-          this.eventSink(conversationId, event);
+          // Await so the event is fully sunk before the next step's events,
+          // matching every other eventSink call site (the callback run path
+          // was the lone fire-and-forget exception).
+          await this.eventSink(conversationId, event);
         },
       });
       flushTurnDraft(execution.draft);
@@ -1436,11 +1457,6 @@ export class AgentOrchestrator {
       }
       this.activeSubagentRuns.delete(conversationId);
-      await this.eventSink(parentConversationId, {
-        type: "subagent:completed",
-        subagentId: conversationId,
-        conversationId,
-      });
       let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
       if (!gathered) {
@@ -1464,8 +1480,20 @@ export class AgentOrchestrator {
             : {}),
           timestamp: Date.now(),
         };
-        await this.conversationStore.appendSubagentResult(parentConversationId, result);
+        // Persist before emitting completion (see runSubagent).
+        await this.appendSubagentResultReliable(parentConversationId, result);
+      }
+      await this.eventSink(parentConversationId, {
+        type: "subagent:completed",
+        subagentId: conversationId,
+        conversationId,
+        task,
+        parentToolCallId: conversation.subagentMeta?.parentToolCallId,
+        resultText: subagentResponse,
+      });
+      if (parentConv) {
         if (this.isServerless) {
           this.hooks!.dispatchBackground!("subagent-callback", parentConversationId);
         } else {
@@ -1490,12 +1518,6 @@ export class AgentOrchestrator {
         await this.conversationStore.update(conv);
       }
-      await this.eventSink(conversation.parentConversationId!, {
-        type: "subagent:completed",
-        subagentId: conversationId,
-        conversationId,
-      });
       const parentConv = await this.conversationStore.get(conversation.parentConversationId!);
       if (parentConv) {
         const result: PendingSubagentResult = {
@@ -1505,11 +1527,25 @@ export class AgentOrchestrator {
           error: { code: "CONTINUATION_ERROR", message: err instanceof Error ? err.message : String(err) },
           timestamp: Date.now(),
         };
-        await this.conversationStore.appendSubagentResult(conversation.parentConversationId!, result);
+        // Persist before emitting; never swallow (was `.catch(() => {})`).
+        await this.appendSubagentResultReliable(conversation.parentConversationId!, result);
+      }
+      await this.eventSink(conversation.parentConversationId!, {
+        type: "subagent:completed",
+        subagentId: conversationId,
+        conversationId,
+        task,
+        parentToolCallId: conversation.subagentMeta?.parentToolCallId,
+      });
+      if (parentConv) {
         if (this.isServerless) {
           this.hooks!.dispatchBackground!("subagent-callback", conversation.parentConversationId!);
         } else {
-          this.processSubagentCallback(conversation.parentConversationId!).catch(() => {});
+          this.processSubagentCallback(conversation.parentConversationId!).catch(err2 =>
+            console.error(`[poncho][subagent] Continuation-error callback failed:`, err2 instanceof Error ? err2.message : err2),
+          );
         }
       }
     }
@@ -1534,7 +1570,7 @@ export class AgentOrchestrator {
           opts.tenantId ?? null,
           {
             parentConversationId: opts.parentConversationId,
-            subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry },
+            subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry, parentToolCallId: opts.parentToolCallId },
             messages: [{ role: "user", content: opts.task }],
           },
         );
@@ -1549,6 +1585,7 @@ export class AgentOrchestrator {
           subagentId: conversation.conversationId,
           conversationId: conversation.conversationId,
           task: opts.task,
+          parentToolCallId: opts.parentToolCallId,
         });
         if (this.isServerless) {
@@ -1559,7 +1596,7 @@ export class AgentOrchestrator {
             opts.parentConversationId,
             opts.task,
             opts.ownerId,
-          ).catch(err => console.error(`[poncho][subagent] Background spawn failed:`, err instanceof Error ? err.message : err));
+          ).catch(err => this.handleSpawnFailure(conversation.conversationId, opts.parentConversationId, opts.task, err));
         }
         return { subagentId: conversation.conversationId };
@@ -1596,7 +1633,7 @@ export class AgentOrchestrator {
             conversation.parentConversationId,
             message,
             conversation.ownerId,
-          ).catch(err => console.error(`[poncho][subagent] Background sendMessage failed:`, err instanceof Error ? err.message : err));
+          ).catch(err => this.handleSpawnFailure(subagentId, conversation.parentConversationId!, message, err));
         }
         return { subagentId };
@@ -1684,6 +1721,79 @@ export class AgentOrchestrator {
   // ── Stale subagent recovery ──
+  /**
+   * Append a subagent result to its parent, retrying once on a transient
+   * store failure before giving up loudly. A silently dropped result is the
+   * worst subagent failure mode — the parent waits forever on a subagent it
+   * thinks is still running — so this never swallows the error the way the
+   * old `.catch(() => {})` call sites did. Returns whether the result landed.
+   *
+   * Contract details:
+   *  - The second attempt runs for any rejection of the first one; the error
+   *    is not classified and there is no delay between the two calls.
+   *  - Store rejections are caught here and never propagate to the caller.
+   *    After two rejections the messages of both errors are logged with
+   *    `console.error` and the promise resolves to `false`.
+   *
+   * NOTE(review): a retry after a first attempt that failed only after the
+   * write had landed would append the same result twice unless the store
+   * de-duplicates. Confirm against the store implementation.
+   *
+   * @param parentConversationId - conversation the result is appended to.
+   * @param result - pending result to persist; `result.subagentId` is read
+   *   here only to build the failure log line.
+   * @returns `true` when one of the two attempts resolved, otherwise `false`.
+   */
+  private async appendSubagentResultReliable(
+    parentConversationId: string,
+    result: PendingSubagentResult,
+  ): Promise<boolean> {
+    try {
+      await this.conversationStore.appendSubagentResult(parentConversationId, result);
+      return true;
+    } catch (firstErr) { // attempt 1 rejected: retry once, immediately
+      try {
+        await this.conversationStore.appendSubagentResult(parentConversationId, result);
+        return true;
+      } catch (secondErr) { // attempt 2 rejected as well: report instead of throwing
+        console.error(
+          `[poncho][subagent] FAILED to persist result for subagent ${result.subagentId} ` +
+            `to parent ${parentConversationId} after 2 attempts — the parent will not see this result:`,
+          secondErr instanceof Error ? secondErr.message : secondErr,
+          `(first attempt: ${firstErr instanceof Error ? firstErr.message : firstErr})`,
+        );
+        return false; // signals "not persisted" to callers that check the result
+      }
+    }
+  }
+  /**
+   * A subagent's fire-and-forget background run rejected outside its own
+   * error handling (e.g. it threw before entering its try block, or the
+   * catch block itself threw). Without this the parent is left waiting on a
+   * subagent that will never report back. Record the failure on the child
+   * and hand the parent an error result so the turn can resume.
+   *
+   * What happens, in order:
+   *  1. The failure message is logged to `console.error`.
+   *  2. The child conversation is loaded and, only when its
+   *     `subagentMeta.status` is still `"running"`, rewritten to `"error"`
+   *     with code `SUBAGENT_SPAWN_FAILED` and a fresh `updatedAt`. Errors
+   *     thrown during this step are ignored.
+   *  3. An error result carrying the same code and message is appended to
+   *     the parent through `appendSubagentResultReliable`.
+   *  4. `triggerParentCallback` is started (not awaited) only when that
+   *     append reported success; its rejection is logged, not rethrown.
+   *
+   * NOTE(review): nothing here calls `eventSink`, so this failure produces
+   * no `subagent:error` event. Confirm streaming clients do not need one.
+   *
+   * @param childConversationId - conversation id of the subagent whose run failed.
+   * @param parentConversationId - conversation that receives the error result.
+   * @param task - task text copied into the error result.
+   * @param err - rejection reason; `err.message` for `Error` instances,
+   *   `String(err)` for anything else.
+   */
+  private async handleSpawnFailure(
+    childConversationId: string,
+    parentConversationId: string,
+    task: string,
+    err: unknown,
+  ): Promise<void> {
+    const message = err instanceof Error ? err.message : String(err);
+    console.error(`[poncho][subagent] Background run failed for ${childConversationId}:`, message);
+    try {
+      const conv = await this.conversationStore.get(childConversationId);
+      if (conv?.subagentMeta && conv.subagentMeta.status === "running") { // any other recorded status is left as-is
+        conv.subagentMeta = {
+          ...conv.subagentMeta,
+          status: "error",
+          error: { code: "SUBAGENT_SPAWN_FAILED", message },
+        };
+        conv.updatedAt = Date.now();
+        await this.conversationStore.update(conv);
+      }
+    } catch {
+      // best-effort: the result append below is what the parent actually needs
+    }
+    const appended = await this.appendSubagentResultReliable(parentConversationId, {
+      subagentId: childConversationId,
+      task,
+      status: "error",
+      error: { code: "SUBAGENT_SPAWN_FAILED", message },
+      timestamp: Date.now(),
+    });
+    if (appended) { // no callback when the error result was not persisted
+      this.triggerParentCallback(parentConversationId).catch(e =>
+        console.error(`[poncho][subagent] Parent callback failed after spawn failure:`, e instanceof Error ? e.message : e),
+      );
+    }
+  }
   async recoverStaleSubagents(): Promise<void> {
     const allSummaries = await this.conversationStore.listSummaries();
     const subagentSummaries = allSummaries.filter((s) => s.parentConversationId);
@@ -1711,11 +1821,26 @@ export class AgentOrchestrator {
             error: conv.subagentMeta.error,
             timestamp: Date.now(),
           };
-          await this.conversationStore.appendSubagentResult(conv.parentConversationId, pendingResult);
+          await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
           parentsToCallback.add(conv.parentConversationId);
         }
       }
     }
+    // Also drain parents that already have results sitting in the store but
+    // no active run to deliver them — e.g. a result persisted just before a
+    // process restart, whose in-memory callback trigger was lost. Without
+    // this the parent stays stuck even though its result landed durably.
+    const parentIds = new Set(
+      subagentSummaries.map(s => s.parentConversationId).filter((id): id is string => !!id),
+    );
+    for (const parentId of parentIds) {
+      if (parentsToCallback.has(parentId)) continue;
+      if (this.activeConversationRuns.has(parentId)) continue;
+      const parent = await this.conversationStore.get(parentId);
+      if (parent?.pendingSubagentResults?.length) parentsToCallback.add(parentId);
+    }
     for (const parentId of parentsToCallback) {
       this.processSubagentCallback(parentId).catch(err =>
         console.error(`[poncho][subagent] Recovery callback failed for ${parentId}:`, err instanceof Error ? err.message : err),

package/src/state.ts CHANGED Viewed

@@ -79,6 +79,9 @@ export interface Conversation {
      *  subagent's runs emit no telemetry (e.g. spawned from an incognito
      *  turn). Read by the orchestrator's runSubagent / continuation. */
     suppressTelemetry?: boolean;
+    /** The parent's `spawn_subagent` tool call id — echoed onto subagent:*
+     *  events so a client can attach subagent state to that tool's pill. */
+    parentToolCallId?: string;
   };
   channelMeta?: {
     platform: string;

package/src/storage/postgres-engine.ts CHANGED Viewed

@@ -36,12 +36,25 @@ export class PostgresEngine extends SqlStorageEngine {
         return rows as T[];
       },
       exec: async (sql: string): Promise<void> => {
-        await this.sql.unsafe(sql);
+        // DDL is idempotent in our migrations (`CREATE TABLE IF NOT
+        // EXISTS`, etc.), so retrying on a stale-socket drop is
+        // safe — same idempotency as `query()` reads/writes.
+        await this.runWithRetry(() => this.sql.unsafe(sql));
       },
       transaction: async (fn: () => Promise<void>): Promise<void> => {
-        await this.sql.begin(async () => {
+        // Transactions are inherently retry-safe at the
+        // CONNECTION_ENDED boundary: if the connection dies before
+        // BEGIN takes effect server-side, no work was committed and
+        // re-running `fn` produces the correct end state. The retry
+        // only catches the connection-level reject from the
+        // postgres.js client; a partial-commit + drop scenario
+        // surfaces as a different error code and bypasses the
+        // retry, preserving the caller's expectation that a
+        // returned transaction either fully committed or fully
+        // rolled back.
+        await this.runWithRetry(() => this.sql.begin(async () => {
           await fn();
-        });
+        }));
       },
     };
   }
@@ -59,25 +72,34 @@ export class PostgresEngine extends SqlStorageEngine {
       prepare: false,
       // Connection-pool resilience. Managed Postgres providers
       // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
-      // connections server-side after a few minutes. Without these
-      // knobs, porsager/postgres keeps stale sockets in the pool;
-      // the next query on one rejects with
-      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
-      // as a hard failure to the caller. Two complementary settings:
+      // connections server-side after a few minutes — and on
+      // Railway in particular, mid-stream drops within a few
+      // seconds of inactivity are common. Without these knobs,
+      // porsager/postgres keeps stale sockets in the pool; the
+      // next query on one rejects with
+      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`,
+      // surfacing as a hard failure to the caller.
       //
-      //   - `idle_timeout: 20` closes idle connections client-side
-      //     after 20s, before any reasonable provider-side timer
-      //     fires. Fresh connection on next checkout = no stale
-      //     socket race.
-      //   - `max_lifetime: 600` (10 min) recycles long-lived
-      //     connections defensively even if they've stayed busy,
-      //     which sidesteps a separate class of provider-side
-      //     "max connection age" limits.
+      //   - `idle_timeout: 5` closes idle connections client-side
+      //     aggressively. Empirically Railway's pg drops sockets
+      //     well before the 20s value that managed-provider docs
+      //     suggest; 5s is short enough to win the race in
+      //     practice while staying long enough that bursty
+      //     workloads still get connection reuse.
+      //   - `max_lifetime: 300` (5 min) recycles long-lived
+      //     connections defensively. Even with idle_timeout, a
+      //     connection that's been actively serving small queries
+      //     for an hour can hit provider-side max-age limits.
+      //   - `connect_timeout: 10` — slightly less patient on
+      //     initial connect than the 30s default. Combined with
+      //     the retry below, "connection refused" surfaces faster
+      //     during incidents and the caller can shed load instead
+      //     of stacking up.
       //
-      // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
-      // pool size + initial connect behavior unchanged.
-      idle_timeout: 20,
-      max_lifetime: 60 * 10,
+      // Pool size (`max: 10`) unchanged.
+      idle_timeout: 5,
+      max_lifetime: 60 * 5,
+      connect_timeout: 10,
     });
   }
@@ -147,33 +169,53 @@ export class PostgresEngine extends SqlStorageEngine {
   }
   /**
-   * Single retry on a transient connection-layer failure. The
-   * `idle_timeout` / `max_lifetime` config above prevents *most*
-   * stale-connection cases, but a query can still race a
-   * provider-initiated drop in flight — the postgres.js client
-   * rejects with `code: "CONNECTION_ENDED"` and the next attempt
-   * checks out a fresh connection from the pool. One retry is
-   * enough; if it fails again the host-side network is genuinely
-   * broken and the caller should see the error.
+   * Retry on transient connection-layer failures. Three attempts
+   * with exponential-ish backoff (0, 50ms, 200ms) — the pool may
+   * have multiple stale sockets accumulated during an idle period
+   * (especially on managed Postgres after boot when no traffic
+   * has flowed for a while), so a single retry can land on a
+   * second stale socket and still fail. Three attempts virtually
+   * always exhausts the staleness wave; if all three throw, the
+   * failure is real and the caller should see it.
    *
-   * Only retries reads + the standard exec/run paths in `query`;
-   * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
-   * `sql.begin(...)` transactions are unwrapped — those are
-   * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
-   * atomically scoped (transactions roll back cleanly), and adding
-   * a retry around them would complicate the transaction
-   * semantics.
+   * Applied to every pg path the executor exposes:
+   *  - `query()` (run/get/all)  — natural retry: queries are
+   *    idempotent at the connection-failure boundary because the
+   *    server-side rollback runs cleanly on socket close.
+   *  - `exec(sql)` for DDL      — `CREATE TABLE IF NOT EXISTS` and
+   *    friends are idempotent by construction.
+   *  - `transaction(fn)`        — only retried when the
+   *    CONNECTION_ENDED reject arrives *before* the transaction
+   *    body started executing on the connection; if it errors
+   *    mid-transaction, the postgres.js client surfaces a
+   *    different error class (the inner SQL error) and bypasses
+   *    this retry, preserving the all-or-nothing semantics.
    */
   private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
-    try {
-      return await fn();
-    } catch (err) {
-      const code = (err as { code?: string } | null | undefined)?.code;
-      if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
+    const backoffs = [0, 50, 200]; // delay in ms before each attempt; the array length is the attempt budget
+    let lastErr: unknown; // most recent retryable error, rethrown once the budget is spent
+    for (let attempt = 0; attempt < backoffs.length; attempt++) {
+      if (backoffs[attempt] > 0) { // the first attempt runs without waiting
+        await new Promise((r) => setTimeout(r, backoffs[attempt]));
+      }
+      try {
         return await fn();
+      } catch (err) {
+        lastErr = err;
+        const code = (err as { code?: string } | null | undefined)?.code; // tolerates null/undefined and code-less rejections
+        if (
+          code === "CONNECTION_ENDED" ||
+          code === "CONNECTION_CLOSED" ||
+          code === "CONNECTION_DESTROYED" ||
+          code === "CONNECT_TIMEOUT" ||
+          code === "ECONNRESET"
+        ) {
+          continue; // retryable connection-layer code: go to the next attempt (if any)
+        }
+        throw err; // any other error is surfaced immediately, without retrying
       }
-      throw err;
     }
+    throw lastErr; // every attempt failed with a retryable code
   }
   private addToPathCache(tenantId: string, path: string): void {

package/src/subagent-manager.ts CHANGED Viewed

@@ -40,6 +40,10 @@ export interface SubagentManager {
     /** Inherit the parent run's telemetry choice — when true, the subagent
      *  run (and its re-runs) emit no telemetry. */
     suppressTelemetry?: boolean;
+    /** The id of the `spawn_subagent` tool call that produced this subagent,
+     *  so its events can carry `parentToolCallId` and a client can attach
+     *  subagent state to the spawning tool's pill. */
+    parentToolCallId?: string;
   }): Promise<SubagentSpawnResult>;
   sendMessage(subagentId: string, message: string): Promise<SubagentSpawnResult>;

package/src/subagent-tools.ts CHANGED Viewed

@@ -46,6 +46,7 @@ export const createSubagentTools = (
         ownerId,
         tenantId: context.tenantId,
         suppressTelemetry: context.suppressTelemetry,
+        parentToolCallId: context.toolCallId,
       });
       return { subagentId, status: "running" };
     },

package/src/tool-dispatcher.ts CHANGED Viewed

@@ -62,7 +62,10 @@ export class ToolDispatcher {
     }
     try {
-      const output = await definition.handler(call.input, context);
+      // Per-call context: stamp the current tool call's id so handlers that
+      // spawn further work (spawn_subagent) can record `parentToolCallId`.
+      // A fresh object — `context` is shared across a batch, don't mutate it.
+      const output = await definition.handler(call.input, { ...context, toolCallId: call.id });
       if (context.abortSignal?.aborted) {
         return {
           callId: call.id,