@poncho-ai/harness 0.52.1 → 0.53.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +5 -5
- package/CHANGELOG.md +106 -0
- package/dist/index.d.ts +43 -15
- package/dist/index.js +198 -82
- package/package.json +2 -2
- package/src/harness.ts +3 -1
- package/src/orchestrator/orchestrator.ts +162 -37
- package/src/state.ts +3 -0
- package/src/storage/postgres-engine.ts +83 -41
- package/src/subagent-manager.ts +4 -0
- package/src/subagent-tools.ts +1 -0
- package/src/tool-dispatcher.ts +4 -1
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
|
|
2
|
-
> @poncho-ai/harness@0.52.1 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
|
|
2
|
+
> @poncho-ai/harness@0.53.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
|
|
3
3
|
> node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
|
|
4
4
|
|
|
5
5
|
[embed-docs] Generated poncho-docs.ts with 4 topics
|
|
@@ -8,9 +8,9 @@
|
|
|
8
8
|
[34mCLI[39m tsup v8.5.1
|
|
9
9
|
[34mCLI[39m Target: es2022
|
|
10
10
|
[34mESM[39m Build start
|
|
11
|
-
[32mESM[39m [1mdist/index.js [22m[32m536.24 KB[39m
|
|
12
11
|
[32mESM[39m [1mdist/isolate-F2PPSUL6.js [22m[32m53.82 KB[39m
|
|
13
|
-
[32mESM[39m
|
|
12
|
+
[32mESM[39m [1mdist/index.js [22m[32m541.42 KB[39m
|
|
13
|
+
[32mESM[39m ⚡️ Build success in 239ms
|
|
14
14
|
[34mDTS[39m Build start
|
|
15
|
-
[32mDTS[39m ⚡️ Build success in
|
|
16
|
-
[32mDTS[39m [1mdist/index.d.ts [22m[
|
|
15
|
+
[32mDTS[39m ⚡️ Build success in 7708ms
|
|
16
|
+
[32mDTS[39m [1mdist/index.d.ts [22m[32m94.04 KB[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,111 @@
|
|
|
1
1
|
# @poncho-ai/harness
|
|
2
2
|
|
|
3
|
+
## 0.53.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- [#145](https://github.com/cesr/poncho-ai/pull/145) [`bfa4976`](https://github.com/cesr/poncho-ai/commit/bfa4976ac8b05a300e22271e23c3bae4aadae2a8) Thanks [@cesr](https://github.com/cesr)! - events: add stable identity so streaming clients match instead of guess
|
|
8
|
+
|
|
9
|
+
Additive fields that let a streaming client reconstruct view-state by
|
|
10
|
+
identity rather than inferring structure from event order (the source of a
|
|
11
|
+
class of reconnect/subagent rendering bugs):
|
|
12
|
+
- `tool:started` / `tool:completed` / `tool:error` now carry `toolCallId`
|
|
13
|
+
(already in scope as `call.id` / `result.callId`). Clients match tool
|
|
14
|
+
pills by id instead of by tool name.
|
|
15
|
+
- `subagent:spawned|completed|error|stopped` now carry `parentToolCallId`
|
|
16
|
+
(the `spawn_subagent` tool call's id) and `task`; `completed`/`error`
|
|
17
|
+
also carry `resultText`. Clients attach subagent state to the spawning
|
|
18
|
+
tool's pill and render the result inline — no header-regex or
|
|
19
|
+
sequential-cursor pairing needed.
|
|
20
|
+
- `ToolContext` gains `toolCallId` so the `spawn_subagent` handler can
|
|
21
|
+
record which call produced the subagent (plumbed: tool-dispatcher →
|
|
22
|
+
spawn handler → `SubagentSpawnOptions.parentToolCallId` →
|
|
23
|
+
`subagentMeta.parentToolCallId` → the events above).
|
|
24
|
+
- `run:started` gains an optional `cause` field in the type
|
|
25
|
+
(`user|continuation|subagent_callback|approval_resume`); emission is
|
|
26
|
+
deferred to a later pass.
|
|
27
|
+
|
|
28
|
+
All fields are additive; older clients ignore them.
|
|
29
|
+
|
|
30
|
+
### Patch Changes
|
|
31
|
+
|
|
32
|
+
- Updated dependencies [[`bfa4976`](https://github.com/cesr/poncho-ai/commit/bfa4976ac8b05a300e22271e23c3bae4aadae2a8)]:
|
|
33
|
+
- @poncho-ai/sdk@1.15.0
|
|
34
|
+
|
|
35
|
+
## 0.52.2
|
|
36
|
+
|
|
37
|
+
### Patch Changes
|
|
38
|
+
|
|
39
|
+
- [#124](https://github.com/cesr/poncho-ai/pull/124) [`4ae26e0`](https://github.com/cesr/poncho-ai/commit/4ae26e0d8d2788f57411f9c17e10766769514f9b) Thanks [@cesr](https://github.com/cesr)! - harness: postgres retry covers exec/transaction + 3 attempts + tighter idle
|
|
40
|
+
|
|
41
|
+
Follow-up to the previous `idle_timeout`/`max_lifetime`/retry patch.
|
|
42
|
+
Live testing on Railway showed the previous values weren't tight
|
|
43
|
+
enough — `write CONNECTION_ENDED postgres.railway.internal:5432`
|
|
44
|
+
still surfaced both during user-facing chat turns and during
|
|
45
|
+
subagent auto-callback reruns, despite the new config and the
|
|
46
|
+
one-shot retry.
|
|
47
|
+
|
|
48
|
+
Two failure modes the previous version didn't cover:
|
|
49
|
+
1. The retry only wrapped `private query()` (executor.run/get/all),
|
|
50
|
+
but `executor.exec` (`sql.unsafe`) and `executor.transaction`
|
|
51
|
+
(`sql.begin`) called the postgres.js client directly. A pg drop
|
|
52
|
+
inside a transaction or migration write threw straight through.
|
|
53
|
+
2. After an idle period the pool can have multiple stale sockets;
|
|
54
|
+
a single retry can checkout a second stale socket from the pool
|
|
55
|
+
and fail again. The one-shot retry was then exhausted, leaving the error visible
|
|
56
|
+
to the caller.
|
|
57
|
+
|
|
58
|
+
Fixes:
|
|
59
|
+
- All three executor paths (`run/get/all`, `exec`, `transaction`)
|
|
60
|
+
now go through the same `runWithRetry` wrapper. Transactions
|
|
61
|
+
only retry the connection-level `CONNECTION_ENDED` reject from
|
|
62
|
+
the postgres.js client — actual SQL errors mid-transaction
|
|
63
|
+
surface as a different error class and bypass the retry,
|
|
64
|
+
preserving atomic semantics.
|
|
65
|
+
- Three attempts with light exponential backoff (0, 50ms, 200ms).
|
|
66
|
+
Enough to ride out a typical staleness wave; if all three fail
|
|
67
|
+
the network is genuinely broken.
|
|
68
|
+
- `CONNECT_TIMEOUT` and `ECONNRESET` added to the retry-eligible
|
|
69
|
+
error codes.
|
|
70
|
+
|
|
71
|
+
Config knobs tightened:
|
|
72
|
+
- `idle_timeout: 5` (was 20). Empirically Railway's pg drops
|
|
73
|
+
sockets well before 20s; 5s wins the race in practice while
|
|
74
|
+
staying long enough for bursty workloads to reuse connections.
|
|
75
|
+
- `max_lifetime: 300` (was 600). Same reasoning — recycle more
|
|
76
|
+
aggressively.
|
|
77
|
+
- `connect_timeout: 10` (was 30 default). Faster failure during
|
|
78
|
+
incidents lets callers shed load instead of stacking up.
|
|
79
|
+
|
|
80
|
+
- [#144](https://github.com/cesr/poncho-ai/pull/144) [`28d640b`](https://github.com/cesr/poncho-ai/commit/28d640b2f82ea780f8e0be90965972d9903c01d7) Thanks [@cesr](https://github.com/cesr)! - orchestrator: make subagent result delivery reliable
|
|
81
|
+
|
|
82
|
+
Subagent results could silently never reach the parent agent. Several
|
|
83
|
+
plumbing bugs in `runSubagent` / `runSubagentContinuation`:
|
|
84
|
+
- **Emit-before-persist race.** `subagent:completed` / `subagent:error`
|
|
85
|
+
were emitted to the parent's event stream _before_ the result was
|
|
86
|
+
written to the store, so a consumer reacting to the event (the parent
|
|
87
|
+
callback, the streaming client) could race the write. Now the result
|
|
88
|
+
is persisted first, then the event is emitted.
|
|
89
|
+
- **Silently swallowed writes.** Two `appendSubagentResult(...).catch(() => {})`
|
|
90
|
+
call sites (the error path and the continuation-error path) dropped the
|
|
91
|
+
result with no trace on a transient store failure. Replaced with a
|
|
92
|
+
shared `appendSubagentResultReliable` helper that retries once and then
|
|
93
|
+
logs loudly — a dropped result is the worst failure mode (the parent
|
|
94
|
+
waits forever on a subagent it thinks is still running).
|
|
95
|
+
- **Un-awaited eventSink.** The subagent-callback run path was the lone
|
|
96
|
+
`this.eventSink(...)` call site that didn't `await` (every other site
|
|
97
|
+
does), so callback-turn events could interleave out of order. Now awaited.
|
|
98
|
+
- **Spawn rejections went to a bare `console.error`.** A background
|
|
99
|
+
`runSubagent` that rejected outside its own try/catch left the parent
|
|
100
|
+
hanging. Both fire-and-forget spawn paths now route to a
|
|
101
|
+
`handleSpawnFailure` that marks the child errored and hands the parent
|
|
102
|
+
an error result so the turn can resume.
|
|
103
|
+
- **`recoverStaleSubagents` now also drains undelivered results.** It
|
|
104
|
+
previously only rescued children stuck in `running`; it now also
|
|
105
|
+
re-triggers the parent callback for any parent that has results sitting
|
|
106
|
+
in the store with no active run (e.g. a result persisted just before a
|
|
107
|
+
process restart, whose in-memory callback trigger was lost).
|
|
108
|
+
|
|
3
109
|
## 0.52.1
|
|
4
110
|
|
|
5
111
|
### Patch Changes
|
package/dist/index.d.ts
CHANGED
|
@@ -199,6 +199,9 @@ interface Conversation {
|
|
|
199
199
|
* subagent's runs emit no telemetry (e.g. spawned from an incognito
|
|
200
200
|
* turn). Read by the orchestrator's runSubagent / continuation. */
|
|
201
201
|
suppressTelemetry?: boolean;
|
|
202
|
+
/** The parent's `spawn_subagent` tool call id — echoed onto subagent:*
|
|
203
|
+
* events so a client can attach subagent state to that tool's pill. */
|
|
204
|
+
parentToolCallId?: string;
|
|
202
205
|
};
|
|
203
206
|
channelMeta?: {
|
|
204
207
|
platform: string;
|
|
@@ -1190,6 +1193,10 @@ interface SubagentManager {
|
|
|
1190
1193
|
/** Inherit the parent run's telemetry choice — when true, the subagent
|
|
1191
1194
|
* run (and its re-runs) emit no telemetry. */
|
|
1192
1195
|
suppressTelemetry?: boolean;
|
|
1196
|
+
/** The id of the `spawn_subagent` tool call that produced this subagent,
|
|
1197
|
+
* so its events can carry `parentToolCallId` and a client can attach
|
|
1198
|
+
* subagent state to the spawning tool's pill. */
|
|
1199
|
+
parentToolCallId?: string;
|
|
1193
1200
|
}): Promise<SubagentSpawnResult>;
|
|
1194
1201
|
sendMessage(subagentId: string, message: string): Promise<SubagentSpawnResult>;
|
|
1195
1202
|
stop(subagentId: string): Promise<void>;
|
|
@@ -1802,22 +1809,27 @@ declare class PostgresEngine extends SqlStorageEngine {
|
|
|
1802
1809
|
private patchVfs;
|
|
1803
1810
|
private query;
|
|
1804
1811
|
/**
|
|
1805
|
-
*
|
|
1806
|
-
*
|
|
1807
|
-
*
|
|
1808
|
-
*
|
|
1809
|
-
*
|
|
1810
|
-
*
|
|
1811
|
-
*
|
|
1812
|
-
*
|
|
1812
|
+
* Retry on transient connection-layer failures. Three attempts
|
|
1813
|
+
* with exponential-ish backoff (0, 50ms, 200ms) — the pool may
|
|
1814
|
+
* have multiple stale sockets accumulated during an idle period
|
|
1815
|
+
* (especially on managed Postgres after boot when no traffic
|
|
1816
|
+
* has flowed for a while), so a single retry can land on a
|
|
1817
|
+
* second stale socket and still fail. Three attempts virtually
|
|
1818
|
+
* always exhaust the staleness wave; if all three throw, the
|
|
1819
|
+
* failure is real and the caller should see it.
|
|
1813
1820
|
*
|
|
1814
|
-
*
|
|
1815
|
-
* `
|
|
1816
|
-
*
|
|
1817
|
-
*
|
|
1818
|
-
*
|
|
1819
|
-
*
|
|
1820
|
-
*
|
|
1821
|
+
* Applied to every pg path the executor exposes:
|
|
1822
|
+
* - `query()` (run/get/all) — natural retry: queries are
|
|
1823
|
+
* idempotent at the connection-failure boundary because the
|
|
1824
|
+
* server-side rollback runs cleanly on socket close.
|
|
1825
|
+
* - `exec(sql)` for DDL — `CREATE TABLE IF NOT EXISTS` and
|
|
1826
|
+
* friends are idempotent by construction.
|
|
1827
|
+
* - `transaction(fn)` — only retried when the
|
|
1828
|
+
* CONNECTION_ENDED reject arrives *before* the transaction
|
|
1829
|
+
* body started executing on the connection; if it errors
|
|
1830
|
+
* mid-transaction, the postgres.js client surfaces a
|
|
1831
|
+
* different error class (the inner SQL error) and bypasses
|
|
1832
|
+
* this retry, preserving the all-or-nothing semantics.
|
|
1821
1833
|
*/
|
|
1822
1834
|
private runWithRetry;
|
|
1823
1835
|
private addToPathCache;
|
|
@@ -2145,6 +2157,22 @@ declare class AgentOrchestrator {
|
|
|
2145
2157
|
processSubagentCallback(conversationId: string, skipLockCheck?: boolean): Promise<void>;
|
|
2146
2158
|
runSubagentContinuation(conversationId: string, conversation: Conversation, continuationMessages: Message[]): AsyncGenerator<AgentEvent>;
|
|
2147
2159
|
createSubagentManager(): SubagentManager;
|
|
2160
|
+
/**
|
|
2161
|
+
* Append a subagent result to its parent, retrying once on a transient
|
|
2162
|
+
* store failure before giving up loudly. A silently dropped result is the
|
|
2163
|
+
* worst subagent failure mode — the parent waits forever on a subagent it
|
|
2164
|
+
* thinks is still running — so this never swallows the error the way the
|
|
2165
|
+
* old `.catch(() => {})` call sites did. Returns whether the result landed.
|
|
2166
|
+
*/
|
|
2167
|
+
private appendSubagentResultReliable;
|
|
2168
|
+
/**
|
|
2169
|
+
* A subagent's fire-and-forget background run rejected outside its own
|
|
2170
|
+
* error handling (e.g. it threw before entering its try block, or the
|
|
2171
|
+
* catch block itself threw). Without this the parent is left waiting on a
|
|
2172
|
+
* subagent that will never report back. Record the failure on the child
|
|
2173
|
+
* and hand the parent an error result so the turn can resume.
|
|
2174
|
+
*/
|
|
2175
|
+
private handleSpawnFailure;
|
|
2148
2176
|
recoverStaleSubagents(): Promise<void>;
|
|
2149
2177
|
}
|
|
2150
2178
|
|
package/dist/index.js
CHANGED
|
@@ -4415,12 +4415,12 @@ var PostgresEngine = class extends SqlStorageEngine {
|
|
|
4415
4415
|
return rows;
|
|
4416
4416
|
},
|
|
4417
4417
|
exec: async (sql) => {
|
|
4418
|
-
await this.sql.unsafe(sql);
|
|
4418
|
+
await this.runWithRetry(() => this.sql.unsafe(sql));
|
|
4419
4419
|
},
|
|
4420
4420
|
transaction: async (fn) => {
|
|
4421
|
-
await this.sql.begin(async () => {
|
|
4421
|
+
await this.runWithRetry(() => this.sql.begin(async () => {
|
|
4422
4422
|
await fn();
|
|
4423
|
-
});
|
|
4423
|
+
}));
|
|
4424
4424
|
}
|
|
4425
4425
|
};
|
|
4426
4426
|
}
|
|
@@ -4438,25 +4438,34 @@ var PostgresEngine = class extends SqlStorageEngine {
|
|
|
4438
4438
|
prepare: false,
|
|
4439
4439
|
// Connection-pool resilience. Managed Postgres providers
|
|
4440
4440
|
// (Railway, Neon, Heroku, etc.) routinely drop idle TCP
|
|
4441
|
-
// connections server-side after a few minutes
|
|
4442
|
-
//
|
|
4443
|
-
//
|
|
4444
|
-
//
|
|
4445
|
-
//
|
|
4441
|
+
// connections server-side after a few minutes — and on
|
|
4442
|
+
// Railway in particular, mid-stream drops within a few
|
|
4443
|
+
// seconds of inactivity are common. Without these knobs,
|
|
4444
|
+
// porsager/postgres keeps stale sockets in the pool; the
|
|
4445
|
+
// next query on one rejects with
|
|
4446
|
+
// `write CONNECTION_ENDED <host>:5432` at `durMs=0`,
|
|
4447
|
+
// surfacing as a hard failure to the caller.
|
|
4446
4448
|
//
|
|
4447
|
-
// - `idle_timeout:
|
|
4448
|
-
//
|
|
4449
|
-
//
|
|
4450
|
-
//
|
|
4451
|
-
//
|
|
4452
|
-
//
|
|
4453
|
-
//
|
|
4454
|
-
//
|
|
4449
|
+
// - `idle_timeout: 5` closes idle connections client-side
|
|
4450
|
+
// aggressively. Empirically Railway's pg drops sockets
|
|
4451
|
+
// well before the 20s value that managed-provider docs
|
|
4452
|
+
// suggest; 5s is short enough to win the race in
|
|
4453
|
+
// practice while staying long enough that bursty
|
|
4454
|
+
// workloads still get connection reuse.
|
|
4455
|
+
// - `max_lifetime: 300` (5 min) recycles long-lived
|
|
4456
|
+
// connections defensively. Even with idle_timeout, a
|
|
4457
|
+
// connection that's been actively serving small queries
|
|
4458
|
+
// for an hour can hit provider-side max-age limits.
|
|
4459
|
+
// - `connect_timeout: 10` — slightly less patient on
|
|
4460
|
+
// initial connect than the 30s default. Combined with
|
|
4461
|
+
// the retry below, "connection refused" surfaces faster
|
|
4462
|
+
// during incidents and the caller can shed load instead
|
|
4463
|
+
// of stacking up.
|
|
4455
4464
|
//
|
|
4456
|
-
//
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4465
|
+
// Pool size (`max: 10`) unchanged.
|
|
4466
|
+
idle_timeout: 5,
|
|
4467
|
+
max_lifetime: 60 * 5,
|
|
4468
|
+
connect_timeout: 10
|
|
4460
4469
|
});
|
|
4461
4470
|
}
|
|
4462
4471
|
async initialize() {
|
|
@@ -4505,33 +4514,47 @@ var PostgresEngine = class extends SqlStorageEngine {
|
|
|
4505
4514
|
);
|
|
4506
4515
|
}
|
|
4507
4516
|
/**
|
|
4508
|
-
*
|
|
4509
|
-
*
|
|
4510
|
-
*
|
|
4511
|
-
*
|
|
4512
|
-
*
|
|
4513
|
-
*
|
|
4514
|
-
*
|
|
4515
|
-
*
|
|
4517
|
+
* Retry on transient connection-layer failures. Three attempts
|
|
4518
|
+
* with exponential-ish backoff (0, 50ms, 200ms) — the pool may
|
|
4519
|
+
* have multiple stale sockets accumulated during an idle period
|
|
4520
|
+
* (especially on managed Postgres after boot when no traffic
|
|
4521
|
+
* has flowed for a while), so a single retry can land on a
|
|
4522
|
+
* second stale socket and still fail. Three attempts virtually
|
|
4523
|
+
* always exhaust the staleness wave; if all three throw, the
|
|
4524
|
+
* failure is real and the caller should see it.
|
|
4516
4525
|
*
|
|
4517
|
-
*
|
|
4518
|
-
* `
|
|
4519
|
-
*
|
|
4520
|
-
*
|
|
4521
|
-
*
|
|
4522
|
-
*
|
|
4523
|
-
*
|
|
4526
|
+
* Applied to every pg path the executor exposes:
|
|
4527
|
+
* - `query()` (run/get/all) — natural retry: queries are
|
|
4528
|
+
* idempotent at the connection-failure boundary because the
|
|
4529
|
+
* server-side rollback runs cleanly on socket close.
|
|
4530
|
+
* - `exec(sql)` for DDL — `CREATE TABLE IF NOT EXISTS` and
|
|
4531
|
+
* friends are idempotent by construction.
|
|
4532
|
+
* - `transaction(fn)` — only retried when the
|
|
4533
|
+
* CONNECTION_ENDED reject arrives *before* the transaction
|
|
4534
|
+
* body started executing on the connection; if it errors
|
|
4535
|
+
* mid-transaction, the postgres.js client surfaces a
|
|
4536
|
+
* different error class (the inner SQL error) and bypasses
|
|
4537
|
+
* this retry, preserving the all-or-nothing semantics.
|
|
4524
4538
|
*/
|
|
4525
4539
|
async runWithRetry(fn) {
|
|
4526
|
-
|
|
4527
|
-
|
|
4528
|
-
|
|
4529
|
-
|
|
4530
|
-
|
|
4540
|
+
const backoffs = [0, 50, 200];
|
|
4541
|
+
let lastErr;
|
|
4542
|
+
for (let attempt = 0; attempt < backoffs.length; attempt++) {
|
|
4543
|
+
if (backoffs[attempt] > 0) {
|
|
4544
|
+
await new Promise((r) => setTimeout(r, backoffs[attempt]));
|
|
4545
|
+
}
|
|
4546
|
+
try {
|
|
4531
4547
|
return await fn();
|
|
4548
|
+
} catch (err) {
|
|
4549
|
+
lastErr = err;
|
|
4550
|
+
const code = err?.code;
|
|
4551
|
+
if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED" || code === "CONNECT_TIMEOUT" || code === "ECONNRESET") {
|
|
4552
|
+
continue;
|
|
4553
|
+
}
|
|
4554
|
+
throw err;
|
|
4532
4555
|
}
|
|
4533
|
-
throw err;
|
|
4534
4556
|
}
|
|
4557
|
+
throw lastErr;
|
|
4535
4558
|
}
|
|
4536
4559
|
addToPathCache(tenantId, path) {
|
|
4537
4560
|
const paths = this.pathCache.get(tenantId);
|
|
@@ -8300,7 +8323,8 @@ var createSubagentTools = (manager) => [
|
|
|
8300
8323
|
parentConversationId: conversationId,
|
|
8301
8324
|
ownerId,
|
|
8302
8325
|
tenantId: context.tenantId,
|
|
8303
|
-
suppressTelemetry: context.suppressTelemetry
|
|
8326
|
+
suppressTelemetry: context.suppressTelemetry,
|
|
8327
|
+
parentToolCallId: context.toolCallId
|
|
8304
8328
|
});
|
|
8305
8329
|
return { subagentId, status: "running" };
|
|
8306
8330
|
}
|
|
@@ -8562,7 +8586,7 @@ var ToolDispatcher = class {
|
|
|
8562
8586
|
};
|
|
8563
8587
|
}
|
|
8564
8588
|
try {
|
|
8565
|
-
const output = await definition.handler(call.input, context);
|
|
8589
|
+
const output = await definition.handler(call.input, { ...context, toolCallId: call.id });
|
|
8566
8590
|
if (context.abortSignal?.aborted) {
|
|
8567
8591
|
return {
|
|
8568
8592
|
callId: call.id,
|
|
@@ -11147,7 +11171,7 @@ ${textContent}` };
|
|
|
11147
11171
|
return;
|
|
11148
11172
|
}
|
|
11149
11173
|
const runtimeToolName = exposedToolNames.get(call.name) ?? call.name;
|
|
11150
|
-
yield pushEvent({ type: "tool:started", tool: runtimeToolName, input: call.input });
|
|
11174
|
+
yield pushEvent({ type: "tool:started", tool: runtimeToolName, toolCallId: call.id, input: call.input });
|
|
11151
11175
|
if (this.requiresApprovalForToolCall(runtimeToolName, call.input)) {
|
|
11152
11176
|
approvalNeeded.push({
|
|
11153
11177
|
approvalId: `approval_${randomUUID5()}`,
|
|
@@ -11340,6 +11364,7 @@ ${textContent}` };
|
|
|
11340
11364
|
yield pushEvent({
|
|
11341
11365
|
type: "tool:error",
|
|
11342
11366
|
tool: result2.tool,
|
|
11367
|
+
toolCallId: result2.callId,
|
|
11343
11368
|
error: result2.error,
|
|
11344
11369
|
recoverable: true
|
|
11345
11370
|
});
|
|
@@ -11381,6 +11406,7 @@ ${textContent}` };
|
|
|
11381
11406
|
yield pushEvent({
|
|
11382
11407
|
type: "tool:completed",
|
|
11383
11408
|
tool: result2.tool,
|
|
11409
|
+
toolCallId: result2.callId,
|
|
11384
11410
|
input: callInputMap.get(result2.callId),
|
|
11385
11411
|
output: result2.output,
|
|
11386
11412
|
duration: now() - batchStart,
|
|
@@ -12803,11 +12829,14 @@ var AgentOrchestrator = class {
|
|
|
12803
12829
|
result: { status: "completed", response: responseText, steps: 0, tokens: { input: 0, output: 0, cached: 0 }, duration: 0 },
|
|
12804
12830
|
timestamp: Date.now()
|
|
12805
12831
|
};
|
|
12806
|
-
await this.
|
|
12832
|
+
await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
|
|
12807
12833
|
await this.eventSink(conv.parentConversationId, {
|
|
12808
12834
|
type: "subagent:completed",
|
|
12809
12835
|
subagentId,
|
|
12810
|
-
conversationId: subagentId
|
|
12836
|
+
conversationId: subagentId,
|
|
12837
|
+
task: conv.subagentMeta?.task ?? conv.title,
|
|
12838
|
+
parentToolCallId: conv.subagentMeta?.parentToolCallId,
|
|
12839
|
+
resultText: responseText
|
|
12811
12840
|
});
|
|
12812
12841
|
await this.triggerParentCallback(conv.parentConversationId);
|
|
12813
12842
|
}
|
|
@@ -12883,9 +12912,11 @@ var AgentOrchestrator = class {
|
|
|
12883
12912
|
let latestRunId = "";
|
|
12884
12913
|
let runResult;
|
|
12885
12914
|
let runError;
|
|
12915
|
+
let parentToolCallId;
|
|
12886
12916
|
try {
|
|
12887
12917
|
const conversation = await this.conversationStore.getWithArchive(childConversationId);
|
|
12888
12918
|
if (!conversation) throw new Error("Subagent conversation not found");
|
|
12919
|
+
parentToolCallId = conversation.subagentMeta?.parentToolCallId;
|
|
12889
12920
|
if (conversation.subagentMeta?.status === "stopped") return;
|
|
12890
12921
|
conversation.lastActivityAt = Date.now();
|
|
12891
12922
|
await this.conversationStore.update(conversation);
|
|
@@ -13077,12 +13108,6 @@ var AgentOrchestrator = class {
|
|
|
13077
13108
|
};
|
|
13078
13109
|
await this.conversationStore.update(conv);
|
|
13079
13110
|
}
|
|
13080
|
-
this.hooks?.onStreamEnd?.(childConversationId);
|
|
13081
|
-
await this.eventSink(parentConversationId, {
|
|
13082
|
-
type: "subagent:completed",
|
|
13083
|
-
subagentId: childConversationId,
|
|
13084
|
-
conversationId: childConversationId
|
|
13085
|
-
});
|
|
13086
13111
|
let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
|
|
13087
13112
|
if (!gathered) {
|
|
13088
13113
|
const freshSubConv = await this.conversationStore.get(childConversationId);
|
|
@@ -13104,7 +13129,16 @@ var AgentOrchestrator = class {
|
|
|
13104
13129
|
...abnormal ? { error: { code: runError?.code ?? "SUBAGENT_INCOMPLETE", message: runError?.message ?? "subagent ended without a result" } } : {},
|
|
13105
13130
|
timestamp: Date.now()
|
|
13106
13131
|
};
|
|
13107
|
-
await this.
|
|
13132
|
+
await this.appendSubagentResultReliable(parentConversationId, pendingResult);
|
|
13133
|
+
this.hooks?.onStreamEnd?.(childConversationId);
|
|
13134
|
+
await this.eventSink(parentConversationId, {
|
|
13135
|
+
type: "subagent:completed",
|
|
13136
|
+
subagentId: childConversationId,
|
|
13137
|
+
conversationId: childConversationId,
|
|
13138
|
+
task,
|
|
13139
|
+
parentToolCallId,
|
|
13140
|
+
resultText: subagentResponse
|
|
13141
|
+
});
|
|
13108
13142
|
this.triggerParentCallback(parentConversationId).catch(
|
|
13109
13143
|
(err) => console.error(`[poncho][subagent] Parent callback failed:`, err instanceof Error ? err.message : err)
|
|
13110
13144
|
);
|
|
@@ -13121,13 +13155,6 @@ var AgentOrchestrator = class {
|
|
|
13121
13155
|
conv.updatedAt = Date.now();
|
|
13122
13156
|
await this.conversationStore.update(conv);
|
|
13123
13157
|
}
|
|
13124
|
-
this.hooks?.onStreamEnd?.(childConversationId);
|
|
13125
|
-
await this.eventSink(parentConversationId, {
|
|
13126
|
-
type: "subagent:error",
|
|
13127
|
-
subagentId: childConversationId,
|
|
13128
|
-
conversationId: childConversationId,
|
|
13129
|
-
error: errMsg
|
|
13130
|
-
});
|
|
13131
13158
|
const pendingResult = {
|
|
13132
13159
|
subagentId: childConversationId,
|
|
13133
13160
|
task,
|
|
@@ -13135,7 +13162,15 @@ var AgentOrchestrator = class {
|
|
|
13135
13162
|
error: { code: "SUBAGENT_ERROR", message: errMsg },
|
|
13136
13163
|
timestamp: Date.now()
|
|
13137
13164
|
};
|
|
13138
|
-
await this.
|
|
13165
|
+
await this.appendSubagentResultReliable(parentConversationId, pendingResult);
|
|
13166
|
+
this.hooks?.onStreamEnd?.(childConversationId);
|
|
13167
|
+
await this.eventSink(parentConversationId, {
|
|
13168
|
+
type: "subagent:error",
|
|
13169
|
+
subagentId: childConversationId,
|
|
13170
|
+
conversationId: childConversationId,
|
|
13171
|
+
error: errMsg,
|
|
13172
|
+
task,
|
|
13173
|
+
parentToolCallId
|
|
13139
13174
|
});
|
|
13140
13175
|
this.triggerParentCallback(parentConversationId).catch(
|
|
13141
13176
|
(err2) => console.error(`[poncho][subagent] Parent callback failed:`, err2 instanceof Error ? err2.message : err2)
|
|
@@ -13251,12 +13286,12 @@ ${resultBody}`,
|
|
|
13251
13286
|
},
|
|
13252
13287
|
initialContextTokens: conversation.contextTokens ?? 0,
|
|
13253
13288
|
initialContextWindow: conversation.contextWindow ?? 0,
|
|
13254
|
-
onEvent: (event) => {
|
|
13289
|
+
onEvent: async (event) => {
|
|
13255
13290
|
if (event.type === "run:started") {
|
|
13256
13291
|
const active = this.activeConversationRuns.get(conversationId);
|
|
13257
13292
|
if (active) active.runId = event.runId;
|
|
13258
13293
|
}
|
|
13259
|
-
this.eventSink(conversationId, event);
|
|
13294
|
+
await this.eventSink(conversationId, event);
|
|
13260
13295
|
}
|
|
13261
13296
|
});
|
|
13262
13297
|
flushTurnDraft(execution.draft);
|
|
@@ -13442,11 +13477,6 @@ ${resultBody}`,
|
|
|
13442
13477
|
await this.conversationStore.update(conv);
|
|
13443
13478
|
}
|
|
13444
13479
|
this.activeSubagentRuns.delete(conversationId);
|
|
13445
|
-
await this.eventSink(parentConversationId, {
|
|
13446
|
-
type: "subagent:completed",
|
|
13447
|
-
subagentId: conversationId,
|
|
13448
|
-
conversationId
|
|
13449
|
-
});
|
|
13450
13480
|
let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
|
|
13451
13481
|
if (!gathered) {
|
|
13452
13482
|
const freshSubConv = await this.conversationStore.get(conversationId);
|
|
@@ -13464,7 +13494,17 @@ ${resultBody}`,
|
|
|
13464
13494
|
...abnormal ? { error: { code: runError?.code ?? "SUBAGENT_INCOMPLETE", message: runError?.message ?? "subagent ended without a result" } } : {},
|
|
13465
13495
|
timestamp: Date.now()
|
|
13466
13496
|
};
|
|
13467
|
-
await this.
|
|
13497
|
+
await this.appendSubagentResultReliable(parentConversationId, result);
|
|
13498
|
+
}
|
|
13499
|
+
await this.eventSink(parentConversationId, {
|
|
13500
|
+
type: "subagent:completed",
|
|
13501
|
+
subagentId: conversationId,
|
|
13502
|
+
conversationId,
|
|
13503
|
+
task,
|
|
13504
|
+
parentToolCallId: conversation.subagentMeta?.parentToolCallId,
|
|
13505
|
+
resultText: subagentResponse
|
|
13506
|
+
});
|
|
13507
|
+
if (parentConv) {
|
|
13468
13508
|
if (this.isServerless) {
|
|
13469
13509
|
this.hooks.dispatchBackground("subagent-callback", parentConversationId);
|
|
13470
13510
|
} else {
|
|
@@ -13492,11 +13532,6 @@ ${resultBody}`,
|
|
|
13492
13532
|
conv.updatedAt = Date.now();
|
|
13493
13533
|
await this.conversationStore.update(conv);
|
|
13494
13534
|
}
|
|
13495
|
-
await this.eventSink(conversation.parentConversationId, {
|
|
13496
|
-
type: "subagent:completed",
|
|
13497
|
-
subagentId: conversationId,
|
|
13498
|
-
conversationId
|
|
13499
|
-
});
|
|
13500
13535
|
const parentConv = await this.conversationStore.get(conversation.parentConversationId);
|
|
13501
13536
|
if (parentConv) {
|
|
13502
13537
|
const result = {
|
|
@@ -13506,12 +13541,22 @@ ${resultBody}`,
|
|
|
13506
13541
|
error: { code: "CONTINUATION_ERROR", message: err instanceof Error ? err.message : String(err) },
|
|
13507
13542
|
timestamp: Date.now()
|
|
13508
13543
|
};
|
|
13509
|
-
await this.
|
|
13544
|
+
await this.appendSubagentResultReliable(conversation.parentConversationId, result);
|
|
13545
|
+
}
|
|
13546
|
+
await this.eventSink(conversation.parentConversationId, {
|
|
13547
|
+
type: "subagent:completed",
|
|
13548
|
+
subagentId: conversationId,
|
|
13549
|
+
conversationId,
|
|
13550
|
+
task,
|
|
13551
|
+
parentToolCallId: conversation.subagentMeta?.parentToolCallId
|
|
13552
|
+
});
|
|
13553
|
+
if (parentConv) {
|
|
13510
13554
|
if (this.isServerless) {
|
|
13511
13555
|
this.hooks.dispatchBackground("subagent-callback", conversation.parentConversationId);
|
|
13512
13556
|
} else {
|
|
13513
|
-
this.processSubagentCallback(conversation.parentConversationId).catch(
|
|
13514
|
-
|
|
13557
|
+
this.processSubagentCallback(conversation.parentConversationId).catch(
|
|
13558
|
+
(err2) => console.error(`[poncho][subagent] Continuation-error callback failed:`, err2 instanceof Error ? err2.message : err2)
|
|
13559
|
+
);
|
|
13515
13560
|
}
|
|
13516
13561
|
}
|
|
13517
13562
|
}
|
|
@@ -13533,7 +13578,7 @@ ${resultBody}`,
|
|
|
13533
13578
|
opts.tenantId ?? null,
|
|
13534
13579
|
{
|
|
13535
13580
|
parentConversationId: opts.parentConversationId,
|
|
13536
|
-
subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry },
|
|
13581
|
+
subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry, parentToolCallId: opts.parentToolCallId },
|
|
13537
13582
|
messages: [{ role: "user", content: opts.task }]
|
|
13538
13583
|
}
|
|
13539
13584
|
);
|
|
@@ -13545,7 +13590,8 @@ ${resultBody}`,
|
|
|
13545
13590
|
type: "subagent:spawned",
|
|
13546
13591
|
subagentId: conversation.conversationId,
|
|
13547
13592
|
conversationId: conversation.conversationId,
|
|
13548
|
-
task: opts.task
|
|
13593
|
+
task: opts.task,
|
|
13594
|
+
parentToolCallId: opts.parentToolCallId
|
|
13549
13595
|
});
|
|
13550
13596
|
if (this.isServerless) {
|
|
13551
13597
|
this.hooks.dispatchBackground("subagent-run", conversation.conversationId);
|
|
@@ -13555,7 +13601,7 @@ ${resultBody}`,
|
|
|
13555
13601
|
opts.parentConversationId,
|
|
13556
13602
|
opts.task,
|
|
13557
13603
|
opts.ownerId
|
|
13558
|
-
).catch((err) =>
|
|
13604
|
+
).catch((err) => this.handleSpawnFailure(conversation.conversationId, opts.parentConversationId, opts.task, err));
|
|
13559
13605
|
}
|
|
13560
13606
|
return { subagentId: conversation.conversationId };
|
|
13561
13607
|
},
|
|
@@ -13588,7 +13634,7 @@ ${resultBody}`,
|
|
|
13588
13634
|
conversation.parentConversationId,
|
|
13589
13635
|
message,
|
|
13590
13636
|
conversation.ownerId
|
|
13591
|
-
).catch((err) =>
|
|
13637
|
+
).catch((err) => this.handleSpawnFailure(subagentId, conversation.parentConversationId, message, err));
|
|
13592
13638
|
}
|
|
13593
13639
|
return { subagentId };
|
|
13594
13640
|
},
|
|
@@ -13667,6 +13713,67 @@ ${resultBody}`,
|
|
|
13667
13713
|
};
|
|
13668
13714
|
}
|
|
13669
13715
|
// ── Stale subagent recovery ──
|
|
13716
|
+
/**
|
|
13717
|
+
* Append a subagent result to its parent, retrying once on a transient
|
|
13718
|
+
* store failure before giving up loudly. A silently dropped result is the
|
|
13719
|
+
* worst subagent failure mode — the parent waits forever on a subagent it
|
|
13720
|
+
* thinks is still running — so this never swallows the error the way the
|
|
13721
|
+
* old `.catch(() => {})` call sites did. Returns whether the result landed.
|
|
13722
|
+
*/
|
|
13723
|
+
async appendSubagentResultReliable(parentConversationId, result) {
|
|
13724
|
+
try {
|
|
13725
|
+
await this.conversationStore.appendSubagentResult(parentConversationId, result);
|
|
13726
|
+
return true;
|
|
13727
|
+
} catch (firstErr) {
|
|
13728
|
+
try {
|
|
13729
|
+
await this.conversationStore.appendSubagentResult(parentConversationId, result);
|
|
13730
|
+
return true;
|
|
13731
|
+
} catch (secondErr) {
|
|
13732
|
+
console.error(
|
|
13733
|
+
`[poncho][subagent] FAILED to persist result for subagent ${result.subagentId} to parent ${parentConversationId} after 2 attempts \u2014 the parent will not see this result:`,
|
|
13734
|
+
secondErr instanceof Error ? secondErr.message : secondErr,
|
|
13735
|
+
`(first attempt: ${firstErr instanceof Error ? firstErr.message : firstErr})`
|
|
13736
|
+
);
|
|
13737
|
+
return false;
|
|
13738
|
+
}
|
|
13739
|
+
}
|
|
13740
|
+
}
|
|
13741
|
+
/**
|
|
13742
|
+
* A subagent's fire-and-forget background run rejected outside its own
|
|
13743
|
+
* error handling (e.g. it threw before entering its try block, or the
|
|
13744
|
+
* catch block itself threw). Without this the parent is left waiting on a
|
|
13745
|
+
* subagent that will never report back. Record the failure on the child
|
|
13746
|
+
* and hand the parent an error result so the turn can resume.
|
|
13747
|
+
*/
|
|
13748
|
+
async handleSpawnFailure(childConversationId, parentConversationId, task, err) {
|
|
13749
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
13750
|
+
console.error(`[poncho][subagent] Background run failed for ${childConversationId}:`, message);
|
|
13751
|
+
try {
|
|
13752
|
+
const conv = await this.conversationStore.get(childConversationId);
|
|
13753
|
+
if (conv?.subagentMeta && conv.subagentMeta.status === "running") {
|
|
13754
|
+
conv.subagentMeta = {
|
|
13755
|
+
...conv.subagentMeta,
|
|
13756
|
+
status: "error",
|
|
13757
|
+
error: { code: "SUBAGENT_SPAWN_FAILED", message }
|
|
13758
|
+
};
|
|
13759
|
+
conv.updatedAt = Date.now();
|
|
13760
|
+
await this.conversationStore.update(conv);
|
|
13761
|
+
}
|
|
13762
|
+
} catch {
|
|
13763
|
+
}
|
|
13764
|
+
const appended = await this.appendSubagentResultReliable(parentConversationId, {
|
|
13765
|
+
subagentId: childConversationId,
|
|
13766
|
+
task,
|
|
13767
|
+
status: "error",
|
|
13768
|
+
error: { code: "SUBAGENT_SPAWN_FAILED", message },
|
|
13769
|
+
timestamp: Date.now()
|
|
13770
|
+
});
|
|
13771
|
+
if (appended) {
|
|
13772
|
+
this.triggerParentCallback(parentConversationId).catch(
|
|
13773
|
+
(e) => console.error(`[poncho][subagent] Parent callback failed after spawn failure:`, e instanceof Error ? e.message : e)
|
|
13774
|
+
);
|
|
13775
|
+
}
|
|
13776
|
+
}
|
|
13670
13777
|
async recoverStaleSubagents() {
|
|
13671
13778
|
const allSummaries = await this.conversationStore.listSummaries();
|
|
13672
13779
|
const subagentSummaries = allSummaries.filter((s) => s.parentConversationId);
|
|
@@ -13692,11 +13799,20 @@ ${resultBody}`,
|
|
|
13692
13799
|
error: conv.subagentMeta.error,
|
|
13693
13800
|
timestamp: Date.now()
|
|
13694
13801
|
};
|
|
13695
|
-
await this.
|
|
13802
|
+
await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
|
|
13696
13803
|
parentsToCallback.add(conv.parentConversationId);
|
|
13697
13804
|
}
|
|
13698
13805
|
}
|
|
13699
13806
|
}
|
|
13807
|
+
const parentIds = new Set(
|
|
13808
|
+
subagentSummaries.map((s) => s.parentConversationId).filter((id) => !!id)
|
|
13809
|
+
);
|
|
13810
|
+
for (const parentId of parentIds) {
|
|
13811
|
+
if (parentsToCallback.has(parentId)) continue;
|
|
13812
|
+
if (this.activeConversationRuns.has(parentId)) continue;
|
|
13813
|
+
const parent = await this.conversationStore.get(parentId);
|
|
13814
|
+
if (parent?.pendingSubagentResults?.length) parentsToCallback.add(parentId);
|
|
13815
|
+
}
|
|
13700
13816
|
for (const parentId of parentsToCallback) {
|
|
13701
13817
|
this.processSubagentCallback(parentId).catch(
|
|
13702
13818
|
(err) => console.error(`[poncho][subagent] Recovery callback failed for ${parentId}:`, err instanceof Error ? err.message : err)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@poncho-ai/harness",
|
|
3
|
-
"version": "0.52.1",
|
|
3
|
+
"version": "0.53.0",
|
|
4
4
|
"description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -34,7 +34,7 @@
|
|
|
34
34
|
"mustache": "^4.2.0",
|
|
35
35
|
"yaml": "^2.4.0",
|
|
36
36
|
"zod": "^3.22.0",
|
|
37
|
-
"@poncho-ai/sdk": "1.
|
|
37
|
+
"@poncho-ai/sdk": "1.15.0"
|
|
38
38
|
},
|
|
39
39
|
"peerDependencies": {
|
|
40
40
|
"esbuild": ">=0.17.0",
|
package/src/harness.ts
CHANGED
|
@@ -3344,7 +3344,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
|
|
|
3344
3344
|
return;
|
|
3345
3345
|
}
|
|
3346
3346
|
const runtimeToolName = exposedToolNames.get(call.name) ?? call.name;
|
|
3347
|
-
yield pushEvent({ type: "tool:started", tool: runtimeToolName, input: call.input });
|
|
3347
|
+
yield pushEvent({ type: "tool:started", tool: runtimeToolName, toolCallId: call.id, input: call.input });
|
|
3348
3348
|
if (this.requiresApprovalForToolCall(runtimeToolName, call.input)) {
|
|
3349
3349
|
approvalNeeded.push({
|
|
3350
3350
|
approvalId: `approval_${randomUUID()}`,
|
|
@@ -3563,6 +3563,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
|
|
|
3563
3563
|
yield pushEvent({
|
|
3564
3564
|
type: "tool:error",
|
|
3565
3565
|
tool: result.tool,
|
|
3566
|
+
toolCallId: result.callId,
|
|
3566
3567
|
error: result.error,
|
|
3567
3568
|
recoverable: true,
|
|
3568
3569
|
});
|
|
@@ -3604,6 +3605,7 @@ Code is wrapped in an async IIFE — use \`return\` to return a value to the too
|
|
|
3604
3605
|
yield pushEvent({
|
|
3605
3606
|
type: "tool:completed",
|
|
3606
3607
|
tool: result.tool,
|
|
3608
|
+
toolCallId: result.callId,
|
|
3607
3609
|
input: callInputMap.get(result.callId),
|
|
3608
3610
|
output: result.output,
|
|
3609
3611
|
duration: now() - batchStart,
|
|
@@ -693,12 +693,15 @@ export class AgentOrchestrator {
|
|
|
693
693
|
result: { status: "completed", response: responseText, steps: 0, tokens: { input: 0, output: 0, cached: 0 }, duration: 0 },
|
|
694
694
|
timestamp: Date.now(),
|
|
695
695
|
};
|
|
696
|
-
await this.
|
|
696
|
+
await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
|
|
697
697
|
|
|
698
698
|
await this.eventSink(conv.parentConversationId, {
|
|
699
699
|
type: "subagent:completed",
|
|
700
700
|
subagentId,
|
|
701
701
|
conversationId: subagentId,
|
|
702
|
+
task: conv.subagentMeta?.task ?? conv.title,
|
|
703
|
+
parentToolCallId: conv.subagentMeta?.parentToolCallId,
|
|
704
|
+
resultText: responseText,
|
|
702
705
|
});
|
|
703
706
|
|
|
704
707
|
await this.triggerParentCallback(conv.parentConversationId);
|
|
@@ -796,10 +799,14 @@ export class AgentOrchestrator {
|
|
|
796
799
|
let latestRunId = "";
|
|
797
800
|
let runResult: { status: "completed" | "error" | "cancelled"; response?: string; steps: number; duration: number; continuation?: boolean; continuationMessages?: Message[] } | undefined;
|
|
798
801
|
let runError: { code?: string; message?: string } | undefined;
|
|
802
|
+
// The spawning tool call's id — echoed onto subagent:* events so the
|
|
803
|
+
// client can attach subagent state to that tool's pill.
|
|
804
|
+
let parentToolCallId: string | undefined;
|
|
799
805
|
|
|
800
806
|
try {
|
|
801
807
|
const conversation = await this.conversationStore.getWithArchive(childConversationId);
|
|
802
808
|
if (!conversation) throw new Error("Subagent conversation not found");
|
|
809
|
+
parentToolCallId = conversation.subagentMeta?.parentToolCallId;
|
|
803
810
|
|
|
804
811
|
if (conversation.subagentMeta?.status === "stopped") return;
|
|
805
812
|
|
|
@@ -1012,13 +1019,6 @@ export class AgentOrchestrator {
|
|
|
1012
1019
|
await this.conversationStore.update(conv);
|
|
1013
1020
|
}
|
|
1014
1021
|
|
|
1015
|
-
this.hooks?.onStreamEnd?.(childConversationId);
|
|
1016
|
-
await this.eventSink(parentConversationId, {
|
|
1017
|
-
type: "subagent:completed",
|
|
1018
|
-
subagentId: childConversationId,
|
|
1019
|
-
conversationId: childConversationId,
|
|
1020
|
-
});
|
|
1021
|
-
|
|
1022
1022
|
// Recover the subagent's real output: prefer the run response, then the
|
|
1023
1023
|
// streamed draft, then walk the transcript — discarding the synthetic
|
|
1024
1024
|
// "[Error: ...]" placeholder at each step.
|
|
@@ -1051,7 +1051,21 @@ export class AgentOrchestrator {
|
|
|
1051
1051
|
: {}),
|
|
1052
1052
|
timestamp: Date.now(),
|
|
1053
1053
|
};
|
|
1054
|
-
|
|
1054
|
+
// Persist the result BEFORE emitting subagent:completed: a consumer
|
|
1055
|
+
// reacting to the event (the parent callback, the streaming client)
|
|
1056
|
+
// must find the result already durable in the store, not race its write.
|
|
1057
|
+
await this.appendSubagentResultReliable(parentConversationId, pendingResult);
|
|
1058
|
+
|
|
1059
|
+
this.hooks?.onStreamEnd?.(childConversationId);
|
|
1060
|
+
await this.eventSink(parentConversationId, {
|
|
1061
|
+
type: "subagent:completed",
|
|
1062
|
+
subagentId: childConversationId,
|
|
1063
|
+
conversationId: childConversationId,
|
|
1064
|
+
task,
|
|
1065
|
+
parentToolCallId,
|
|
1066
|
+
resultText: subagentResponse,
|
|
1067
|
+
});
|
|
1068
|
+
|
|
1055
1069
|
this.triggerParentCallback(parentConversationId).catch(err =>
|
|
1056
1070
|
console.error(`[poncho][subagent] Parent callback failed:`, err instanceof Error ? err.message : err),
|
|
1057
1071
|
);
|
|
@@ -1070,22 +1084,26 @@ export class AgentOrchestrator {
|
|
|
1070
1084
|
await this.conversationStore.update(conv);
|
|
1071
1085
|
}
|
|
1072
1086
|
|
|
1087
|
+
const pendingResult: PendingSubagentResult = {
|
|
1088
|
+
subagentId: childConversationId,
|
|
1089
|
+
task,
|
|
1090
|
+
status: "error",
|
|
1091
|
+
error: { code: "SUBAGENT_ERROR", message: errMsg },
|
|
1092
|
+
timestamp: Date.now(),
|
|
1093
|
+
};
|
|
1094
|
+
// Persist before emitting (see the success path); never swallow.
|
|
1095
|
+
await this.appendSubagentResultReliable(parentConversationId, pendingResult);
|
|
1096
|
+
|
|
1073
1097
|
this.hooks?.onStreamEnd?.(childConversationId);
|
|
1074
1098
|
await this.eventSink(parentConversationId, {
|
|
1075
1099
|
type: "subagent:error",
|
|
1076
1100
|
subagentId: childConversationId,
|
|
1077
1101
|
conversationId: childConversationId,
|
|
1078
1102
|
error: errMsg,
|
|
1103
|
+
task,
|
|
1104
|
+
parentToolCallId,
|
|
1079
1105
|
});
|
|
1080
1106
|
|
|
1081
|
-
const pendingResult: PendingSubagentResult = {
|
|
1082
|
-
subagentId: childConversationId,
|
|
1083
|
-
task,
|
|
1084
|
-
status: "error",
|
|
1085
|
-
error: { code: "SUBAGENT_ERROR", message: errMsg },
|
|
1086
|
-
timestamp: Date.now(),
|
|
1087
|
-
};
|
|
1088
|
-
await this.conversationStore.appendSubagentResult(parentConversationId, pendingResult).catch(() => {});
|
|
1089
1107
|
this.triggerParentCallback(parentConversationId).catch(err2 =>
|
|
1090
1108
|
console.error(`[poncho][subagent] Parent callback failed:`, err2 instanceof Error ? err2.message : err2),
|
|
1091
1109
|
);
|
|
@@ -1221,12 +1239,15 @@ export class AgentOrchestrator {
|
|
|
1221
1239
|
},
|
|
1222
1240
|
initialContextTokens: conversation.contextTokens ?? 0,
|
|
1223
1241
|
initialContextWindow: conversation.contextWindow ?? 0,
|
|
1224
|
-
onEvent: (event) => {
|
|
1242
|
+
onEvent: async (event) => {
|
|
1225
1243
|
if (event.type === "run:started") {
|
|
1226
1244
|
const active = this.activeConversationRuns.get(conversationId);
|
|
1227
1245
|
if (active) active.runId = event.runId;
|
|
1228
1246
|
}
|
|
1229
|
-
|
|
1247
|
+
// Await so the event is fully sunk before the next step's events,
|
|
1248
|
+
// matching every other eventSink call site (the callback run path
|
|
1249
|
+
// was the lone fire-and-forget exception).
|
|
1250
|
+
await this.eventSink(conversationId, event);
|
|
1230
1251
|
},
|
|
1231
1252
|
});
|
|
1232
1253
|
flushTurnDraft(execution.draft);
|
|
@@ -1436,11 +1457,6 @@ export class AgentOrchestrator {
|
|
|
1436
1457
|
}
|
|
1437
1458
|
|
|
1438
1459
|
this.activeSubagentRuns.delete(conversationId);
|
|
1439
|
-
await this.eventSink(parentConversationId, {
|
|
1440
|
-
type: "subagent:completed",
|
|
1441
|
-
subagentId: conversationId,
|
|
1442
|
-
conversationId,
|
|
1443
|
-
});
|
|
1444
1460
|
|
|
1445
1461
|
let gathered = realResponseText(runResult?.response) || realResponseText(draft.assistantResponse);
|
|
1446
1462
|
if (!gathered) {
|
|
@@ -1464,8 +1480,20 @@ export class AgentOrchestrator {
|
|
|
1464
1480
|
: {}),
|
|
1465
1481
|
timestamp: Date.now(),
|
|
1466
1482
|
};
|
|
1467
|
-
|
|
1483
|
+
// Persist before emitting completion (see runSubagent).
|
|
1484
|
+
await this.appendSubagentResultReliable(parentConversationId, result);
|
|
1485
|
+
}
|
|
1468
1486
|
|
|
1487
|
+
await this.eventSink(parentConversationId, {
|
|
1488
|
+
type: "subagent:completed",
|
|
1489
|
+
subagentId: conversationId,
|
|
1490
|
+
conversationId,
|
|
1491
|
+
task,
|
|
1492
|
+
parentToolCallId: conversation.subagentMeta?.parentToolCallId,
|
|
1493
|
+
resultText: subagentResponse,
|
|
1494
|
+
});
|
|
1495
|
+
|
|
1496
|
+
if (parentConv) {
|
|
1469
1497
|
if (this.isServerless) {
|
|
1470
1498
|
this.hooks!.dispatchBackground!("subagent-callback", parentConversationId);
|
|
1471
1499
|
} else {
|
|
@@ -1490,12 +1518,6 @@ export class AgentOrchestrator {
|
|
|
1490
1518
|
await this.conversationStore.update(conv);
|
|
1491
1519
|
}
|
|
1492
1520
|
|
|
1493
|
-
await this.eventSink(conversation.parentConversationId!, {
|
|
1494
|
-
type: "subagent:completed",
|
|
1495
|
-
subagentId: conversationId,
|
|
1496
|
-
conversationId,
|
|
1497
|
-
});
|
|
1498
|
-
|
|
1499
1521
|
const parentConv = await this.conversationStore.get(conversation.parentConversationId!);
|
|
1500
1522
|
if (parentConv) {
|
|
1501
1523
|
const result: PendingSubagentResult = {
|
|
@@ -1505,11 +1527,25 @@ export class AgentOrchestrator {
|
|
|
1505
1527
|
error: { code: "CONTINUATION_ERROR", message: err instanceof Error ? err.message : String(err) },
|
|
1506
1528
|
timestamp: Date.now(),
|
|
1507
1529
|
};
|
|
1508
|
-
|
|
1530
|
+
// Persist before emitting; never swallow (was `.catch(() => {})`).
|
|
1531
|
+
await this.appendSubagentResultReliable(conversation.parentConversationId!, result);
|
|
1532
|
+
}
|
|
1533
|
+
|
|
1534
|
+
await this.eventSink(conversation.parentConversationId!, {
|
|
1535
|
+
type: "subagent:completed",
|
|
1536
|
+
subagentId: conversationId,
|
|
1537
|
+
conversationId,
|
|
1538
|
+
task,
|
|
1539
|
+
parentToolCallId: conversation.subagentMeta?.parentToolCallId,
|
|
1540
|
+
});
|
|
1541
|
+
|
|
1542
|
+
if (parentConv) {
|
|
1509
1543
|
if (this.isServerless) {
|
|
1510
1544
|
this.hooks!.dispatchBackground!("subagent-callback", conversation.parentConversationId!);
|
|
1511
1545
|
} else {
|
|
1512
|
-
this.processSubagentCallback(conversation.parentConversationId!).catch(
|
|
1546
|
+
this.processSubagentCallback(conversation.parentConversationId!).catch(err2 =>
|
|
1547
|
+
console.error(`[poncho][subagent] Continuation-error callback failed:`, err2 instanceof Error ? err2.message : err2),
|
|
1548
|
+
);
|
|
1513
1549
|
}
|
|
1514
1550
|
}
|
|
1515
1551
|
}
|
|
@@ -1534,7 +1570,7 @@ export class AgentOrchestrator {
|
|
|
1534
1570
|
opts.tenantId ?? null,
|
|
1535
1571
|
{
|
|
1536
1572
|
parentConversationId: opts.parentConversationId,
|
|
1537
|
-
subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry },
|
|
1573
|
+
subagentMeta: { task: opts.task, status: "running", suppressTelemetry: opts.suppressTelemetry, parentToolCallId: opts.parentToolCallId },
|
|
1538
1574
|
messages: [{ role: "user", content: opts.task }],
|
|
1539
1575
|
},
|
|
1540
1576
|
);
|
|
@@ -1549,6 +1585,7 @@ export class AgentOrchestrator {
|
|
|
1549
1585
|
subagentId: conversation.conversationId,
|
|
1550
1586
|
conversationId: conversation.conversationId,
|
|
1551
1587
|
task: opts.task,
|
|
1588
|
+
parentToolCallId: opts.parentToolCallId,
|
|
1552
1589
|
});
|
|
1553
1590
|
|
|
1554
1591
|
if (this.isServerless) {
|
|
@@ -1559,7 +1596,7 @@ export class AgentOrchestrator {
|
|
|
1559
1596
|
opts.parentConversationId,
|
|
1560
1597
|
opts.task,
|
|
1561
1598
|
opts.ownerId,
|
|
1562
|
-
).catch(err =>
|
|
1599
|
+
).catch(err => this.handleSpawnFailure(conversation.conversationId, opts.parentConversationId, opts.task, err));
|
|
1563
1600
|
}
|
|
1564
1601
|
|
|
1565
1602
|
return { subagentId: conversation.conversationId };
|
|
@@ -1596,7 +1633,7 @@ export class AgentOrchestrator {
|
|
|
1596
1633
|
conversation.parentConversationId,
|
|
1597
1634
|
message,
|
|
1598
1635
|
conversation.ownerId,
|
|
1599
|
-
).catch(err =>
|
|
1636
|
+
).catch(err => this.handleSpawnFailure(subagentId, conversation.parentConversationId!, message, err));
|
|
1600
1637
|
}
|
|
1601
1638
|
|
|
1602
1639
|
return { subagentId };
|
|
@@ -1684,6 +1721,79 @@ export class AgentOrchestrator {
|
|
|
1684
1721
|
|
|
1685
1722
|
// ── Stale subagent recovery ──
|
|
1686
1723
|
|
|
1724
|
+
/**
|
|
1725
|
+
* Append a subagent result to its parent, retrying once on a transient
|
|
1726
|
+
* store failure before giving up loudly. A silently dropped result is the
|
|
1727
|
+
* worst subagent failure mode — the parent waits forever on a subagent it
|
|
1728
|
+
* thinks is still running — so this never swallows the error the way the
|
|
1729
|
+
* old `.catch(() => {})` call sites did. Returns whether the result landed.
|
|
1730
|
+
*/
|
|
1731
|
+
private async appendSubagentResultReliable(
|
|
1732
|
+
parentConversationId: string,
|
|
1733
|
+
result: PendingSubagentResult,
|
|
1734
|
+
): Promise<boolean> {
|
|
1735
|
+
try {
|
|
1736
|
+
await this.conversationStore.appendSubagentResult(parentConversationId, result);
|
|
1737
|
+
return true;
|
|
1738
|
+
} catch (firstErr) {
|
|
1739
|
+
try {
|
|
1740
|
+
await this.conversationStore.appendSubagentResult(parentConversationId, result);
|
|
1741
|
+
return true;
|
|
1742
|
+
} catch (secondErr) {
|
|
1743
|
+
console.error(
|
|
1744
|
+
`[poncho][subagent] FAILED to persist result for subagent ${result.subagentId} ` +
|
|
1745
|
+
`to parent ${parentConversationId} after 2 attempts — the parent will not see this result:`,
|
|
1746
|
+
secondErr instanceof Error ? secondErr.message : secondErr,
|
|
1747
|
+
`(first attempt: ${firstErr instanceof Error ? firstErr.message : firstErr})`,
|
|
1748
|
+
);
|
|
1749
|
+
return false;
|
|
1750
|
+
}
|
|
1751
|
+
}
|
|
1752
|
+
}
|
|
1753
|
+
|
|
1754
|
+
/**
|
|
1755
|
+
* A subagent's fire-and-forget background run rejected outside its own
|
|
1756
|
+
* error handling (e.g. it threw before entering its try block, or the
|
|
1757
|
+
* catch block itself threw). Without this the parent is left waiting on a
|
|
1758
|
+
* subagent that will never report back. Record the failure on the child
|
|
1759
|
+
* and hand the parent an error result so the turn can resume.
|
|
1760
|
+
*/
|
|
1761
|
+
private async handleSpawnFailure(
|
|
1762
|
+
childConversationId: string,
|
|
1763
|
+
parentConversationId: string,
|
|
1764
|
+
task: string,
|
|
1765
|
+
err: unknown,
|
|
1766
|
+
): Promise<void> {
|
|
1767
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1768
|
+
console.error(`[poncho][subagent] Background run failed for ${childConversationId}:`, message);
|
|
1769
|
+
try {
|
|
1770
|
+
const conv = await this.conversationStore.get(childConversationId);
|
|
1771
|
+
if (conv?.subagentMeta && conv.subagentMeta.status === "running") {
|
|
1772
|
+
conv.subagentMeta = {
|
|
1773
|
+
...conv.subagentMeta,
|
|
1774
|
+
status: "error",
|
|
1775
|
+
error: { code: "SUBAGENT_SPAWN_FAILED", message },
|
|
1776
|
+
};
|
|
1777
|
+
conv.updatedAt = Date.now();
|
|
1778
|
+
await this.conversationStore.update(conv);
|
|
1779
|
+
}
|
|
1780
|
+
} catch {
|
|
1781
|
+
// best-effort: the result append below is what the parent actually needs
|
|
1782
|
+
}
|
|
1783
|
+
const appended = await this.appendSubagentResultReliable(parentConversationId, {
|
|
1784
|
+
subagentId: childConversationId,
|
|
1785
|
+
task,
|
|
1786
|
+
status: "error",
|
|
1787
|
+
error: { code: "SUBAGENT_SPAWN_FAILED", message },
|
|
1788
|
+
timestamp: Date.now(),
|
|
1789
|
+
});
|
|
1790
|
+
if (appended) {
|
|
1791
|
+
this.triggerParentCallback(parentConversationId).catch(e =>
|
|
1792
|
+
console.error(`[poncho][subagent] Parent callback failed after spawn failure:`, e instanceof Error ? e.message : e),
|
|
1793
|
+
);
|
|
1794
|
+
}
|
|
1795
|
+
}
|
|
1796
|
+
|
|
1687
1797
|
async recoverStaleSubagents(): Promise<void> {
|
|
1688
1798
|
const allSummaries = await this.conversationStore.listSummaries();
|
|
1689
1799
|
const subagentSummaries = allSummaries.filter((s) => s.parentConversationId);
|
|
@@ -1711,11 +1821,26 @@ export class AgentOrchestrator {
|
|
|
1711
1821
|
error: conv.subagentMeta.error,
|
|
1712
1822
|
timestamp: Date.now(),
|
|
1713
1823
|
};
|
|
1714
|
-
await this.
|
|
1824
|
+
await this.appendSubagentResultReliable(conv.parentConversationId, pendingResult);
|
|
1715
1825
|
parentsToCallback.add(conv.parentConversationId);
|
|
1716
1826
|
}
|
|
1717
1827
|
}
|
|
1718
1828
|
}
|
|
1829
|
+
|
|
1830
|
+
// Also drain parents that already have results sitting in the store but
|
|
1831
|
+
// no active run to deliver them — e.g. a result persisted just before a
|
|
1832
|
+
// process restart, whose in-memory callback trigger was lost. Without
|
|
1833
|
+
// this the parent stays stuck even though its result landed durably.
|
|
1834
|
+
const parentIds = new Set(
|
|
1835
|
+
subagentSummaries.map(s => s.parentConversationId).filter((id): id is string => !!id),
|
|
1836
|
+
);
|
|
1837
|
+
for (const parentId of parentIds) {
|
|
1838
|
+
if (parentsToCallback.has(parentId)) continue;
|
|
1839
|
+
if (this.activeConversationRuns.has(parentId)) continue;
|
|
1840
|
+
const parent = await this.conversationStore.get(parentId);
|
|
1841
|
+
if (parent?.pendingSubagentResults?.length) parentsToCallback.add(parentId);
|
|
1842
|
+
}
|
|
1843
|
+
|
|
1719
1844
|
for (const parentId of parentsToCallback) {
|
|
1720
1845
|
this.processSubagentCallback(parentId).catch(err =>
|
|
1721
1846
|
console.error(`[poncho][subagent] Recovery callback failed for ${parentId}:`, err instanceof Error ? err.message : err),
|
package/src/state.ts
CHANGED
|
@@ -79,6 +79,9 @@ export interface Conversation {
|
|
|
79
79
|
* subagent's runs emit no telemetry (e.g. spawned from an incognito
|
|
80
80
|
* turn). Read by the orchestrator's runSubagent / continuation. */
|
|
81
81
|
suppressTelemetry?: boolean;
|
|
82
|
+
/** The parent's `spawn_subagent` tool call id — echoed onto subagent:*
|
|
83
|
+
* events so a client can attach subagent state to that tool's pill. */
|
|
84
|
+
parentToolCallId?: string;
|
|
82
85
|
};
|
|
83
86
|
channelMeta?: {
|
|
84
87
|
platform: string;
|
|
@@ -36,12 +36,25 @@ export class PostgresEngine extends SqlStorageEngine {
|
|
|
36
36
|
return rows as T[];
|
|
37
37
|
},
|
|
38
38
|
exec: async (sql: string): Promise<void> => {
|
|
39
|
-
|
|
39
|
+
// DDL is idempotent in our migrations (`CREATE TABLE IF NOT
|
|
40
|
+
// EXISTS`, etc.), so retrying on a stale-socket drop is
|
|
41
|
+
// safe — same idempotency as `query()` reads/writes.
|
|
42
|
+
await this.runWithRetry(() => this.sql.unsafe(sql));
|
|
40
43
|
},
|
|
41
44
|
transaction: async (fn: () => Promise<void>): Promise<void> => {
|
|
42
|
-
|
|
45
|
+
// Transactions are inherently retry-safe at the
|
|
46
|
+
// CONNECTION_ENDED boundary: if the connection dies before
|
|
47
|
+
// BEGIN takes effect server-side, no work was committed and
|
|
48
|
+
// re-running `fn` produces the correct end state. The retry
|
|
49
|
+
// only catches the connection-level reject from the
|
|
50
|
+
// postgres.js client; a partial-commit + drop scenario
|
|
51
|
+
// surfaces as a different error code and bypasses the
|
|
52
|
+
// retry, preserving the caller's expectation that a
|
|
53
|
+
// returned transaction either fully committed or fully
|
|
54
|
+
// rolled back.
|
|
55
|
+
await this.runWithRetry(() => this.sql.begin(async () => {
|
|
43
56
|
await fn();
|
|
44
|
-
});
|
|
57
|
+
}));
|
|
45
58
|
},
|
|
46
59
|
};
|
|
47
60
|
}
|
|
@@ -59,25 +72,34 @@ export class PostgresEngine extends SqlStorageEngine {
|
|
|
59
72
|
prepare: false,
|
|
60
73
|
// Connection-pool resilience. Managed Postgres providers
|
|
61
74
|
// (Railway, Neon, Heroku, etc.) routinely drop idle TCP
|
|
62
|
-
// connections server-side after a few minutes
|
|
63
|
-
//
|
|
64
|
-
//
|
|
65
|
-
//
|
|
66
|
-
//
|
|
75
|
+
// connections server-side after a few minutes — and on
|
|
76
|
+
// Railway in particular, mid-stream drops within a few
|
|
77
|
+
// seconds of inactivity are common. Without these knobs,
|
|
78
|
+
// porsager/postgres keeps stale sockets in the pool; the
|
|
79
|
+
// next query on one rejects with
|
|
80
|
+
// `write CONNECTION_ENDED <host>:5432` at `durMs=0`,
|
|
81
|
+
// surfacing as a hard failure to the caller.
|
|
67
82
|
//
|
|
68
|
-
// - `idle_timeout:
|
|
69
|
-
//
|
|
70
|
-
//
|
|
71
|
-
//
|
|
72
|
-
//
|
|
73
|
-
//
|
|
74
|
-
//
|
|
75
|
-
//
|
|
83
|
+
// - `idle_timeout: 5` closes idle connections client-side
|
|
84
|
+
// aggressively. Empirically Railway's pg drops sockets
|
|
85
|
+
// well before the 20s value that managed-provider docs
|
|
86
|
+
// suggest; 5s is short enough to win the race in
|
|
87
|
+
// practice while staying long enough that bursty
|
|
88
|
+
// workloads still get connection reuse.
|
|
89
|
+
// - `max_lifetime: 300` (5 min) recycles long-lived
|
|
90
|
+
// connections defensively. Even with idle_timeout, a
|
|
91
|
+
// connection that's been actively serving small queries
|
|
92
|
+
// for an hour can hit provider-side max-age limits.
|
|
93
|
+
// - `connect_timeout: 10` — slightly less patient on
|
|
94
|
+
// initial connect than the 30s default. Combined with
|
|
95
|
+
// the retry below, "connection refused" surfaces faster
|
|
96
|
+
// during incidents and the caller can shed load instead
|
|
97
|
+
// of stacking up.
|
|
76
98
|
//
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
99
|
+
// Pool size (`max: 10`) unchanged.
|
|
100
|
+
idle_timeout: 5,
|
|
101
|
+
max_lifetime: 60 * 5,
|
|
102
|
+
connect_timeout: 10,
|
|
81
103
|
});
|
|
82
104
|
}
|
|
83
105
|
|
|
@@ -147,33 +169,53 @@ export class PostgresEngine extends SqlStorageEngine {
|
|
|
147
169
|
}
|
|
148
170
|
|
|
149
171
|
/**
|
|
150
|
-
*
|
|
151
|
-
*
|
|
152
|
-
*
|
|
153
|
-
*
|
|
154
|
-
*
|
|
155
|
-
*
|
|
156
|
-
*
|
|
157
|
-
*
|
|
172
|
+
* Retry on transient connection-layer failures. Three attempts
|
|
173
|
+
* with exponential-ish backoff (0, 50ms, 200ms) — the pool may
|
|
174
|
+
* have multiple stale sockets accumulated during an idle period
|
|
175
|
+
* (especially on managed Postgres after boot when no traffic
|
|
176
|
+
* has flowed for a while), so a single retry can land on a
|
|
177
|
+
* second stale socket and still fail. Three attempts virtually
|
|
178
|
+
* always exhausts the staleness wave; if all three throw, the
|
|
179
|
+
* failure is real and the caller should see it.
|
|
158
180
|
*
|
|
159
|
-
*
|
|
160
|
-
* `
|
|
161
|
-
*
|
|
162
|
-
*
|
|
163
|
-
*
|
|
164
|
-
*
|
|
165
|
-
*
|
|
181
|
+
* Applied to every pg path the executor exposes:
|
|
182
|
+
* - `query()` (run/get/all) — natural retry: queries are
|
|
183
|
+
* idempotent at the connection-failure boundary because the
|
|
184
|
+
* server-side rollback runs cleanly on socket close.
|
|
185
|
+
* - `exec(sql)` for DDL — `CREATE TABLE IF NOT EXISTS` and
|
|
186
|
+
* friends are idempotent by construction.
|
|
187
|
+
* - `transaction(fn)` — only retried when the
|
|
188
|
+
* CONNECTION_ENDED reject arrives *before* the transaction
|
|
189
|
+
* body started executing on the connection; if it errors
|
|
190
|
+
* mid-transaction, the postgres.js client surfaces a
|
|
191
|
+
* different error class (the inner SQL error) and bypasses
|
|
192
|
+
* this retry, preserving the all-or-nothing semantics.
|
|
166
193
|
*/
|
|
167
194
|
private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
195
|
+
const backoffs = [0, 50, 200];
|
|
196
|
+
let lastErr: unknown;
|
|
197
|
+
for (let attempt = 0; attempt < backoffs.length; attempt++) {
|
|
198
|
+
if (backoffs[attempt] > 0) {
|
|
199
|
+
await new Promise((r) => setTimeout(r, backoffs[attempt]));
|
|
200
|
+
}
|
|
201
|
+
try {
|
|
173
202
|
return await fn();
|
|
203
|
+
} catch (err) {
|
|
204
|
+
lastErr = err;
|
|
205
|
+
const code = (err as { code?: string } | null | undefined)?.code;
|
|
206
|
+
if (
|
|
207
|
+
code === "CONNECTION_ENDED" ||
|
|
208
|
+
code === "CONNECTION_CLOSED" ||
|
|
209
|
+
code === "CONNECTION_DESTROYED" ||
|
|
210
|
+
code === "CONNECT_TIMEOUT" ||
|
|
211
|
+
code === "ECONNRESET"
|
|
212
|
+
) {
|
|
213
|
+
continue;
|
|
214
|
+
}
|
|
215
|
+
throw err;
|
|
174
216
|
}
|
|
175
|
-
throw err;
|
|
176
217
|
}
|
|
218
|
+
throw lastErr;
|
|
177
219
|
}
|
|
178
220
|
|
|
179
221
|
private addToPathCache(tenantId: string, path: string): void {
|
package/src/subagent-manager.ts
CHANGED
|
@@ -40,6 +40,10 @@ export interface SubagentManager {
|
|
|
40
40
|
/** Inherit the parent run's telemetry choice — when true, the subagent
|
|
41
41
|
* run (and its re-runs) emit no telemetry. */
|
|
42
42
|
suppressTelemetry?: boolean;
|
|
43
|
+
/** The id of the `spawn_subagent` tool call that produced this subagent,
|
|
44
|
+
* so its events can carry `parentToolCallId` and a client can attach
|
|
45
|
+
* subagent state to the spawning tool's pill. */
|
|
46
|
+
parentToolCallId?: string;
|
|
43
47
|
}): Promise<SubagentSpawnResult>;
|
|
44
48
|
|
|
45
49
|
sendMessage(subagentId: string, message: string): Promise<SubagentSpawnResult>;
|
package/src/subagent-tools.ts
CHANGED
package/src/tool-dispatcher.ts
CHANGED
|
@@ -62,7 +62,10 @@ export class ToolDispatcher {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
try {
|
|
65
|
-
|
|
65
|
+
// Per-call context: stamp the current tool call's id so handlers that
|
|
66
|
+
// spawn further work (spawn_subagent) can record `parentToolCallId`.
|
|
67
|
+
// A fresh object — `context` is shared across a batch, don't mutate it.
|
|
68
|
+
const output = await definition.handler(call.input, { ...context, toolCallId: call.id });
|
|
66
69
|
if (context.abortSignal?.aborted) {
|
|
67
70
|
return {
|
|
68
71
|
callId: call.id,
|