@poncho-ai/harness 0.47.0 → 0.47.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- > @poncho-ai/harness@0.47.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
2
+ > @poncho-ai/harness@0.47.1 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
3
3
  > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
4
4
 
5
5
  [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
8
8
  CLI tsup v8.5.1
9
9
  CLI Target: es2022
10
10
  ESM Build start
11
+ ESM dist/index.js 527.60 KB
11
12
  ESM dist/isolate-VY35DGLM.js 49.43 KB
12
- ESM dist/index.js 525.35 KB
13
- ESM ⚡️ Build success in 249ms
13
+ ESM ⚡️ Build success in 275ms
14
14
  DTS Build start
15
- DTS ⚡️ Build success in 7482ms
16
- DTS dist/index.d.ts 85.30 KB
15
+ DTS ⚡️ Build success in 7513ms
16
+ DTS dist/index.d.ts 86.25 KB
package/CHANGELOG.md CHANGED
@@ -1,5 +1,44 @@
1
1
  # @poncho-ai/harness
2
2
 
3
+ ## 0.47.1
4
+
5
+ ### Patch Changes
6
+
7
+ - [#122](https://github.com/cesr/poncho-ai/pull/122) [`661536b`](https://github.com/cesr/poncho-ai/commit/661536b8d24691d91dc01e345b828ef6c9884beb) Thanks [@cesr](https://github.com/cesr)! - harness: postgres connection-pool resilience for managed-postgres hosts
8
+
9
+ Managed Postgres providers (Railway, Neon, Heroku, etc.) drop idle
10
+ TCP connections server-side after a few minutes. The previous
11
+ postgres-engine config left `idle_timeout` at the porsager/postgres
12
+ default (0 = never close client-side), so the pool accumulated stale
13
+ sockets; the first query on a stale socket was rejected with `write CONNECTION_ENDED
14
+ <host>:5432` at `durMs=0` and bubbled up as a hard failure to the
15
+ caller — including user-facing chat turns and the orchestrator's
16
+ subagent callback rerun.
17
+
18
+ Two complementary settings, plus one belt-and-suspenders retry:
19
+ - `idle_timeout: 20` — close idle client-side connections before
20
+ any reasonable provider-side timer fires. Fresh connection on
21
+ next checkout, no stale-socket race.
22
+ - `max_lifetime: 60 * 10` (10 min) — recycle long-lived
23
+ connections defensively, sidestepping provider-side
24
+ "max connection age" limits.
25
+ - `private query()` now retries once on `CONNECTION_ENDED` /
26
+ `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`. Covers the
27
+ narrow race where a query lands on a connection at the exact
28
+ instant the provider drops it.
29
+
30
+ Defaults unchanged: `max: 10`, `connect_timeout: 30`. Migration DDL
31
+ (`sql.unsafe(sql)` inside `executeRaw`) and transactions
32
+ (`sql.begin(...)`) deliberately don't go through the retry — DDL
33
+ is idempotent (`IF NOT EXISTS`) and transactions need atomic scoping.
34
+
35
+ Observed in production: the PonchOS api running on Railway hit this
36
+ during a subagent test, the orchestrator's auto-callback rerun
37
+ threw the connection-ended error, a concurrent unhandled async
38
+ rejection killed the node process, and Railway restarted the
39
+ replica (~50s). User-facing chat turns started seeing the same
40
+ error after that. Patch eliminates the source.
41
+
3
42
  ## 0.47.0
4
43
 
5
44
  ### Minor Changes
package/dist/index.d.ts CHANGED
@@ -1714,6 +1714,25 @@ declare class PostgresEngine extends SqlStorageEngine {
1714
1714
  refreshPathCache(tenantId: string): Promise<void>;
1715
1715
  private patchVfs;
1716
1716
  private query;
1717
+ /**
1718
+ * Single retry on a transient connection-layer failure. The
1719
+ * `idle_timeout` / `max_lifetime` config above prevents *most*
1720
+ * stale-connection cases, but a query can still race a
1721
+ * provider-initiated drop in flight — the postgres.js client
1722
+ * rejects with `code: "CONNECTION_ENDED"` (or `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`) and the next attempt
1723
+ * checks out a fresh connection from the pool. One retry is
1724
+ * enough; if it fails again the host-side network is genuinely
1725
+ * broken and the caller should see the error.
1726
+ *
1727
+ * Only retries reads + the standard exec/run paths in `query`;
1728
+ * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
1729
+ * `sql.begin(...)` transactions are left unwrapped (never retried) — those are
1730
+ * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
1731
+ * atomically scoped (transactions roll back cleanly), and adding
1732
+ * a retry around them would complicate the transaction
1733
+ * semantics.
1734
+ */
1735
+ private runWithRetry;
1717
1736
  private addToPathCache;
1718
1737
  private removeFromPathCache;
1719
1738
  }
package/dist/index.js CHANGED
@@ -4433,7 +4433,28 @@ var PostgresEngine = class extends SqlStorageEngine {
4433
4433
  this.sql = postgres(url, {
4434
4434
  onnotice: () => {
4435
4435
  },
4436
- prepare: false
4436
+ prepare: false,
4437
+ // Connection-pool resilience. Managed Postgres providers
4438
+ // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
4439
+ // connections server-side after a few minutes. Without these
4440
+ // knobs, porsager/postgres keeps stale sockets in the pool;
4441
+ // the next query on one rejects with
4442
+ // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
4443
+ // as a hard failure to the caller. Two complementary settings:
4444
+ //
4445
+ // - `idle_timeout: 20` closes idle connections client-side
4446
+ // after 20s, before any reasonable provider-side timer
4447
+ // fires. Fresh connection on next checkout = no stale
4448
+ // socket race.
4449
+ // - `max_lifetime: 600` (10 min) recycles long-lived
4450
+ // connections defensively even if they've stayed busy,
4451
+ // which sidesteps a separate class of provider-side
4452
+ // "max connection age" limits.
4453
+ //
4454
+ // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
4455
+ // pool size + initial connect behavior unchanged.
4456
+ idle_timeout: 20,
4457
+ max_lifetime: 60 * 10
4437
4458
  });
4438
4459
  }
4439
4460
  async initialize() {
@@ -4477,10 +4498,38 @@ var PostgresEngine = class extends SqlStorageEngine {
4477
4498
  };
4478
4499
  }
4479
4500
  async query(sql, params) {
4480
- if (!params || params.length === 0) {
4481
- return this.sql.unsafe(sql);
4501
+ return this.runWithRetry(
4502
+ () => !params || params.length === 0 ? this.sql.unsafe(sql) : this.sql.unsafe(sql, params)
4503
+ );
4504
+ }
4505
+ /**
4506
+ * Single retry on a transient connection-layer failure. The
4507
+ * `idle_timeout` / `max_lifetime` config above prevents *most*
4508
+ * stale-connection cases, but a query can still race a
4509
+ * provider-initiated drop in flight — the postgres.js client
4510
+ * rejects with `code: "CONNECTION_ENDED"` (or `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`) and the next attempt
4511
+ * checks out a fresh connection from the pool. One retry is
4512
+ * enough; if it fails again the host-side network is genuinely
4513
+ * broken and the caller should see the error.
4514
+ *
4515
+ * Only retries reads + the standard exec/run paths in `query`;
4516
+ * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
4517
+ * `sql.begin(...)` transactions are left unwrapped (never retried) — those are
4518
+ * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
4519
+ * atomically scoped (transactions roll back cleanly), and adding
4520
+ * a retry around them would complicate the transaction
4521
+ * semantics.
4522
+ */
4523
+ async runWithRetry(fn) {
4524
+ try {
4525
+ return await fn();
4526
+ } catch (err) {
4527
+ const code = err?.code;
4528
+ if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
4529
+ return await fn();
4530
+ }
4531
+ throw err;
4482
4532
  }
4483
- return this.sql.unsafe(sql, params);
4484
4533
  }
4485
4534
  addToPathCache(tenantId, path) {
4486
4535
  const paths = this.pathCache.get(tenantId);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@poncho-ai/harness",
3
- "version": "0.47.0",
3
+ "version": "0.47.1",
4
4
  "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
5
5
  "repository": {
6
6
  "type": "git",
@@ -57,6 +57,27 @@ export class PostgresEngine extends SqlStorageEngine {
57
57
  this.sql = postgres(url, {
58
58
  onnotice: () => {},
59
59
  prepare: false,
60
+ // Connection-pool resilience. Managed Postgres providers
61
+ // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
62
+ // connections server-side after a few minutes. Without these
63
+ // knobs, porsager/postgres keeps stale sockets in the pool;
64
+ // the next query on one rejects with
65
+ // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
66
+ // as a hard failure to the caller. Two complementary settings:
67
+ //
68
+ // - `idle_timeout: 20` closes idle connections client-side
69
+ // after 20s, before any reasonable provider-side timer
70
+ // fires. Fresh connection on next checkout = no stale
71
+ // socket race.
72
+ // - `max_lifetime: 600` (10 min) recycles long-lived
73
+ // connections defensively even if they've stayed busy,
74
+ // which sidesteps a separate class of provider-side
75
+ // "max connection age" limits.
76
+ //
77
+ // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
78
+ // pool size + initial connect behavior unchanged.
79
+ idle_timeout: 20,
80
+ max_lifetime: 60 * 10,
60
81
  });
61
82
  }
62
83
 
@@ -118,10 +139,41 @@ export class PostgresEngine extends SqlStorageEngine {
118
139
  }
119
140
 
120
141
  private async query(sql: string, params?: unknown[]): Promise<any[]> {
121
- if (!params || params.length === 0) {
122
- return this.sql.unsafe(sql);
142
+ return this.runWithRetry(() =>
143
+ !params || params.length === 0
144
+ ? this.sql.unsafe(sql)
145
+ : this.sql.unsafe(sql, params),
146
+ );
147
+ }
148
+
149
+ /**
150
+ * Single retry on a transient connection-layer failure. The
151
+ * `idle_timeout` / `max_lifetime` config above prevents *most*
152
+ * stale-connection cases, but a query can still race a
153
+ * provider-initiated drop in flight — the postgres.js client
154
+ * rejects with `code: "CONNECTION_ENDED"` (or `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`) and the next attempt
155
+ * checks out a fresh connection from the pool. One retry is
156
+ * enough; if it fails again the host-side network is genuinely
157
+ * broken and the caller should see the error.
158
+ *
159
+ * Only retries reads + the standard exec/run paths in `query`;
160
+ * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
161
+ * `sql.begin(...)` transactions are left unwrapped (never retried) — those are
162
+ * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
163
+ * atomically scoped (transactions roll back cleanly), and adding
164
+ * a retry around them would complicate the transaction
165
+ * semantics.
166
+ */
167
+ private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
168
+ try {
169
+ return await fn();
170
+ } catch (err) {
171
+ const code = (err as { code?: string } | null | undefined)?.code;
172
+ if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
173
+ return await fn();
174
+ }
175
+ throw err;
123
176
  }
124
- return this.sql.unsafe(sql, params);
125
177
  }
126
178
 
127
179
  private addToPathCache(tenantId: string, path: string): void {