npm - @poncho-ai/harness - Versions diffs - 0.47.0 → 0.47.1 - Mend

@poncho-ai/harness 0.47.0 → 0.47.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/.turbo/turbo-build.log +5 -5
package/CHANGELOG.md +39 -0
package/dist/index.d.ts +19 -0
package/dist/index.js +53 -4
package/package.json +1 -1
package/src/storage/postgres-engine.ts +55 -3

package/.turbo/turbo-build.log CHANGED Viewed

@@ -1,5 +1,5 @@
-> @poncho-ai/harness@0.47.0 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
+> @poncho-ai/harness@0.47.1 build /home/runner/work/poncho-ai/poncho-ai/packages/harness
 > node scripts/embed-docs.js && tsup src/index.ts --format esm --dts
 [embed-docs] Generated poncho-docs.ts with 4 topics
@@ -8,9 +8,9 @@
 CLI tsup v8.5.1
 CLI Target: es2022
 ESM Build start
+ESM dist/index.js            527.60 KB
 ESM dist/isolate-VY35DGLM.js 49.43 KB
-ESM dist/index.js            525.35 KB
-ESM ⚡️ Build success in 249ms
+ESM ⚡️ Build success in 275ms
 DTS Build start
-DTS ⚡️ Build success in 7482ms
-DTS dist/index.d.ts 85.30 KB
+DTS ⚡️ Build success in 7513ms
+DTS dist/index.d.ts 86.25 KB

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,44 @@
 # @poncho-ai/harness
+## 0.47.1
+### Patch Changes
+- [#122](https://github.com/cesr/poncho-ai/pull/122) [`661536b`](https://github.com/cesr/poncho-ai/commit/661536b8d24691d91dc01e345b828ef6c9884beb) Thanks [@cesr](https://github.com/cesr)! - harness: postgres connection-pool resilience for managed-postgres hosts
+  Managed Postgres providers (Railway, Neon, Heroku, etc.) drop idle
+  TCP connections server-side after a few minutes. The previous
+  postgres-engine config left `idle_timeout` at the porsager/postgres
+  default (0 = never close client-side), so the pool accumulated stale
+  sockets; the first query on one rejected with `write CONNECTION_ENDED
+  <host>:5432` at `durMs=0` and bubbled up as a hard failure to the
+  caller — including user-facing chat turns and the orchestrator's
+  subagent callback rerun.
+  Two complementary settings, plus one belt-and-suspenders retry:
+  - `idle_timeout: 20` — close idle client-side connections before
+    any reasonable provider-side timer fires. Fresh connection on
+    next checkout, no stale-socket race.
+  - `max_lifetime: 60 * 10` (10 min) — recycle long-lived
+    connections defensively, sidestepping provider-side
+    "max connection age" limits.
+  - `private query()` now retries once on `CONNECTION_ENDED` /
+    `CONNECTION_CLOSED` / `CONNECTION_DESTROYED`. Covers the
+    narrow race where a query lands on a connection at the exact
+    instant the provider drops it.
+  Defaults unchanged: `max: 10`, `connect_timeout: 30`. Migration DDL
+  (`sql.unsafe(sql)` inside `executeRaw`) and transactions
+  (`sql.begin(...)`) deliberately don't go through the retry — DDL
+  is `IF NOT EXISTS` idempotent and transactions need atomic scoping.
+  Observed in production: the PonchOS api running on Railway hit this
+  during a subagent test, the orchestrator's auto-callback rerun
+  threw the connection-ended error, a concurrent unhandled async
+  rejection killed the node process, and Railway restarted the
+  replica (~50s). User-facing chat turns started seeing the same
+  error after that. Patch eliminates the source.
 ## 0.47.0
 ### Minor Changes

package/dist/index.d.ts CHANGED Viewed

@@ -1714,6 +1714,25 @@ declare class PostgresEngine extends SqlStorageEngine {
     refreshPathCache(tenantId: string): Promise<void>;
     private patchVfs;
     private query;
+    /**
+     * Single retry on a transient connection-layer failure. The
+     * `idle_timeout` / `max_lifetime` config above prevents *most*
+     * stale-connection cases, but a query can still race a
+     * provider-initiated drop in flight — the postgres.js client
+     * rejects with `code: "CONNECTION_ENDED"` and the next attempt
+     * checks out a fresh connection from the pool. One retry is
+     * enough; if it fails again the host-side network is genuinely
+     * broken and the caller should see the error.
+     *
+     * Only retries reads + the standard exec/run paths in `query`;
+     * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
+     * `sql.begin(...)` transactions are unwrapped — those are
+     * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
+     * atomically scoped (transactions roll back cleanly), and adding
+     * a retry around them would complicate the transaction
+     * semantics.
+     */
+    private runWithRetry;
     private addToPathCache;
     private removeFromPathCache;
 }

package/dist/index.js CHANGED Viewed

@@ -4433,7 +4433,28 @@ var PostgresEngine = class extends SqlStorageEngine {
     this.sql = postgres(url, {
       onnotice: () => {
       },
-      prepare: false
+      prepare: false,
+      // Connection-pool resilience. Managed Postgres providers
+      // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
+      // connections server-side after a few minutes. Without these
+      // knobs, porsager/postgres keeps stale sockets in the pool;
+      // the next query on one rejects with
+      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
+      // as a hard failure to the caller. Two complementary settings:
+      //
+      //   - `idle_timeout: 20` closes idle connections client-side
+      //     after 20s, before any reasonable provider-side timer
+      //     fires. Fresh connection on next checkout = no stale
+      //     socket race.
+      //   - `max_lifetime: 600` (10 min) recycles long-lived
+      //     connections defensively even if they've stayed busy,
+      //     which sidesteps a separate class of provider-side
+      //     "max connection age" limits.
+      //
+      // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
+      // pool size + initial connect behavior unchanged.
+      idle_timeout: 20,
+      max_lifetime: 60 * 10
     });
   }
   async initialize() {
@@ -4477,10 +4498,38 @@ var PostgresEngine = class extends SqlStorageEngine {
     };
   }
   async query(sql, params) {
-    if (!params || params.length === 0) {
-      return this.sql.unsafe(sql);
+    return this.runWithRetry(
+      () => !params || params.length === 0 ? this.sql.unsafe(sql) : this.sql.unsafe(sql, params)
+    );
+  }
+  /**
+   * Single retry on a transient connection-layer failure. The
+   * `idle_timeout` / `max_lifetime` config above prevents *most*
+   * stale-connection cases, but a query can still race a
+   * provider-initiated drop in flight — the postgres.js client
+   * rejects with `code: "CONNECTION_ENDED"` and the next attempt
+   * checks out a fresh connection from the pool. One retry is
+   * enough; if it fails again the host-side network is genuinely
+   * broken and the caller should see the error.
+   *
+   * Only retries reads + the standard exec/run paths in `query`;
+   * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
+   * `sql.begin(...)` transactions are unwrapped — those are
+   * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
+   * atomically scoped (transactions roll back cleanly), and adding
+   * a retry around them would complicate the transaction
+   * semantics.
+   */
+  async runWithRetry(fn) {
+    try {
+      return await fn();
+    } catch (err) {
+      const code = err?.code;
+      if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
+        return await fn();
+      }
+      throw err;
     }
-    return this.sql.unsafe(sql, params);
   }
   addToPathCache(tenantId, path) {
     const paths = this.pathCache.get(tenantId);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@poncho-ai/harness",
-  "version": "0.47.0",
+  "version": "0.47.1",
   "description": "Agent execution runtime - conversation loop, tool dispatch, streaming",
   "repository": {
     "type": "git",

package/src/storage/postgres-engine.ts CHANGED Viewed

@@ -57,6 +57,27 @@ export class PostgresEngine extends SqlStorageEngine {
     this.sql = postgres(url, {
       onnotice: () => {},
       prepare: false,
+      // Connection-pool resilience. Managed Postgres providers
+      // (Railway, Neon, Heroku, etc.) routinely drop idle TCP
+      // connections server-side after a few minutes. Without these
+      // knobs, porsager/postgres keeps stale sockets in the pool;
+      // the next query on one rejects with
+      // `write CONNECTION_ENDED <host>:5432` at `durMs=0`, surfacing
+      // as a hard failure to the caller. Two complementary settings:
+      //
+      //   - `idle_timeout: 20` closes idle connections client-side
+      //     after 20s, before any reasonable provider-side timer
+      //     fires. Fresh connection on next checkout = no stale
+      //     socket race.
+      //   - `max_lifetime: 600` (10 min) recycles long-lived
+      //     connections defensively even if they've stayed busy,
+      //     which sidesteps a separate class of provider-side
+      //     "max connection age" limits.
+      //
+      // Defaults remain `max: 10`, `connect_timeout: 30` — leaving
+      // pool size + initial connect behavior unchanged.
+      idle_timeout: 20,
+      max_lifetime: 60 * 10,
     });
   }
@@ -118,10 +139,41 @@ export class PostgresEngine extends SqlStorageEngine {
   }
   private async query(sql: string, params?: unknown[]): Promise<any[]> {
-    if (!params || params.length === 0) {
-      return this.sql.unsafe(sql);
+    return this.runWithRetry(() =>
+      !params || params.length === 0
+        ? this.sql.unsafe(sql)
+        : this.sql.unsafe(sql, params),
+    );
+  }
+  /**
+   * Single retry on a transient connection-layer failure. The
+   * `idle_timeout` / `max_lifetime` config above prevents *most*
+   * stale-connection cases, but a query can still race a
+   * provider-initiated drop in flight — the postgres.js client
+   * rejects with `code: "CONNECTION_ENDED"` and the next attempt
+   * checks out a fresh connection from the pool. One retry is
+   * enough; if it fails again the host-side network is genuinely
+   * broken and the caller should see the error.
+   *
+   * Only retries reads + the standard exec/run paths in `query`;
+   * `sql.unsafe(sql)` calls in `executeRaw` (migration DDL) and
+   * `sql.begin(...)` transactions are unwrapped — those are
+   * idempotent-by-construction (DDL is `IF NOT EXISTS`) or
+   * atomically scoped (transactions roll back cleanly), and adding
+   * a retry around them would complicate the transaction
+   * semantics.
+   */
+  private async runWithRetry<T>(fn: () => Promise<T>): Promise<T> {
+    try {
+      return await fn();
+    } catch (err) {
+      const code = (err as { code?: string } | null | undefined)?.code;
+      if (code === "CONNECTION_ENDED" || code === "CONNECTION_CLOSED" || code === "CONNECTION_DESTROYED") {
+        return await fn();
+      }
+      throw err;
     }
-    return this.sql.unsafe(sql, params);
   }
   private addToPathCache(tenantId: string, path: string): void {