pg-aequor 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +24 -0
- package/README.md +197 -0
- package/index.d.ts +206 -0
- package/index.js +7 -0
- package/lib/client.js +418 -0
- package/lib/lease.js +124 -0
- package/lib/reaper.js +117 -0
- package/lib/retry.js +59 -0
- package/package.json +46 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Dmitry Serikoff
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
|
package/README.md
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/pg-aequor-banner.png" alt="PG-Aequor banner" width="720" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<a href="https://github.com/dimaq12/pg-aequor/actions/workflows/ci.yml">
|
|
7
|
+
<img alt="CI" src="https://github.com/dimaq12/pg-aequor/actions/workflows/ci.yml/badge.svg" />
|
|
8
|
+
</a>
|
|
9
|
+
<a href="https://www.npmjs.com/package/pg-aequor">
|
|
10
|
+
<img alt="npm" src="https://img.shields.io/npm/v/pg-aequor.svg" />
|
|
11
|
+
</a>
|
|
12
|
+
<a href="./LICENSE">
|
|
13
|
+
<img alt="license" src="https://img.shields.io/npm/l/pg-aequor.svg" />
|
|
14
|
+
</a>
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
<h1 align="center">pg-aequor</h1>
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
Crash-safe PostgreSQL client for <strong>Serverless runtimes</strong> (AWS Lambda / similar).
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
Have you ever…
|
|
24
|
+
|
|
25
|
+
- …had a Lambda “freeze”, then watched Postgres slowly fill up with <strong>idle zombie connections</strong> until you hit <code>max_connections</code>?
|
|
26
|
+
- …seen bursts of <code>sorry, too many clients already</code> during traffic spikes?
|
|
27
|
+
- …debugged <strong>random runtime crashes</strong> where the root cause was a dead PG socket (and the error bubbled out of an event handler)?
|
|
28
|
+
- …felt like you need PgBouncer/RDS Proxy, but you just want a client-side fix?
|
|
29
|
+
|
|
30
|
+
Standard <code>pg</code> + Lambda scale-outs often end in zombie connections: a Lambda freezes, its TCP socket stays alive on the DB, and a new wave of invocations keeps opening connections until you hit <code>max_connections</code>.
|
|
31
|
+
|
|
32
|
+
<strong>pg-aequor</strong> prevents this using <strong>Signed Leases</strong> + a lightweight <strong>Distributed Reaper</strong>.
|
|
33
|
+
|
|
34
|
+
## Use cases
|
|
35
|
+
|
|
36
|
+
- **Zombie connection storms**: old frozen containers keep sockets around; new invocations create more connections; the DB falls over.
|
|
37
|
+
- **“Unexplained” runtime exits**: some runtimes treat unhandled socket errors as fatal. `pg-aequor` swallows errors in pg event handlers and forces a safe reconnect path.
|
|
38
|
+
- **Spiky cold starts**: retries with decorrelated jitter + SQLSTATE filtering smooth transient network/DB restarts without turning retries into a synchronized stampede.
|
|
39
|
+
|
|
40
|
+
## Table of contents
|
|
41
|
+
|
|
42
|
+
- [Features](#features)
|
|
43
|
+
- [Install](#install)
|
|
44
|
+
- [Quick start](#quick-start)
|
|
45
|
+
- [Use cases](#use-cases)
|
|
46
|
+
- [How it works](#how-it-works-in-one-minute)
|
|
47
|
+
- [Operational rules](#operational-rules-important)
|
|
48
|
+
- [Configuration](#configuration)
|
|
49
|
+
- [Observability (hooks)](#observability-hooks)
|
|
50
|
+
- [Production checklist](#production-checklist)
|
|
51
|
+
- [FAQ](#faq)
|
|
52
|
+
|
|
53
|
+
## Features
|
|
54
|
+
|
|
55
|
+
- **Signed leases in `application_name`**: each connection self-identifies with expiration + HMAC.
|
|
56
|
+
- **Distributed reaper**: one request occasionally becomes the “leader” and reaps expired connections.
|
|
57
|
+
- **Advisory locks**: coordination via Postgres locks (no external coordinator).
|
|
58
|
+
- **Crash safety**: socket errors are swallowed from event handlers to prevent runtime crashes.
|
|
59
|
+
- **Safe retries**: decorrelated jitter + SQLSTATE filtering for transient failures.
|
|
60
|
+
- **Hooks**: lightweight observability callbacks (metrics/tracing).
|
|
61
|
+
|
|
62
|
+
## Install
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
npm install pg-aequor pg
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
> **Note:** `pg` is a peer dependency. Tested with `pg@^8.11.0`.
|
|
69
|
+
|
|
70
|
+
## Quick start
|
|
71
|
+
|
|
72
|
+
```js
|
|
73
|
+
const { AequorClient } = require('pg-aequor')
|
|
74
|
+
|
|
75
|
+
const client = new AequorClient({
|
|
76
|
+
host: process.env.DB_HOST,
|
|
77
|
+
user: process.env.DB_USER,
|
|
78
|
+
password: process.env.DB_PASSWORD,
|
|
79
|
+
database: process.env.DB_NAME,
|
|
80
|
+
|
|
81
|
+
// Coordination Secret (distinct from DB password)
|
|
82
|
+
coordinationSecret: process.env.COORD_SECRET,
|
|
83
|
+
})
|
|
84
|
+
|
|
85
|
+
await client.connect()
|
|
86
|
+
const res = await client.query('SELECT NOW()')
|
|
87
|
+
await client.clean() // or: await client.end()
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## How it works (in one minute)
|
|
91
|
+
|
|
92
|
+
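In long-running servers, a process keeps its connections for its whole lifetime and closes them on shutdown. In serverless, a container can “freeze” at any moment while its connection stays open on the database.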
|
|
93
|
+
|
|
94
|
+
We solve this via:
|
|
95
|
+
|
|
96
|
+
1. **Signed Leases**: each connection stores `expiration + signature` in `application_name`.
|
|
97
|
+
2. **Distributed Reaper**: on connect (probabilistically), one instance scans `pg_stat_activity` and terminates expired connections.
|
|
98
|
+
3. **Advisory Locks**: `pg_try_advisory_lock` ensures only one leader reaps at a time.
|
|
99
|
+
|
|
100
|
+
## Operational rules (important)
|
|
101
|
+
|
|
102
|
+
- **Disposable idle**: if a connection is idle longer than its lease TTL, it becomes eligible to be reaped by another instance.
|
|
103
|
+
- **Single-connection architecture**: the reaper runs on the active connection (under lock) to avoid “reaper storms”.
|
|
104
|
+
- **Hooks must be fast**: don’t do heavy work inside hooks; use them for metrics/tracing only.
|
|
105
|
+
|
|
106
|
+
## Configuration
|
|
107
|
+
|
|
108
|
+
### Lease / reaper (recommended)
|
|
109
|
+
|
|
110
|
+
| Option | Type | Default | Notes |
|
|
111
|
+
| --- | --- | --- | --- |
|
|
112
|
+
| `coordinationSecret` | `string` | _(required)_ | Shared secret for HMAC signing. **Do not** use DB password. Must be at least 16 bytes. |
|
|
113
|
+
| `leaseMode` | `'required' \| 'optional'` | `'required'` | If `optional` and `coordinationSecret` is missing: lease/reaper/heartbeat are disabled. |
|
|
114
|
+
| `leaseTtlMs` | `number` | `90000` | Lease TTL. |
|
|
115
|
+
| `reaper` | `boolean` | `true` | Enable/disable reaper. |
|
|
116
|
+
| `reaperRunProbability` | `number` | `0.1` | Probability of trying a reaper pass on connect (0..1). |
|
|
117
|
+
| `reaperCooldownMs` | `number` | `30000` | Minimum time between reaper runs per container. |
|
|
118
|
+
| `minConnectionIdleTimeSec` | `number` | `180` | Minimum idle seconds to consider a connection a candidate. |
|
|
119
|
+
| `maxIdleConnectionsToKill` | `number` | `10` | Max zombies to kill in one pass. |
|
|
120
|
+
|
|
121
|
+
### Retries
|
|
122
|
+
|
|
123
|
+
| Option | Type | Default |
|
|
124
|
+
| --- | --- | --- |
|
|
125
|
+
| `retries` | `number` | `3` |
|
|
126
|
+
| `minBackoff` | `number` | `100` |
|
|
127
|
+
| `maxBackoff` | `number` | `2000` |
|
|
128
|
+
| `maxConnectRetryTimeMs` | `number` | `15000` |
|
|
129
|
+
| `maxQueryRetryTimeMs` | `number` | `15000` |
|
|
130
|
+
|
|
131
|
+
Retries back off with **decorrelated jitter**, and a **SQLSTATE-based** classification decides which errors are retried at all, to avoid duplicating non-idempotent writes.
|
|
132
|
+
|
|
133
|
+
## Observability (hooks)
|
|
134
|
+
|
|
135
|
+
```js
|
|
136
|
+
const { AequorClient } = require('pg-aequor')
|
|
137
|
+
|
|
138
|
+
const client = new AequorClient({
|
|
139
|
+
// ...pg config...
|
|
140
|
+
coordinationSecret: process.env.COORD_SECRET,
|
|
141
|
+
|
|
142
|
+
hooks: {
|
|
143
|
+
onQueryRetry: ({ retries, err }) => {
|
|
144
|
+
console.warn(`Retry #${retries} due to ${err.code}`)
|
|
145
|
+
},
|
|
146
|
+
onReap: ({ locked, killed, durationMs }) => {
|
|
147
|
+
// metrics: how often we reap, and how many zombies we killed
|
|
148
|
+
if (locked && killed > 0) console.log(`Reaped ${killed} zombies in ${durationMs}ms`)
|
|
149
|
+
},
|
|
150
|
+
onClientDead: ({ source, meta }) => {
|
|
151
|
+
// Great place for EMF/X-Ray/etc
|
|
152
|
+
console.log('Client dead:', source, meta?.sqlstate)
|
|
153
|
+
},
|
|
154
|
+
onQueryStart: ({ startedAt }) => {
|
|
155
|
+
// tracing start
|
|
156
|
+
},
|
|
157
|
+
onQueryEnd: ({ duration }) => {
|
|
158
|
+
// tracing end
|
|
159
|
+
},
|
|
160
|
+
},
|
|
161
|
+
})
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Production checklist
|
|
165
|
+
|
|
166
|
+
### Required Postgres privileges
|
|
167
|
+
|
|
168
|
+
The reaper reads `pg_stat_activity` and calls `pg_terminate_backend()`.
|
|
169
|
+
|
|
170
|
+
> **Heads up:** on managed Postgres, this may require elevated privileges (or be restricted by policy).
|
|
171
|
+
> If the reaper can’t terminate backends, you’ll typically see permission errors and zombies will remain.
|
|
172
|
+
|
|
173
|
+
### Coordination secret hygiene
|
|
174
|
+
|
|
175
|
+
- Use a **separate secret** (not the DB password).
|
|
176
|
+
- Keep it at least **16 bytes**.
|
|
177
|
+
- Rotate carefully: a safe pattern is “deploy new secret everywhere” during a maintenance window, because old/new secrets won’t verify each other’s leases.
|
|
178
|
+
|
|
179
|
+
### Recommended defaults
|
|
180
|
+
|
|
181
|
+
- Start with a conservative `leaseTtlMs` (e.g. `90s`) and `minConnectionIdleTimeSec` (e.g. `180s`) to avoid self-inflicted churn.
|
|
182
|
+
- Keep hooks lightweight (metrics only).
|
|
183
|
+
|
|
184
|
+
## FAQ
|
|
185
|
+
|
|
186
|
+
### Will it kill my active connections?
|
|
187
|
+
|
|
188
|
+
No. The reaper only terminates connections that:
|
|
189
|
+
|
|
190
|
+
- match this service prefix, and
|
|
191
|
+
- have a **valid signature**, and
|
|
192
|
+
- are **expired**, and
|
|
193
|
+
- are **idle** for longer than your configured threshold.
|
|
194
|
+
|
|
195
|
+
### Do I still need PgBouncer/RDS Proxy?
|
|
196
|
+
|
|
197
|
+
If you already have a proxy and it works well for you, keep it. `pg-aequor` is a pure-client approach intended for cases where you can’t or don’t want to add extra infrastructure.
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { Client, ClientConfig, QueryResult, QueryResultRow } from 'pg';
|
|
2
|
+
|
|
3
|
+
export interface AequorClientHooks {
|
|
4
|
+
/**
|
|
5
|
+
* Called when a new database connection is successfully established.
|
|
6
|
+
*/
|
|
7
|
+
onConnect?: (payload: { gen: number }) => void;
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Called when a connection attempt fails and is about to be retried.
|
|
11
|
+
*/
|
|
12
|
+
onReconnect?: (payload: { gen: number; retries: number; delay: number; err: Error }) => void;
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Called when a query fails with a retryable error and is about to be retried.
|
|
16
|
+
*/
|
|
17
|
+
onQueryRetry?: (payload: { retries: number; delay: number; err: Error }) => void;
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* Called when a heartbeat (lease renewal) succeeds.
|
|
21
|
+
*/
|
|
22
|
+
onHeartbeat?: (payload: { gen: number }) => void;
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* Called when a heartbeat fails (either transiently or permanently).
|
|
26
|
+
*/
|
|
27
|
+
onHeartbeatFail?: (payload: { gen: number; err: Error }) => void;
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Called when a reaper pass is attempted (best effort).
|
|
31
|
+
* Useful for metrics: how many zombies were killed and how long it took.
|
|
32
|
+
*/
|
|
33
|
+
onReap?: (payload: { gen: number; locked: boolean; killed: number; durationMs: number }) => void;
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Called when the underlying pg.Client emits an 'error' event or ends unexpectedly.
|
|
37
|
+
* This is a critical signal that the connection is dead.
|
|
38
|
+
*/
|
|
39
|
+
onClientDead?: (payload: { source: 'error' | 'end'; err?: Error; meta?: { sqlstate?: string; [key: string]: any } }) => void;
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Called immediately before a user query is executed. Useful for tracing start time.
|
|
43
|
+
*/
|
|
44
|
+
onQueryStart?: (payload: { args: any[]; startedAt: number }) => void;
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Called immediately after a user query successfully completes.
|
|
48
|
+
*/
|
|
49
|
+
onQueryEnd?: (payload: { args: any[]; res: QueryResult<any>; duration: number }) => void;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Called when a user query fails for good: the error is not retryable, or the retry count / retry time budget is exhausted.
|
|
53
|
+
*/
|
|
54
|
+
onQueryError?: (payload: { args: any[]; err: Error; duration: number }) => void;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
export interface AequorClientConfig extends ClientConfig {
|
|
58
|
+
/**
|
|
59
|
+
* Shared coordination secret for signing leases. Required if leaseMode is 'required'.
|
|
60
|
+
* Conceptually distinct from DB password. Must be at least 16 bytes.
|
|
61
|
+
*/
|
|
62
|
+
coordinationSecret?: string;
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Logical name of the service using this client. Used for advisory lock namespace.
|
|
66
|
+
* Defaults to AWS_LAMBDA_FUNCTION_NAME or 'sls_pg'.
|
|
67
|
+
*/
|
|
68
|
+
serviceName?: string;
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Coordination mode.
|
|
72
|
+
* - 'required': throws if coordinationSecret is missing (default).
|
|
73
|
+
* - 'optional': disables lease/reaper if coordinationSecret is missing.
|
|
74
|
+
*/
|
|
75
|
+
leaseMode?: 'required' | 'optional';
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Enable/disable the background connection reaper. Default: true.
|
|
79
|
+
*/
|
|
80
|
+
reaper?: boolean;
|
|
81
|
+
|
|
82
|
+
/**
|
|
83
|
+
* Probability (0.0 - 1.0) of running the reaper on connect.
|
|
84
|
+
* The legacy 'connUtilization' option is still accepted as a fallback when this is not set. Default: 0.1.
|
|
85
|
+
*/
|
|
86
|
+
reaperRunProbability?: number;
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Minimum time (ms) between reaper runs on this container. Default: 30000 (30s).
|
|
90
|
+
*/
|
|
91
|
+
reaperCooldownMs?: number;
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* How to handle reaper internal errors.
|
|
95
|
+
* - 'swallow': log and ignore (default).
|
|
96
|
+
* - 'throw': throw exception to the caller.
|
|
97
|
+
*/
|
|
98
|
+
reaperErrorMode?: 'swallow' | 'throw';
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Minimum idle time (seconds) before a connection is considered a zombie candidate.
|
|
102
|
+
* Default: 180 (3m).
|
|
103
|
+
*/
|
|
104
|
+
minConnectionIdleTimeSec?: number;
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Maximum number of zombie connections to kill in one reaper pass. Default: 10.
|
|
108
|
+
*/
|
|
109
|
+
maxIdleConnectionsToKill?: number;
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Lease time-to-live in milliseconds. Default: 90000 (90s).
|
|
113
|
+
*/
|
|
114
|
+
leaseTtlMs?: number;
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Soft threshold: once the lease has at most this many ms remaining, a renewal is started in the background before the next query. Default: 30000.
|
|
118
|
+
*/
|
|
119
|
+
heartbeatSoftRemainingMs?: number;
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Hard threshold: once the lease has less than this many ms remaining, the query waits for the renewal to finish before running. Default: 5000.
|
|
123
|
+
*/
|
|
124
|
+
heartbeatHardWaitRemainingMs?: number;
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Time (ms) to wait for set_config heartbeat query before timing out. Default: 2000.
|
|
128
|
+
*/
|
|
129
|
+
heartbeatTimeoutMs?: number;
|
|
130
|
+
|
|
131
|
+
/**
|
|
132
|
+
* Action on heartbeat failure.
|
|
133
|
+
* - 'reconnect': mark client dead and reconnect (safest for serverless).
|
|
134
|
+
* - 'swallow': log and ignore.
|
|
135
|
+
* - 'throw': throw error.
|
|
136
|
+
* Default: 'reconnect'.
|
|
137
|
+
*/
|
|
138
|
+
heartbeatErrorMode?: 'reconnect' | 'swallow' | 'throw';
|
|
139
|
+
|
|
140
|
+
/**
|
|
141
|
+
* Max time (ms) to spend retrying a connect operation. Default: 15000.
|
|
142
|
+
*/
|
|
143
|
+
maxConnectRetryTimeMs?: number;
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Max time (ms) to spend retrying a query operation. Default: 15000.
|
|
147
|
+
*/
|
|
148
|
+
maxQueryRetryTimeMs?: number;
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Default query_timeout (ms) passed to pg if not specified in individual query.
|
|
152
|
+
*/
|
|
153
|
+
defaultQueryTimeoutMs?: number;
|
|
154
|
+
|
|
155
|
+
/**
|
|
156
|
+
* Observability hooks.
|
|
157
|
+
*/
|
|
158
|
+
hooks?: AequorClientHooks;
|
|
159
|
+
|
|
160
|
+
/**
|
|
161
|
+
* Debug logging (console.log). Default: false.
|
|
162
|
+
*/
|
|
163
|
+
debug?: boolean;
|
|
164
|
+
|
|
165
|
+
/**
|
|
166
|
+
* Underlying pg driver instance (e.g. for X-Ray capture).
|
|
167
|
+
*/
|
|
168
|
+
library?: any;
|
|
169
|
+
|
|
170
|
+
// Legacy aliases
|
|
171
|
+
connUtilization?: number;
|
|
172
|
+
applicationName?: string;
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
export class AequorClient {
|
|
176
|
+
constructor(config: AequorClientConfig);
|
|
177
|
+
|
|
178
|
+
/**
|
|
179
|
+
* Establishes a connection (if not already connected) and acquires a lease.
|
|
180
|
+
*/
|
|
181
|
+
connect(): Promise<void>;
|
|
182
|
+
|
|
183
|
+
/**
|
|
184
|
+
* Executes a query with automatic retry and lease management.
|
|
185
|
+
*/
|
|
186
|
+
query<R extends QueryResultRow = any, I extends any[] = any[]>(
|
|
187
|
+
queryTextOrConfig: string | import('pg').QueryConfig<I>,
|
|
188
|
+
values?: I
|
|
189
|
+
): Promise<QueryResult<R>>;
|
|
190
|
+
|
|
191
|
+
/**
|
|
192
|
+
* Gracefully closes the connection.
|
|
193
|
+
*/
|
|
194
|
+
clean(): Promise<void>;
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Alias for clean().
|
|
198
|
+
*/
|
|
199
|
+
end(): Promise<void>;
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Returns the underlying pg.Client instance (if connected).
|
|
203
|
+
* Use with caution.
|
|
204
|
+
*/
|
|
205
|
+
getClient(): Client | null;
|
|
206
|
+
}
|
package/index.js
ADDED
package/lib/client.js
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
const RetryStrategy = require('./retry')
|
|
2
|
+
const LeaseManager = require('./lease')
|
|
3
|
+
const Reaper = require('./reaper')
|
|
4
|
+
const crypto = require('crypto')
|
|
5
|
+
|
|
6
|
+
class AequorClient {
|
|
7
|
+
constructor(config = {}) {
|
|
8
|
+
this._config = config
|
|
9
|
+
this._library = config.library || require('pg')
|
|
10
|
+
this._client = null
|
|
11
|
+
this._isDead = false // Flag to force recreation
|
|
12
|
+
this._generation = 0
|
|
13
|
+
this._connectPromise = null
|
|
14
|
+
|
|
15
|
+
// Retry Strategy
|
|
16
|
+
this._retryStrategy = {
|
|
17
|
+
retries: config.retries ?? 3,
|
|
18
|
+
minBackoff: config.minBackoff ?? 100, // ms
|
|
19
|
+
maxBackoff: config.maxBackoff ?? 2000 // ms
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
// Lease/Reaper mode:
|
|
23
|
+
// - required: coordinationSecret must be provided (safe distributed coordination)
|
|
24
|
+
// - optional: if coordinationSecret missing, disable lease/reaper/heartbeat but client still works
|
|
25
|
+
this._leaseMode = config.leaseMode || 'required' // 'required' | 'optional'
|
|
26
|
+
|
|
27
|
+
// Reaper config (can be disabled if lease is disabled)
|
|
28
|
+
this._reaperEnabled = config.reaper !== false
|
|
29
|
+
this._strategy = {
|
|
30
|
+
// Probability of running a reaper pass on connect (0..1). Alias for backwards compatibility.
|
|
31
|
+
reaperRunProbability: config.reaperRunProbability ?? config.connUtilization ?? 0.1,
|
|
32
|
+
// Default should be minutes, not seconds, otherwise you create your own outages.
|
|
33
|
+
minConnIdleTimeSec: config.minConnectionIdleTimeSec || 180, // Default 3m
|
|
34
|
+
maxIdleConnectionsToKill: config.maxIdleConnectionsToKill || 10,
|
|
35
|
+
reaperErrorMode: config.reaperErrorMode || 'swallow', // 'swallow' | 'throw'
|
|
36
|
+
}
|
|
37
|
+
this._reaperCooldownMs = config.reaperCooldownMs ?? 30000
|
|
38
|
+
// Jittered Cooldown Base: Add random offset to avoid synchronized reapers
|
|
39
|
+
this._reaperBaseInterval = this._reaperCooldownMs + Math.random() * (this._reaperCooldownMs / 3)
|
|
40
|
+
this._reaperCurrentInterval = this._reaperBaseInterval
|
|
41
|
+
this._reaperNextRunAt = 0
|
|
42
|
+
|
|
43
|
+
// Setup Lease Manager
|
|
44
|
+
const serviceName = config.serviceName || process.env.AWS_LAMBDA_FUNCTION_NAME || 'sls_pg'
|
|
45
|
+
// 48-bit random instance id => exactly 8 base64url chars (no padding). Good entropy, tight budget.
|
|
46
|
+
const instanceId = crypto.randomBytes(6).toString('base64url')
|
|
47
|
+
// Explicit coordination secret (NOT db password).
|
|
48
|
+
const coordinationSecret = config.coordinationSecret
|
|
49
|
+
this._baseApplicationName =
|
|
50
|
+
(typeof config.application_name === 'string' && config.application_name) ||
|
|
51
|
+
(typeof config.applicationName === 'string' && config.applicationName) ||
|
|
52
|
+
serviceName
|
|
53
|
+
|
|
54
|
+
if (!coordinationSecret) {
|
|
55
|
+
if (this._leaseMode === 'required') {
|
|
56
|
+
throw new Error('Missing config.coordinationSecret (required for lease/reaper). Set leaseMode=\"optional\" to disable lease/reaper/heartbeat.')
|
|
57
|
+
}
|
|
58
|
+
this._leaseManager = null
|
|
59
|
+
this._reaperEnabled = false
|
|
60
|
+
} else {
|
|
61
|
+
this._leaseManager = new LeaseManager(serviceName, instanceId, coordinationSecret)
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Heartbeat state
|
|
65
|
+
this._leaseExp = 0
|
|
66
|
+
this._heartbeatPromise = null
|
|
67
|
+
this._leaseTtlMs = config.leaseTtlMs ?? 90000
|
|
68
|
+
this._heartbeatSoftRemainingMs = config.heartbeatSoftRemainingMs ?? 30000
|
|
69
|
+
this._heartbeatHardWaitRemainingMs = config.heartbeatHardWaitRemainingMs ?? 5000
|
|
70
|
+
this._heartbeatErrorMode = config.heartbeatErrorMode || 'reconnect' // 'swallow' | 'reconnect' | 'throw'
|
|
71
|
+
this._heartbeatTimeoutMs = config.heartbeatTimeoutMs ?? 2000
|
|
72
|
+
this._defaultQueryTimeoutMs = config.defaultQueryTimeoutMs ?? 0
|
|
73
|
+
|
|
74
|
+
// Logging
|
|
75
|
+
this._logger = config.debug ? console.log : () => {}
|
|
76
|
+
this._hooks = config.hooks || {}
|
|
77
|
+
|
|
78
|
+
// Backoff state (decorrelated jitter needs previous delay)
|
|
79
|
+
this._connectPrevDelay = 0
|
|
80
|
+
this._queryPrevDelay = 0
|
|
81
|
+
this._maxConnectRetryTimeMs = config.maxConnectRetryTimeMs ?? 15000
|
|
82
|
+
this._maxQueryRetryTimeMs = config.maxQueryRetryTimeMs ?? 15000
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
_safeHook(name, payload) {
|
|
86
|
+
const fn = this._hooks && this._hooks[name]
|
|
87
|
+
if (typeof fn !== 'function') return
|
|
88
|
+
try { fn(payload) } catch (_) { /* never throw from hooks */ }
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
async connect() {
|
|
92
|
+
if (this._client && !this._isDead) return
|
|
93
|
+
if (this._connectPromise) return this._connectPromise
|
|
94
|
+
const gen = ++this._generation
|
|
95
|
+
this._connectPromise = (async () => {
|
|
96
|
+
try {
|
|
97
|
+
await this._connectWithRetry(gen)
|
|
98
|
+
} finally {
|
|
99
|
+
this._connectPromise = null
|
|
100
|
+
}
|
|
101
|
+
})()
|
|
102
|
+
return this._connectPromise
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
async _connectWithRetry(gen) {
|
|
106
|
+
const startedAt = Date.now()
|
|
107
|
+
let retries = 0
|
|
108
|
+
while (true) {
|
|
109
|
+
try {
|
|
110
|
+
await this._connect(gen)
|
|
111
|
+
this._connectPrevDelay = 0
|
|
112
|
+
this._safeHook('onConnect', { gen })
|
|
113
|
+
return // Success
|
|
114
|
+
} catch (err) {
|
|
115
|
+
if (this._maxConnectRetryTimeMs > 0 && (Date.now() - startedAt) > this._maxConnectRetryTimeMs) {
|
|
116
|
+
throw err
|
|
117
|
+
}
|
|
118
|
+
if (!RetryStrategy.isRetryable(err) || retries >= this._retryStrategy.retries) {
|
|
119
|
+
throw err
|
|
120
|
+
}
|
|
121
|
+
retries++
|
|
122
|
+
const delay = RetryStrategy.getBackoff(
|
|
123
|
+
this._retryStrategy.minBackoff,
|
|
124
|
+
this._retryStrategy.maxBackoff,
|
|
125
|
+
this._connectPrevDelay
|
|
126
|
+
)
|
|
127
|
+
this._connectPrevDelay = delay
|
|
128
|
+
this._safeHook('onReconnect', { gen, retries, delay, err })
|
|
129
|
+
this._logger(`Connect Retry ${retries}/${this._retryStrategy.retries} after ${delay}ms: ${err.message}`)
|
|
130
|
+
await new Promise(res => setTimeout(res, delay))
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
async _connect(gen) {
|
|
136
|
+
// Internal cleanup before creating a new client should NOT invalidate this generation.
|
|
137
|
+
await this._disposeClient('reconnect', { bumpGeneration: false })
|
|
138
|
+
|
|
139
|
+
// Generate initial lease
|
|
140
|
+
let appName = String(this._baseApplicationName || 'app').slice(0, 63)
|
|
141
|
+
if (this._leaseManager) {
|
|
142
|
+
this._leaseExp = Date.now() + this._leaseTtlMs
|
|
143
|
+
appName = this._leaseManager.generateAppName(this._leaseExp)
|
|
144
|
+
} else {
|
|
145
|
+
this._leaseExp = 0
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
const clientConfig = this._buildPgClientConfig({ application_name: appName })
|
|
149
|
+
|
|
150
|
+
const client = new this._library.Client(clientConfig)
|
|
151
|
+
|
|
152
|
+
// Crash Safety: Swallow errors to prevent Runtime.ExitError
|
|
153
|
+
client.on('error', (err) => this._markDeadAndDispose(client, err, 'error'))
|
|
154
|
+
// If connection ends, the client is not reusable.
|
|
155
|
+
client.on('end', () => this._markDeadAndDispose(client, null, 'end'))
|
|
156
|
+
|
|
157
|
+
await client.connect()
|
|
158
|
+
|
|
159
|
+
// Generation guard: do not resurrect if a newer generation started while we were connecting.
|
|
160
|
+
if (this._generation !== gen) {
|
|
161
|
+
try { await client.end() } catch (_) {}
|
|
162
|
+
return
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
this._client = client
|
|
166
|
+
this._isDead = false
|
|
167
|
+
|
|
168
|
+
// Run Reaper if enabled (async, best effort)
|
|
169
|
+
if (this._reaperEnabled) {
|
|
170
|
+
this._reap().catch(err => this._logger('Reap failed:', err.message))
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// Best-effort connection cleanup
|
|
175
|
+
async _reap() {
|
|
176
|
+
// 1. Check Lease Manager
|
|
177
|
+
if (!this._leaseManager) return
|
|
178
|
+
|
|
179
|
+
// 2. Jittered Cooldown + Backoff
|
|
180
|
+
const now = Date.now()
|
|
181
|
+
if (now < this._reaperNextRunAt) return
|
|
182
|
+
|
|
183
|
+
// 3. Use CURRENT client (Single Connection Architecture)
|
|
184
|
+
const client = this._client
|
|
185
|
+
if (!client) return
|
|
186
|
+
|
|
187
|
+
try {
|
|
188
|
+
const startedAt = Date.now()
|
|
189
|
+
const result = await Reaper.reap(client, this._config, this._leaseManager, this._strategy, this._logger)
|
|
190
|
+
const durationMs = Date.now() - startedAt
|
|
191
|
+
// Hook for metrics: how often we attempt, lock status, and how many zombies were killed.
|
|
192
|
+
this._safeHook('onReap', { gen: this._generation, locked: !!result.locked, killed: Number(result.killed || 0), durationMs })
|
|
193
|
+
|
|
194
|
+
if (!result.locked) {
|
|
195
|
+
// Lock busy (someone else is reaping) -> Exponential Backoff
|
|
196
|
+
this._reaperCurrentInterval = Math.min(this._reaperCurrentInterval * 1.5, 600000) // max 10m
|
|
197
|
+
} else {
|
|
198
|
+
// Success (or just acquired lock) -> Reset to Base
|
|
199
|
+
this._reaperCurrentInterval = this._reaperBaseInterval
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// Schedule next run with jitter
|
|
203
|
+
const jitter = Math.random() * (this._reaperCurrentInterval / 2)
|
|
204
|
+
this._reaperNextRunAt = now + this._reaperCurrentInterval + jitter
|
|
205
|
+
|
|
206
|
+
if (result.killed > 0) {
|
|
207
|
+
this._logger(`Reaper: Killed ${result.killed} zombies`)
|
|
208
|
+
}
|
|
209
|
+
} catch (err) {
|
|
210
|
+
this._logger('Reap failed:', err.message)
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
async query(...args) {
|
|
215
|
+
const startedAt = Date.now()
|
|
216
|
+
this._safeHook('onQueryStart', { args, startedAt })
|
|
217
|
+
let retries = 0
|
|
218
|
+
while (true) {
|
|
219
|
+
try {
|
|
220
|
+
if (!this._client || this._isDead) {
|
|
221
|
+
await this.connect()
|
|
222
|
+
} else {
|
|
223
|
+
// Check heartbeat. If lease expired -> WAIT. If OK -> async update.
|
|
224
|
+
await this._heartbeatIfNeeded()
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
const res = await this._client.query(...args)
|
|
228
|
+
this._queryPrevDelay = 0
|
|
229
|
+
this._safeHook('onQueryEnd', { args, res, duration: Date.now() - startedAt })
|
|
230
|
+
return res
|
|
231
|
+
|
|
232
|
+
} catch (err) {
|
|
233
|
+
// If error is NOT retryable, throw immediately
|
|
234
|
+
if (!RetryStrategy.isRetryable(err) || retries >= this._retryStrategy.retries) {
|
|
235
|
+
this._safeHook('onQueryError', { args, err, duration: Date.now() - startedAt })
|
|
236
|
+
throw err
|
|
237
|
+
}
|
|
238
|
+
if (this._maxQueryRetryTimeMs > 0 && (Date.now() - startedAt) > this._maxQueryRetryTimeMs) {
|
|
239
|
+
this._safeHook('onQueryError', { args, err, duration: Date.now() - startedAt })
|
|
240
|
+
throw err
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
retries++
|
|
244
|
+
const delay = RetryStrategy.getBackoff(
|
|
245
|
+
this._retryStrategy.minBackoff,
|
|
246
|
+
this._retryStrategy.maxBackoff,
|
|
247
|
+
this._queryPrevDelay
|
|
248
|
+
)
|
|
249
|
+
this._queryPrevDelay = delay
|
|
250
|
+
this._safeHook('onQueryRetry', { retries, delay, err })
|
|
251
|
+
this._logger(`Query Retry ${retries}/${this._retryStrategy.retries} after ${delay}ms: ${err.message}`)
|
|
252
|
+
|
|
253
|
+
// Force reconnect on next loop
|
|
254
|
+
this._isDead = true
|
|
255
|
+
await this._disposeClient('query_error')
|
|
256
|
+
|
|
257
|
+
await new Promise(res => setTimeout(res, delay))
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
async _heartbeatIfNeeded() {
|
|
263
|
+
if (!this._leaseManager) return
|
|
264
|
+
const gen = this._generation
|
|
265
|
+
const client = this._client
|
|
266
|
+
const now = Date.now()
|
|
267
|
+
const remaining = this._leaseExp - now
|
|
268
|
+
|
|
269
|
+
// If lease has > 30s remaining, we are safe. Do nothing.
|
|
270
|
+
if (remaining > this._heartbeatSoftRemainingMs) return
|
|
271
|
+
|
|
272
|
+
// If lease is expired or close to expiring (< 30s), we need update.
|
|
273
|
+
// Use promise deduplication to avoid thundering herd.
|
|
274
|
+
if (!this._heartbeatPromise) {
|
|
275
|
+
this._heartbeatPromise = this._performHeartbeat(gen, client).finally(() => {
|
|
276
|
+
this._heartbeatPromise = null
|
|
277
|
+
})
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// If lease is ALREADY expired (or < 5s safety margin), we MUST wait for update.
|
|
281
|
+
if (remaining < this._heartbeatHardWaitRemainingMs) {
|
|
282
|
+
await this._heartbeatPromise
|
|
283
|
+
} else {
|
|
284
|
+
// Otherwise, let it update in background (fire-and-forget)
|
|
285
|
+
// This is safe because we still have > 5s lease
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
async _performHeartbeat(gen, client) {
|
|
290
|
+
try {
|
|
291
|
+
if (!this._leaseManager) return
|
|
292
|
+
if (!client || client !== this._client) return
|
|
293
|
+
if (this._generation !== gen) return
|
|
294
|
+
const newExp = Date.now() + this._leaseTtlMs
|
|
295
|
+
const appName = this._leaseManager.generateAppName(newExp)
|
|
296
|
+
// Never interpolate appName into SQL. Use bind parameters.
|
|
297
|
+
const heartbeatQuery = this._client.query(`SELECT set_config('application_name', $1, false)`, [appName])
|
|
298
|
+
const timeout = new Promise((_, reject) => {
|
|
299
|
+
const e = new Error(`Heartbeat timed out after ${this._heartbeatTimeoutMs}ms`)
|
|
300
|
+
e.code = 'ETIMEDOUT'
|
|
301
|
+
setTimeout(() => reject(e), this._heartbeatTimeoutMs)
|
|
302
|
+
})
|
|
303
|
+
const res = await Promise.race([heartbeatQuery, timeout])
|
|
304
|
+
if (!res) throw new Error('Heartbeat failed: no result')
|
|
305
|
+
// Only update local lease if DB update succeeded.
|
|
306
|
+
if (this._generation === gen && client === this._client) {
|
|
307
|
+
this._leaseExp = newExp
|
|
308
|
+
this._safeHook('onHeartbeat', { gen })
|
|
309
|
+
}
|
|
310
|
+
} catch (err) {
|
|
311
|
+
this._logger('Heartbeat failed:', err.message)
|
|
312
|
+
this._safeHook('onHeartbeatFail', { gen, err })
|
|
313
|
+
// If we're in hard-wait territory and heartbeat fails, do NOT keep a client that
|
|
314
|
+
// is now invisible to other reapers (lease can expire). Default action: reconnect.
|
|
315
|
+
if (this._heartbeatErrorMode === 'throw') throw err
|
|
316
|
+
if (this._heartbeatErrorMode === 'reconnect') {
|
|
317
|
+
// In soft zone we already decided heartbeat matters. Don't limp along into expiry.
|
|
318
|
+
// If it's retryable, definitely reconnect. If it's non-retryable, reconnect won't help,
|
|
319
|
+
// but it's still safer than staying in an inconsistent lease state.
|
|
320
|
+
this._isDead = true
|
|
321
|
+
await this._disposeClient('heartbeat_failed')
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
_buildPgClientConfig(overrides = {}) {
|
|
327
|
+
const clientConfig = { ...this._config, ...overrides }
|
|
328
|
+
if (!clientConfig.query_timeout && this._defaultQueryTimeoutMs > 0) {
|
|
329
|
+
clientConfig.query_timeout = this._defaultQueryTimeoutMs
|
|
330
|
+
}
|
|
331
|
+
// Strip internal fields (keep pg config clean and future-proof)
|
|
332
|
+
const internalKeys = [
|
|
333
|
+
'library',
|
|
334
|
+
'reaper',
|
|
335
|
+
'reaperRunProbability',
|
|
336
|
+
'reaperErrorMode',
|
|
337
|
+
'connUtilization', // legacy alias
|
|
338
|
+
'minConnectionIdleTimeSec',
|
|
339
|
+
'maxIdleConnectionsToKill',
|
|
340
|
+
'retries',
|
|
341
|
+
'minBackoff',
|
|
342
|
+
'maxBackoff',
|
|
343
|
+
'serviceName',
|
|
344
|
+
'coordinationSecret',
|
|
345
|
+
'debug',
|
|
346
|
+
'leaseTtlMs',
|
|
347
|
+
'heartbeatSoftRemainingMs',
|
|
348
|
+
'heartbeatHardWaitRemainingMs',
|
|
349
|
+
'heartbeatErrorMode',
|
|
350
|
+
'heartbeatTimeoutMs',
|
|
351
|
+
'reaperCooldownMs',
|
|
352
|
+
'leaseMode',
|
|
353
|
+
'applicationName',
|
|
354
|
+
'defaultQueryTimeoutMs',
|
|
355
|
+
'hooks',
|
|
356
|
+
'maxConnectRetryTimeMs',
|
|
357
|
+
'maxQueryRetryTimeMs',
|
|
358
|
+
]
|
|
359
|
+
for (const k of internalKeys) delete clientConfig[k]
|
|
360
|
+
return clientConfig
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
async _disposeClient(reason, { bumpGeneration = true } = {}) {
|
|
364
|
+
if (bumpGeneration) this._generation++
|
|
365
|
+
const old = this._client
|
|
366
|
+
this._client = null
|
|
367
|
+
if (!old) return
|
|
368
|
+
try {
|
|
369
|
+
await old.end()
|
|
370
|
+
} catch (_) {
|
|
371
|
+
// ignore
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
_markDeadAndDispose(client, err, source) {
|
|
376
|
+
// Never throw from event handlers (Lambda crash safety).
|
|
377
|
+
this._isDead = true
|
|
378
|
+
// Invalidate any in-flight connect/heartbeat on older generations.
|
|
379
|
+
this._generation++
|
|
380
|
+
// Atomically detach the client if it is the current one.
|
|
381
|
+
if (this._client === client) {
|
|
382
|
+
this._client = null
|
|
383
|
+
}
|
|
384
|
+
if (err) {
|
|
385
|
+
const meta = {
|
|
386
|
+
code: err.code,
|
|
387
|
+
sqlstate: err.sqlstate,
|
|
388
|
+
errno: err.errno,
|
|
389
|
+
syscall: err.syscall,
|
|
390
|
+
address: err.address,
|
|
391
|
+
port: err.port,
|
|
392
|
+
severity: err.severity,
|
|
393
|
+
routine: err.routine,
|
|
394
|
+
}
|
|
395
|
+
this._logger(`WARN: pg client ${source} (swallowed):`, err.message || err.code, meta)
|
|
396
|
+
this._safeHook('onClientDead', { source, err, meta })
|
|
397
|
+
}
|
|
398
|
+
// Best-effort close; do not await.
|
|
399
|
+
try {
|
|
400
|
+
client.end().catch(() => {})
|
|
401
|
+
} catch (_) {}
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
async clean() {
|
|
405
|
+
// Try to close gracefully
|
|
406
|
+
await this._disposeClient('clean')
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
async end() {
|
|
410
|
+
return this.clean()
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
getClient() {
|
|
414
|
+
return this._client
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
module.exports = AequorClient
|
package/lib/lease.js
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
const crypto = require('crypto')
|
|
2
|
+
|
|
3
|
+
/**
 * Lease Manager
 * Handles generation and verification of signed application_name strings.
 * Format: "s=SERVICE;i=INSTANCE_ID;e=TIMESTAMP;g=HMAC"
 * Short keys used to fit within Postgres 63-byte limit.
 */
class LeaseManager {
  static APP_NAME_MAX_LEN = 63
  static SIG_LEN = 11 // 8 bytes -> base64url w/o padding => 11 chars

  /**
   * @param {string} serviceName - The logical name of the service
   * @param {string} instanceId - Unique ID of this client instance. It is sanitized
   *   but not shortened: with more than 27 characters the fixed parts of the format
   *   leave no room for the service name and generateAppName() will throw.
   * @param {string} secret - Shared secret for HMAC (coordination secret; NOT db password)
   * @throws {Error} if the secret is empty or shorter than 16 bytes
   */
  constructor(serviceName, instanceId, secret) {
    // Make instanceId delimiter-safe.
    this.instanceId = LeaseManager._sanitizeToken(instanceId || 'inst')
    if (!secret) {
      throw new Error('LeaseManager requires a non-empty secret')
    }
    if (Buffer.byteLength(String(secret), 'utf8') < 16) {
      throw new Error('LeaseManager secret is too short; must be at least 16 bytes')
    }
    this.secret = secret
    // Normalize serviceName so the whole application_name fits into 63 bytes.
    this.serviceName = LeaseManager._normalizeServiceName(serviceName || 'sls_pg', this.instanceId)
  }

  /**
   * Generates a signed application_name.
   * @param {number} expirationTs - Unix timestamp (ms) when lease expires
   * @returns {string} The formatted application_name
   * @throws {Error} if generated name exceeds 63 bytes
   */
  generateAppName(expirationTs) {
    // Format: s=...;i=...;e=...
    const base = `s=${this.serviceName};i=${this.instanceId};e=${expirationTs}`
    const result = `${base};g=${this._sign(base)}`

    // Hard guarantee: never exceed the 63-byte limit. The limit is in bytes,
    // so measure the encoded size rather than the number of UTF-16 units.
    const byteLen = Buffer.byteLength(result, 'utf8')
    if (byteLen > LeaseManager.APP_NAME_MAX_LEN) {
      throw new Error(`BUG: application_name too long (${byteLen} > ${LeaseManager.APP_NAME_MAX_LEN}): ${result}`)
    }

    return result
  }

  /**
   * Parses an application_name and verifies its signature.
   * Expiration is reported via `isExpired`; an expired lease is still returned.
   * @param {string} appNameString
   * @returns {Object|null} `{ svc, inst, exp, isExpired, isValidSignature }` if the
   *   format and signature are valid, else null (also for empty or non-string input)
   */
  parseAndVerify(appNameString) {
    // Anything that is not a non-empty string cannot be a lease.
    if (typeof appNameString !== 'string' || appNameString === '') return null

    // Regex for: s=...;i=...;e=...;g=...
    const match = appNameString.match(/^s=([^;]+);i=([^;]+);e=([^;]+);g=([^;]+)$/)
    if (!match) return null

    const [, s, i, eStr, g] = match
    const expectedSig = this._sign(`s=${s};i=${i};e=${eStr}`)

    // Timing-safe signature comparison (timingSafeEqual requires equal lengths)
    const bufG = Buffer.from(g, 'utf8')
    const bufExpected = Buffer.from(expectedSig, 'utf8')
    if (bufG.length !== bufExpected.length || !crypto.timingSafeEqual(bufG, bufExpected)) return null

    const exp = parseInt(eStr, 10)
    if (!Number.isFinite(exp)) return null

    return {
      svc: s,
      inst: i,
      exp,
      isExpired: Date.now() > exp,
      isValidSignature: true
    }
  }

  /**
   * Signs `text` with HMAC-SHA256 keyed by the secret.
   * @param {string} text
   * @returns {string} First 8 bytes of the HMAC as base64url (11 chars, no padding)
   */
  _sign(text) {
    const buf = crypto.createHmac('sha256', this.secret).update(text).digest()
    return buf.subarray(0, 8).toString('base64url')
  }

  /**
   * Replaces every character outside [a-zA-Z0-9:_-] with '_'.
   * This removes the format delimiters ';' and '=' as well as '%'.
   * Note that '_' itself is kept and is a single-character wildcard in SQL LIKE,
   * so a LIKE match built from a sanitized token is only approximate.
   * @param {*} s - Value to sanitize (converted with String())
   * @returns {string}
   */
  static _sanitizeToken(s) {
    return String(s).replace(/[^a-zA-Z0-9:_-]/g, '_')
  }

  /**
   * Sanitizes the service name and shortens it so that the complete
   * application_name fits into APP_NAME_MAX_LEN for the given instanceId.
   * @param {string} serviceName
   * @param {string} instanceId
   * @returns {string} The sanitized name, possibly truncated and suffixed with a hash
   */
  static _normalizeServiceName(serviceName, instanceId) {
    const original = String(serviceName || 'sls_pg')
    const raw = LeaseManager._sanitizeToken(original)
    const inst = LeaseManager._sanitizeToken(instanceId || 'inst')

    // Total format:
    //   s=<svc>;i=<inst>;e=<13digits>;g=<sig>
    // Fixed overhead excluding <svc>: "s="(2) + ";i="(3) + inst + ";e="(3) + 13 + ";g="(3) + SIG_LEN
    // => 24 + instLen + SIG_LEN
    const overhead = 24 + inst.length + LeaseManager.SIG_LEN
    // Clamped to 1: with a very long instanceId the budget is already exhausted
    // and generateAppName() will reject the result.
    const maxSvcLen = Math.max(1, LeaseManager.APP_NAME_MAX_LEN - overhead)

    // If sanitization changed the name, we must add a hash suffix to avoid accidental collisions
    // (different originals mapping to the same sanitized token).
    const needsHash = raw !== original
    if (!needsHash && raw.length <= maxSvcLen) return raw

    // Truncate with a short hash suffix to preserve uniqueness.
    const hash = crypto.createHash('sha1').update(original).digest('hex').slice(0, 8)
    if (maxSvcLen <= hash.length) return hash.slice(0, maxSvcLen)

    const prefixLen = maxSvcLen - (hash.length + 1)
    return `${raw.slice(0, prefixLen)}-${hash}`
  }
}
|
|
123
|
+
|
|
124
|
+
module.exports = LeaseManager
|
package/lib/reaper.js
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
 * Connection Reaper
 * Safely kills zombie connections using Advisory Locks and Signed Leases.
 */
class Reaper {
  // Namespace advisory locks to avoid collisions with other apps in same DB.
  // 0x50474151 corresponds to "PGAQ" (pg-aequor) in ASCII.
  static LOCK_NS = 0x50474151

  /**
   * Runs the reaping process.
   *
   * A connection is terminated only if it is idle for at least
   * `strategy.minConnIdleTimeSec`, carries a lease with a valid signature that
   * names exactly this service, and that lease has expired.
   *
   * @param {Object} client - The connected pg.Client
   * @param {Object} config - Client config (not read by this method)
   * @param {LeaseManager} leaseManager - Provides `serviceName` and `parseAndVerify()`
   * @param {Object} strategy - { minConnIdleTimeSec, maxIdleConnectionsToKill, reaperErrorMode }
   * @param {Function} logger
   * @returns {Promise<Object>} `{ locked, killed }`, plus `error` when a failure was swallowed.
   *   `killed` is the number of backends termination was requested for.
   * @throws the underlying error when `strategy.reaperErrorMode === 'throw'`
   */
  static async reap(client, config, leaseManager, strategy, logger) {
    const serviceName = leaseManager.serviceName
    let locked = false

    try {
      // 1. Acquire Advisory Lock (Non-blocking)
      // The two-key (int, int) form is used: a fixed namespace plus Postgres'
      // hashtext() of the service name, which is a 32-bit integer. Hashing on
      // the DB side keeps the lock key identical for every client.
      const lockRes = await client.query(
        `SELECT pg_try_advisory_lock($1::int, hashtext($2)) as locked`,
        [Reaper.LOCK_NS, serviceName]
      )
      if (lockRes.rows[0].locked !== true) {
        logger(`Reaper[pid=${process.pid}]: Lock busy, skipping`)
        return { locked: false, killed: 0 }
      }
      locked = true

      // 2. Scan for zombies
      const minIdle = strategy.minConnIdleTimeSec

      // Fetch idle connections that look like our service, excluding self.
      // The LIKE prefix is only a coarse prefilter to reduce the result set:
      // '_' in the service name acts as a wildcard there, and application_name
      // is untrusted. The authoritative checks happen below in JS.
      const query = `
        SELECT pid, application_name, extract(epoch from (now() - state_change)) as idle_time
        FROM pg_stat_activity
        WHERE datname = current_database()
          AND state = 'idle'
          AND pid <> pg_backend_pid()
          AND application_name LIKE $1 || '%'
      `

      const res = await client.query(query, [`s=${serviceName};`])
      const candidates = []

      for (const row of res.rows) {
        if (row.idle_time < minIdle) continue

        const lease = leaseManager.parseAndVerify(row.application_name)

        if (!lease) {
          // Invalid format or signature -> Unsafe to touch (could be neighbor with different secret)
          continue
        }

        if (lease.svc !== serviceName) {
          // Validly signed, but for another service that merely matched the
          // LIKE pattern (e.g. via the '_' wildcard) -> not ours to kill.
          continue
        }

        if (lease.isExpired) {
          // Valid signature, but expired -> ZOMBIE
          candidates.push({
            pid: row.pid,
            idle_time: Number(row.idle_time) || 0,
            exp: lease.exp,
          })
        }
        // else: Lease valid -> ACTIVE neighbor -> Do not kill
      }

      // 3. Terminate zombies
      if (candidates.length > 0) {
        // Deterministic: kill the "stale-est" first.
        // Primary: oldest expiration (smallest exp) -> longest expired.
        // Secondary: largest idle_time. Tie-breaker: smallest pid.
        candidates.sort((a, b) => (a.exp - b.exp) || (b.idle_time - a.idle_time) || (a.pid - b.pid))

        // At least one connection is killed per run, even if the limit is unset or invalid.
        const limit = Math.max(1, Number(strategy.maxIdleConnectionsToKill) || 1)
        const selected = candidates.slice(0, limit)
        const pidsToKill = selected.map(x => x.pid)

        // Log a compact reason line for debugging.
        const meta = selected.map(x => `pid=${x.pid},idle=${Math.round(x.idle_time)}s,expDelta=${Math.round((Date.now() - x.exp) / 1000)}s`).join(' | ')
        logger(`Reaper[pid=${process.pid}]: Killing ${pidsToKill.length} zombies: ${meta}`)
        // Cast to int[] to be safe
        await client.query(`SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE pid = ANY($1::int[])`, [pidsToKill])
        return { locked: true, killed: pidsToKill.length }
      }

      return { locked: true, killed: 0 }

    } catch (err) {
      logger(`Reaper[pid=${process.pid}] failed:`, err && (err.stack || err.message || err))
      if (strategy && strategy.reaperErrorMode === 'throw') throw err
      return { locked: false, killed: 0, error: err }
    } finally {
      // 4. Release Lock (the lock is session-level, so it must be released explicitly)
      if (locked) {
        try {
          await client.query(
            `SELECT pg_advisory_unlock($1::int, hashtext($2))`,
            [Reaper.LOCK_NS, serviceName]
          )
        } catch (_) { /* ignore unlock error */ }
      }
    }
  }
}
|
|
115
|
+
|
|
116
|
+
module.exports = Reaper
|
|
117
|
+
|
package/lib/retry.js
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
 * Retry Strategy & Error Classification
 * Implements "Decorrelated Jitter" and safe error analysis.
 */

class RetryStrategy {
  /**
   * Determines if an error is a "dead connection" error that warrants a retry/reconnect.
   * Classification order: Node.js transport codes, then SQLSTATE, then a small
   * set of message substrings as a last resort.
   * @param {Error} err - May be null/undefined, in which case the result is false.
   * @returns {boolean}
   */
  static isRetryable(err) {
    const code = err && err.code
    // Only inspect the message when it really is a string.
    const msg = (err && typeof err.message === 'string') ? err.message : ''
    const sqlstate = (err && (err.code || err.sqlstate)) || null

    // 1) Node.js socket / transport codes (not SQLSTATE)
    if (code === 'ECONNRESET' || code === 'EPIPE' || code === 'ETIMEDOUT' || code === 'ECONNREFUSED') return true
    if (code === 'ENETUNREACH' || code === 'EHOSTUNREACH' || code === 'EAI_AGAIN') return true
    if (code === 'ECONNABORTED' || code === 'EADDRINUSE') return true

    // 2) SQLSTATE-first (stable)
    // Class 08 — connection exception
    if (typeof sqlstate === 'string' && sqlstate.length === 5 && sqlstate.startsWith('08')) return true

    // Admin / crash / cannot continue
    if (sqlstate === '57P01' || sqlstate === '57P02' || sqlstate === '57P03') return true

    // Too many connections (can be transient under spiky concurrency)
    if (sqlstate === '53300') return true

    // Serialization failure (40001) and deadlock (40P01) are deliberately NOT
    // retried: retrying is only safe for idempotent queries.

    // 3) LAST-RESORT message fallbacks (keep minimal; remove over time)
    if (msg.includes('Connection terminated unexpectedly')) return true
    if (msg.includes('sorry, too many clients already')) return true

    return false
  }

  /**
   * Calculates backoff delay using "Decorrelated Jitter".
   * sleep = min(cap, random(base, sleep * 3))
   * The upper bound of the random range is never allowed to fall below
   * `baseMs`, so the jittered value is always at least `baseMs` (before capping).
   * @param {number} baseMs - Minimum wait
   * @param {number} capMs - Maximum wait
   * @param {number} previousDelay - The delay used in the previous attempt (or 0)
   * @returns {number} ms to sleep
   */
  static getBackoff(baseMs, capMs, previousDelay) {
    const prev = previousDelay || baseMs
    // Guard against an inverted range when the previous delay was small.
    const upper = Math.max(baseMs, prev * 3)
    const jittered = Math.floor(Math.random() * (upper - baseMs + 1)) + baseMs
    return Math.min(capMs, jittered)
  }
}
|
|
57
|
+
|
|
58
|
+
module.exports = RetryStrategy
|
|
59
|
+
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pg-aequor",
|
|
3
|
+
"version": "0.1.3",
|
|
4
|
+
"description": "Crash-safe, coordination-aware PostgreSQL client for Serverless environments",
|
|
5
|
+
"main": "./index.js",
|
|
6
|
+
"types": "index.d.ts",
|
|
7
|
+
"author": "dimaq12",
|
|
8
|
+
"license": "MIT",
|
|
9
|
+
"repository": {
|
|
10
|
+
"type": "git",
|
|
11
|
+
"url": "git+ssh://git@github.com/dimaq12/pg-aequor.git"
|
|
12
|
+
},
|
|
13
|
+
"bugs": {
|
|
14
|
+
"url": "https://github.com/dimaq12/pg-aequor/issues"
|
|
15
|
+
},
|
|
16
|
+
"homepage": "https://github.com/dimaq12/pg-aequor#readme",
|
|
17
|
+
"publishConfig": {
|
|
18
|
+
"registry": "https://registry.npmjs.org/"
|
|
19
|
+
},
|
|
20
|
+
"sideEffects": false,
|
|
21
|
+
"engines": {
|
|
22
|
+
"node": ">=18"
|
|
23
|
+
},
|
|
24
|
+
"exports": {
|
|
25
|
+
".": {
|
|
26
|
+
"types": "./index.d.ts",
|
|
27
|
+
"require": "./index.js"
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
"files": [
|
|
31
|
+
"index.js",
|
|
32
|
+
"index.d.ts",
|
|
33
|
+
"lib/",
|
|
34
|
+
"README.md",
|
|
35
|
+
"LICENSE"
|
|
36
|
+
],
|
|
37
|
+
"scripts": {
|
|
38
|
+
"test": "node --test"
|
|
39
|
+
},
|
|
40
|
+
"peerDependencies": {
|
|
41
|
+
"pg": "^8.11.0"
|
|
42
|
+
},
|
|
43
|
+
"devDependencies": {
|
|
44
|
+
"pg": "^8.11.0"
|
|
45
|
+
}
|
|
46
|
+
}
|