bunsane 0.2.10 → 0.3.0

package/CHANGELOG.md ADDED
@@ -0,0 +1,266 @@
+ # Changelog
+
+ All notable changes to bunsane are documented here.
+
+ ## Unreleased
+
+ ### Fixed (PR E — outbox, cache, query hardening)
+
+ - **OutboxWorker publishes to Redis concurrently and marks rows in bulk.**
+   Previously `processBatch` awaited each `publisher.xadd` serially inside
+   the PG transaction, holding `FOR UPDATE` row locks for up to N ×
+   `commandTimeout` when Redis was slow. It now publishes the whole batch
+   in parallel via `Promise.allSettled`, so the worst-case lock hold drops
+   to a single xadd timeout, then marks the published rows with a single
+   bulk `UPDATE … WHERE id IN …` instead of N serial updates (sketched
+   below). Tickets H-DB-1 (partial — full fix needs a claim-via-column
+   redesign so Redis latency is outside the PG transaction entirely) and
+   H-DB-3.
+
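+ A minimal sketch of the new `processBatch` shape; `OutboxRow`, the
+ `publisher` parameter, and the `sql` tag are illustrative stand-ins for
+ bunsane's internals, not its real API:
+
+ ```ts
+ interface OutboxRow { id: string; stream: string; payload: string }
+
+ async function processBatch(
+   rows: OutboxRow[],
+   publisher: { xadd(stream: string, payload: string): Promise<string> },
+   sql: (strings: TemplateStringsArray, ...values: unknown[]) => Promise<unknown>,
+ ): Promise<void> {
+   // Publish concurrently: worst-case FOR UPDATE lock hold is one xadd
+   // timeout, not N of them.
+   const results = await Promise.allSettled(
+     rows.map((row) => publisher.xadd(row.stream, row.payload)),
+   );
+   // Mark only the rows that actually reached Redis, in one statement
+   // (ANY(array) is the parameterized equivalent of IN (...)).
+   const published = rows
+     .filter((_, i) => results[i].status === "fulfilled")
+     .map((row) => row.id);
+   if (published.length > 0) {
+     await sql`UPDATE outbox SET published_at = now() WHERE id = ANY(${published})`;
+   }
+ }
+ ```
+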
+ - **`Entity.save` pre-flights `ComponentRegistry.getReadyPromise` outside
+   the transaction.** Previously `doSave` awaited registry readiness from
+   inside `executeSave`, so a slow DDL (partition creation) would keep a PG
+   transaction idle. A pre-flight loop in `save()` awaits readiness before
+   opening the transaction; `doSave` now only asserts readiness and throws
+   if a caller bypassed `save()`. Ticket H-DB-4.
+
+ - **`Entity.set` / `Entity.remove` fire-and-forget cache ops are now
+   drainable on shutdown.** Previously `setImmediate(async () => { … })`
+   was untracked, so SIGTERM could abandon in-flight cache writes.
+   `Entity.pendingCacheOps` is a drainable `Set<Promise<void>>`, and
+   `Entity.drainPendingCacheOps` is awaited by `App.shutdown` between HTTP
+   drain and cache disconnect (see the sketch below). Ticket H-CACHE-1.
+
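+ The tracked fire-and-forget pattern in outline; the `trackCacheOp`
+ helper is hypothetical, while `pendingCacheOps` and
+ `drainPendingCacheOps` are the names the entry gives:
+
+ ```ts
+ const pendingCacheOps = new Set<Promise<void>>();
+
+ function trackCacheOp(op: () => Promise<void>): void {
+   const p = op()
+     .catch((err) => console.error("cache op failed", err))
+     .finally(() => pendingCacheOps.delete(p)); // self-remove once settled
+   pendingCacheOps.add(p);
+ }
+
+ // Awaited by App.shutdown between HTTP drain and cache disconnect.
+ async function drainPendingCacheOps(): Promise<void> {
+   await Promise.allSettled([...pendingCacheOps]);
+ }
+ ```
+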
+ - **`CacheManager.shutdownProvider` descends into `MultiLevelCache` layers.**
+   Previously it only checked the top-level provider for `disconnect` /
+   `stopCleanup` methods, so a MultiLevelCache deployment left its inner
+   MemoryCache cleanup timer and Redis connection alive forever. Now
+   dispatches to `getL1Cache()` and `getL2Cache()` when available. Ticket
+   H-CACHE-2.
+
+ - **`setComponentWriteThrough` preserves `createdAt` across updates.**
+   Previously every write-through stamped `createdAt: new Date()`,
+   corrupting the timeline across consecutive updates. Now peeks the
+   existing cache entry and preserves its `createdAt` when present; only
+   `updatedAt` is stamped fresh (sketched below). The full fix
+   (BaseComponent tracking timestamps natively) is deferred. Ticket
+   H-CACHE-3.
+
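+ In outline, assuming a provider with `get`/`set` and a timestamped
+ envelope (the `CachedComponent` shape and parameter names are
+ illustrative):
+
+ ```ts
+ interface CachedComponent<T> { data: T; createdAt: Date; updatedAt: Date }
+
+ type Cache<T> = {
+   get(key: string): Promise<CachedComponent<T> | null>;
+   set(key: string, value: CachedComponent<T>): Promise<void>;
+ };
+
+ async function setComponentWriteThrough<T>(cache: Cache<T>, key: string, data: T) {
+   // Peek the existing entry so consecutive updates keep the original createdAt.
+   const existing = await cache.get(key);
+   await cache.set(key, {
+     data,
+     createdAt: existing?.createdAt ?? new Date(),
+     updatedAt: new Date(), // only updatedAt is stamped fresh
+   });
+ }
+ ```
+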
+ - **Default query limit applied when `.take()` is omitted.** `Query.exec()`
+   now applies a framework-level default LIMIT
+   (env `BUNSANE_DEFAULT_QUERY_LIMIT`, default 10000, 0 to disable) and
+   emits a warning so runaway queries are visible. Ticket H-QUERY-1.
+
+ - **OrNode debug `console.log` traces removed from the production path.**
+   Ticket H-QUERY-2.
+
+ - **`unregisterDecoratedHooks` now actually unregisters.** Previously it
+   was a no-op stub that warned to stderr. Hook IDs returned from each
+   registration are stored in a `WeakMap<instance, string[]>` and passed
+   to `EntityHookManager.removeHook` on tear-down (sketched below),
+   enabling per-instance cleanup in tests and service destruction.
+   Ticket H-HOOK-3.
+
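+ The bookkeeping in outline; `registerDecoratedHooks` and the `removeHook`
+ parameter are illustrative stand-ins for the decorator plumbing and
+ `EntityHookManager.removeHook`:
+
+ ```ts
+ const hookIdsByInstance = new WeakMap<object, string[]>();
+
+ function registerDecoratedHooks(instance: object, register: () => string): void {
+   const ids = hookIdsByInstance.get(instance) ?? [];
+   ids.push(register()); // each registration returns a hook ID
+   hookIdsByInstance.set(instance, ids);
+ }
+
+ function unregisterDecoratedHooks(instance: object, removeHook: (id: string) => void): void {
+   for (const id of hookIdsByInstance.get(instance) ?? []) removeHook(id);
+   hookIdsByInstance.delete(instance);
+ }
+ ```
+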
+ ### Fixed (PR D — scheduler + hook concurrency hardening)
+
+ - **Entity.add / Entity.set / Entity.remove hook calls no longer leak
+   unhandled rejections.** `EntityHookManager.executeHooks` is async, but
+   the three mutating methods previously invoked it without `await`, and
+   the surrounding `try/catch` captured only synchronous throws, so an
+   `async` hook that rejected escaped as an unhandled rejection. `set`
+   now `await`s consistently; `add` and `remove` remain synchronous (to
+   preserve their fluent-chain / boolean signatures) and attach a
+   `.catch` to the returned promise so rejections are logged rather than
+   escaping (see the sketch below). Ticket H-HOOK-1.
+
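+ The split in outline (the class and method bodies are illustrative, not
+ bunsane's real signatures):
+
+ ```ts
+ class EntityLike {
+   // `set` awaits, so a rejecting hook surfaces to the caller.
+   async set(executeHooks: () => Promise<void>): Promise<this> {
+     await executeHooks();
+     return this;
+   }
+
+   // `add` keeps its synchronous fluent signature; the hook promise gets
+   // a detached .catch so rejections are logged instead of escaping.
+   add(executeHooks: () => Promise<void>): this {
+     executeHooks().catch((err) => console.error("entity hook failed", err));
+     return this;
+   }
+ }
+ ```
+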
+ - **Hook timeout timers no longer leak and late rejections no longer
+   escape.** All four timeout race sites in `EntityHookManager` (sync path,
+   async-parallel path, sync-batch path, async-batch path) now capture the
+   `setTimeout` handle and `clearTimeout` on normal completion, and
+   attach a detached `.catch` to the hook callback promise so a rejection
+   that arrives after the race has been decided is logged rather than
+   emitted as an unhandled rejection (one site is sketched below).
+   Tickets H-HOOK-2 / H-MEM-2.
+
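+ One race site, sketched; `runHookWithTimeout` is a hypothetical name for
+ the shared shape of all four sites:
+
+ ```ts
+ async function runHookWithTimeout<T>(hook: () => Promise<T>, timeoutMs: number): Promise<T> {
+   let timer: ReturnType<typeof setTimeout> | undefined;
+   const hookPromise = hook();
+   // Detached handler: a rejection arriving after the race is decided is
+   // logged instead of becoming an unhandled rejection.
+   hookPromise.catch((err) => console.error("hook rejected", err));
+   try {
+     return await Promise.race([
+       hookPromise,
+       new Promise<never>((_, reject) => {
+         timer = setTimeout(() => reject(new Error(`hook timed out after ${timeoutMs}ms`)), timeoutMs);
+       }),
+     ]);
+   } finally {
+     clearTimeout(timer); // the timer handle never leaks on the normal path
+   }
+ }
+ ```
+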
+ - **SchedulerManager task interval no longer burns lock attempts for a
+   still-running task.** `doExecuteTask` now skips early if
+   `taskInfo.isRunning` is true, avoiding a wasted PG advisory-lock
+   round-trip every tick when execution outlasts the interval. Increments
+   `skippedExecutions`. Ticket H-SCHED-1.
+
+ - **Scheduled-task retry timer is now tracked and cleared on stop.**
+   `handleTaskFailure` previously scheduled retries with a bare
+   `setTimeout` whose handle was never stored, so `stop()` could not
+   clear it and the retry fired post-shutdown against a closed DB pool.
+   The retry handle is now registered in `intervals` under
+   `<taskId>:retry:<n>` and self-deletes once fired. The retry callback
+   also checks `isRunning` before executing. Tickets H-SCHED-2 /
+   H-SCHED-3.
+
+ - **DistributedLock re-entry now reports overlap instead of success.**
+   `tryAcquire` previously returned `acquired: true` when the instance
+   already held the lock for `taskId`, which meant retry + interval could
+   both enter `executeTask` concurrently. Now returns
+   `acquired: false` so the second caller skips — defense-in-depth on
+   top of the caller-side `isRunning` guard. Ticket H-SCHED-4.
+
+ - **`executeWithTimeout` no longer leaks late rejections.** A scheduled
+   task that rejects after its wrapper timed out previously produced an
+   unhandled rejection (the wrapper was already settled). The wrapper now
+   uses a `settled` flag and logs late rejections instead of propagating.
+   Ticket H-SCHED-5.
+
+ - **DistributedLock `reservePromise` nulls on reject.** Previously, if
+   `db.reserve()` rejected (pool exhausted, shutdown mid-call), the
+   rejected promise was cached in `reservePromise` forever and every
+   subsequent `ensureReserved` received the same rejection. Now the
+   promise is nulled in the reject handler so future callers retry a
+   fresh reserve (see the sketch below). Ticket H-DB-2.
+
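+ The memoized-reserve pattern in outline; the `reserve` parameter stands
+ in for `db.reserve()`:
+
+ ```ts
+ let reservePromise: Promise<void> | null = null;
+
+ function ensureReserved(reserve: () => Promise<void>): Promise<void> {
+   if (!reservePromise) {
+     reservePromise = reserve().catch((err) => {
+       reservePromise = null; // don't cache the rejection; retry next call
+       throw err;
+     });
+   }
+   return reservePromise;
+ }
+ ```
+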
+ - **`App.waitForAppReady` no longer polls indefinitely.** Replaced the
+   100ms `setInterval` with a one-shot phase listener and default 60s
+   timeout. A boot failure that never reaches `APPLICATION_READY` now
+   surfaces as a rejection instead of leaking a timer for the process
+   lifetime (sketched below). Ticket H-MEM-1.
+
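+ The listener-plus-timeout shape, sketched against a plain `EventTarget`
+ that dispatches a `"phase"` `CustomEvent` (bunsane's actual lifecycle
+ API may differ):
+
+ ```ts
+ function waitForPhase(lifecycle: EventTarget, phase: string, timeoutMs = 60_000): Promise<void> {
+   return new Promise((resolve, reject) => {
+     const timer = setTimeout(() => {
+       lifecycle.removeEventListener("phase", onPhase);
+       reject(new Error(`timed out after ${timeoutMs}ms waiting for phase ${phase}`));
+     }, timeoutMs);
+     function onPhase(e: Event): void {
+       if ((e as CustomEvent<string>).detail !== phase) return;
+       clearTimeout(timer);
+       lifecycle.removeEventListener("phase", onPhase);
+       resolve();
+     }
+     lifecycle.addEventListener("phase", onPhase);
+   });
+ }
+ ```
+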
+ ### Security
+
+ - **SQL injection hardening across the Query layer.** Identifiers (component
+   table names, JSON field paths, ORDER BY properties, text-search language)
+   interpolated into SQL via `db.unsafe(...)` or template literals are now
+   validated against strict allow-lists before use. Added `query/SqlIdentifier.ts`
+   with `assertIdentifier`, `assertComponentTableName`, `assertFieldPath`, and
+   `assertTsLanguage`, applied at `Query.estimatedCount`, `Query.doAggregate`,
+   `ComponentInclusionNode` sort expressions (3 sites), and
+   `FullTextSearchBuilder` (3 sites + factory). Throws `InvalidIdentifierError`
+   on unsafe input (sketched below). Ticket C08.
+
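+ Allow-list validation in the spirit of `query/SqlIdentifier.ts`; the
+ exact patterns bunsane ships are not reproduced here:
+
+ ```ts
+ class InvalidIdentifierError extends Error {}
+
+ export function assertIdentifier(name: string): string {
+   // Plain unquoted PostgreSQL identifier: letters, digits, underscore.
+   if (!/^[A-Za-z_][A-Za-z0-9_]{0,62}$/.test(name)) {
+     throw new InvalidIdentifierError(`unsafe SQL identifier: ${JSON.stringify(name)}`);
+   }
+   return name;
+ }
+
+ export function assertFieldPath(path: string): string {
+   // Dotted JSON paths: every segment must itself be a safe identifier.
+   path.split(".").forEach(assertIdentifier);
+   return path;
+ }
+ ```
+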
+ - **GraphQL depth limit hard minimum enforced.** Previously `maxDepth: 0`
+   or `undefined` silently disabled the depth-limit guard, allowing CPU/memory
+   DoS via deeply nested queries. Now `createYogaInstance` enforces a hard
+   floor of 15 regardless of input; callers can raise it but cannot disable
+   it. Ticket C06.
+
+ - **Request AbortSignal now propagates into Yoga and REST handlers.** The
+   30s wall-clock timer previously only logged a warning; the signal was
+   never forwarded downstream. Request timeouts (and client disconnects) now
+   cancel in-flight resolvers, DB queries, and external calls. Uses
+   `AbortSignal.any` (Bun/Node 20+) with a manual combiner fallback
+   (sketched below). Ticket C05.
+
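+ A combiner sketch, assuming `AbortSignal.any` where available and
+ falling back to a manual implementation otherwise (not bunsane's exact
+ code):
+
+ ```ts
+ function combineSignals(signals: AbortSignal[]): AbortSignal {
+   if (typeof AbortSignal.any === "function") return AbortSignal.any(signals);
+   // Manual fallback: abort the combined controller when any input aborts.
+   const controller = new AbortController();
+   for (const signal of signals) {
+     if (signal.aborted) {
+       controller.abort(signal.reason);
+       break;
+     }
+     signal.addEventListener("abort", () => controller.abort(signal.reason), { once: true });
+   }
+   return controller.signal;
+ }
+
+ // e.g. combineSignals([request.signal, AbortSignal.timeout(30_000)])
+ ```
+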
+ ### Fixed
+
+ - **Sync lifecycle hooks now awaited, preventing unhandled rejections.**
+   `EntityHookManager.executeHooks` previously discarded the return value of
+   `hook.callback(event)` on the sync path when no timeout was configured.
+   A hook mistakenly declared `async: false` but implemented as an
+   `async function` would surface as unhandled rejections, crashing the
+   process under strict mode. The sync path now awaits consistently.
+   Ticket C13.
+
+ - **`createRequestContextPlugin` auto-applied by default.** Previously
+   opt-in (and the export was commented out of the root barrel), so any app
+   using `@BelongsTo` / `@HasMany` relations silently fell into N+1 query
+   patterns. `App` now prepends the plugin to Yoga plugins by default. Opt
+   out via `App.disableRequestContextPlugin()` if supplying your own
+   DataLoader layer. Ticket C07.
+
+ - **Redis cache no longer causes unbounded heap growth when Redis is
+   unreachable.** `enableOfflineQueue` now defaults to `false`, so commands
+   fail fast and the caller's `try/catch` treats the outage as a cache miss
+   instead of queuing commands indefinitely. Can be overridden per-deployment
+   via `REDIS_ENABLE_OFFLINE_QUEUE=true` when you accept the memory risk.
+   Ticket C02.
+
+ - **Redis reconnect storm capped.** `retryStrategy` now returns `null`
+   after `maxReconnectAttempts` (default 20) so a permanently unreachable
+   Redis cannot spin forever, saturating logs and keeping the ioredis
+   state machine busy. Configurable via `REDIS_MAX_RECONNECT_ATTEMPTS`.
+   The default inter-attempt delay is also raised from `times * 50` to
+   `times * 200` (capped at 2s) for a gentler back-off (see the sketch
+   below). Ticket C03.
+
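+ The capped strategy in outline (ioredis stops reconnecting once
+ `retryStrategy` returns `null`); the values mirror the new defaults:
+
+ ```ts
+ const maxReconnectAttempts = 20; // REDIS_MAX_RECONNECT_ATTEMPTS
+
+ const retryStrategy = (times: number): number | null =>
+   times > maxReconnectAttempts
+     ? null                          // give up; no reconnect storm
+     : Math.min(times * 200, 2000);  // gentler back-off, capped at 2s
+ ```
+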
+ - **`App.init()` now awaits `CacheManager.initialize()`.** Previously only
+   `getInstance()` was called, so pub/sub cross-instance invalidation was
+   never set up and any app-supplied cache config was silently ignored.
+   Added `App.setCacheConfig(config)` so callers can supply a partial
+   config that is merged with `defaultCacheConfig` and passed to
+   `initialize()`. Ticket C04.
+
+ - **`Entity.doDelete` no longer leaks `idle in transaction` backends on timeout.**
+   Uses the same AbortController + in-flight query cancellation pattern as
+   `Entity.save`. Post-commit cache invalidation and lifecycle hooks moved
+   out of the delete budget via `queueMicrotask`. Ticket C01.
+
+ - **`SYSTEM_READY` phase errors are no longer swallowed silently.**
+   Previously a schema-build, REST-registration, or scheduler-init failure was
+   caught and only logged, leaving the app stuck at `isReady=false` with
+   `/health/ready` returning 503 forever and k8s rollouts blocked indefinitely.
+   Now marks the app unready, logs at fatal level, and exits so the orchestrator
+   can restart. In tests, rethrows instead of exiting. Ticket C09.
+
+ - **HTTP server drain is now awaited before tearing down dependencies.**
+   `server.stop(false)` previously initiated drain but was not awaited, so the
+   scheduler / cache / DB pool closed while requests were still executing,
+   causing cascade failures in the final seconds of shutdown. Shutdown now
+   polls pending requests (bounded by `shutdownGracePeriod`) before force-close,
+   then stops each subsystem in order. Ticket C10.
+
+ - **ApplicationLifecycle phase listeners are now captured and removed on
+   shutdown.** Five singletons (`App`, `EntityManager`, `EntityHookManager`,
+   `SchedulerManager`, `ServiceRegistry`) previously registered listeners
+   without storing refs, so each `init()` call (common in tests) stacked
+   listeners on the singleton `EventTarget`, permanently leaking memory and
+   firing duplicate phase handlers. Each now captures the listener reference
+   and exposes a `dispose()` / `disposeLifecycleIntegration()` method called
+   from `App.shutdown()`; `init()` paths are also idempotent (see the
+   sketch below). Ticket C11.
+
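+ The capture-and-dispose pattern each singleton now follows, sketched
+ against a generic `EventTarget` (the class and method names other than
+ `dispose` are illustrative):
+
+ ```ts
+ class LifecycleIntegration {
+   private listener?: (e: Event) => void;
+
+   init(lifecycle: EventTarget): void {
+     if (this.listener) return; // idempotent: repeated init() cannot stack handlers
+     this.listener = (e) => this.onPhase(e);
+     lifecycle.addEventListener("phase", this.listener);
+   }
+
+   dispose(lifecycle: EventTarget): void {
+     if (!this.listener) return;
+     lifecycle.removeEventListener("phase", this.listener);
+     this.listener = undefined;
+   }
+
+   private onPhase(event: Event): void {
+     // react to lifecycle phase changes
+   }
+ }
+ ```
+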
+ - **`ApplicationLifecycle.waitForPhase` replaced its 100ms busy-loop with a
+   listener-based Promise.** Previously a `while (currentPhase !== phase)`
+   loop polled every 100ms; if the target phase was never reached (see the
+   SYSTEM_READY fix above) every caller hung forever. Now attaches a one-shot
+   phase listener + `timeoutMs` (default 30s) and rejects with a descriptive
+   error on timeout. Ticket C12.
+
+ - **`SchedulerManager.stop()` now awaits in-flight tasks before returning.**
+   Previously it cleared timers and returned immediately; any task mid-execution
+   continued running against a DB pool that was about to close in
+   `App.shutdown()`. Now tracks each `executeTask` promise in a Set, and
+   `stop(drainTimeoutMs = 15_000)` awaits `Promise.allSettled` bounded by the
+   timeout (sketched below). The scheduler listener is also disposed.
+   Ticket C14.
+
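+ The drain shape in outline; `track` is a hypothetical helper around the
+ real `executeTask` call:
+
+ ```ts
+ const inFlight = new Set<Promise<void>>();
+
+ function track(executeTask: () => Promise<void>): void {
+   const p = executeTask()
+     .catch((err) => console.error("scheduled task failed", err))
+     .finally(() => inFlight.delete(p));
+   inFlight.add(p);
+ }
+
+ async function stop(drainTimeoutMs = 15_000): Promise<void> {
+   // Timers already cleared here (not shown); wait for in-flight tasks,
+   // but never longer than the drain budget.
+   let timer: ReturnType<typeof setTimeout> | undefined;
+   await Promise.race([
+     Promise.allSettled([...inFlight]),
+     new Promise<void>((resolve) => { timer = setTimeout(resolve, drainTimeoutMs); }),
+   ]);
+   clearTimeout(timer);
+ }
+ ```
+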
+ - **Process-level error handlers (`unhandledRejection`, `uncaughtException`)
+   and signal handlers (`SIGTERM`, `SIGINT`) now registered at the top of
+   `App.init()` instead of only in `start()`.** Previously any rejection
+   during boot (DB prep, component registration, cache init) was silently
+   discarded by the runtime. Signal handlers now use `process.once` so a
+   double SIGTERM cannot fire two concurrent shutdown paths. Ticket C15.
+
+ - **`Entity.save` no longer leaks `idle in transaction` backends on timeout.**
+   The previous implementation wrapped `db.transaction(...)` in a JS `setTimeout`
+   and rejected the outer promise when the timer fired, but the underlying Bun
+   SQL transaction continued on the server with no `COMMIT`/`ROLLBACK` ever
+   sent. Under pgbouncer `transaction` pool mode this pinned backend sessions
+   permanently, exhausting the pool and cascading into further save timeouts.
+
+   `Entity.save` now threads an `AbortSignal` into `doSave`. When the wall-clock
+   timer fires the signal is aborted, the in-flight `SQL.Query` is cancelled
+   via `.cancel()`, and the cancellation propagates out of the transaction
+   callback, triggering Bun SQL's automatic `ROLLBACK` and releasing the
+   pooled connection (see the sketch below). The `DB_STATEMENT_TIMEOUT` env
+   var (already supported in `database/index.ts`) acts as a PostgreSQL-side
+   backstop.
+
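+ The outer shape of the timeout path, sketched; `runSaveTransaction`
+ stands in for `doSave`, which checks the signal and cancels its
+ in-flight query on abort:
+
+ ```ts
+ async function saveWithTimeout(
+   runSaveTransaction: (signal: AbortSignal) => Promise<void>,
+   timeoutMs: number,
+ ): Promise<void> {
+   const controller = new AbortController();
+   const timer = setTimeout(
+     () => controller.abort(new Error(`save exceeded ${timeoutMs}ms`)),
+     timeoutMs,
+   );
+   try {
+     // On abort, the in-flight query is cancelled inside the transaction
+     // callback; the error propagates out, the transaction rolls back,
+     // and the pooled connection is released.
+     await runSaveTransaction(controller.signal);
+   } finally {
+     clearTimeout(timer);
+   }
+ }
+ ```
+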
+   See `docs` / handoff dated 2026-04-18 for incident details.
+
+ ### Changed
+
+ - **Post-commit side effects (cache invalidation, lifecycle hooks) no longer
+   block `Entity.save`.** `handleCacheAfterSave` and `EntityHookManager.executeHooks`
+   are now queued via `queueMicrotask` after the transaction commits. Save
+   resolves as soon as the DB write is durable; cache or hook latency cannot
+   consume the save budget or surface as save failures. Errors are logged
+   and swallowed (matching prior error-handling behavior); see the sketch
+   below.
+
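+ The queueing shape, sketched; `afterCommit` is a hypothetical wrapper
+ around the two real side effects named above:
+
+ ```ts
+ function afterCommit(sideEffect: () => Promise<void>): void {
+   queueMicrotask(() => {
+     // Errors are logged and swallowed; they can no longer surface as
+     // save failures or consume the save budget.
+     sideEffect().catch((err) => console.error("post-commit side effect failed", err));
+   });
+ }
+
+ // e.g. afterCommit(() => handleCacheAfterSave(entity));
+ //      afterCommit(() => hookManager.executeHooks("afterSave", event));
+ ```
+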
+ ### Added
+
+ - **`DB_SAVE_PROFILE=true` env var** — when set, `Entity.save` logs per-phase
+   timings (`db`, `cache`, `hooks`, `total`) at info level. Off by default.
+
+ - **Integration tests** in `tests/integration/entity/Entity.saveTimeout.test.ts`
+   covering: aborted save leaves no partial rows, pool stays healthy after
+   repeated aborts, backwards-compatible signal-less `doSave`, non-blocking
+   post-commit work.
@@ -14,9 +14,13 @@ export interface CacheConfig {
     password?: string;
     db?: number;
     keyPrefix?: string;
-    retryStrategy?: (times: number) => number | void;
+    retryStrategy?: (times: number) => number | null | void;
     connectTimeout?: number;
     commandTimeout?: number;
+    /** Give up after this many reconnect attempts. Default 20. */
+    maxReconnectAttempts?: number;
+    /** Queue commands while offline. Default false (fail-fast). */
+    enableOfflineQueue?: boolean;
   };
 
   entity?: {
@@ -53,8 +57,14 @@ export const defaultCacheConfig: CacheConfig = {
     password: process.env.REDIS_PASSWORD,
     db: parseInt(process.env.REDIS_DB || '0'),
     keyPrefix: process.env.REDIS_KEY_PREFIX || 'bunsane:',
+    // Cap reconnect attempts so a permanently unreachable Redis doesn't
+    // spin forever (C03). Tune via REDIS_MAX_RECONNECT_ATTEMPTS.
+    maxReconnectAttempts: parseInt(process.env.REDIS_MAX_RECONNECT_ATTEMPTS || '20'),
+    // Fail-fast on outage instead of unbounded offline queue (C02).
+    // Override only if caller accepts the memory risk.
+    enableOfflineQueue: process.env.REDIS_ENABLE_OFFLINE_QUEUE === 'true',
     retryStrategy: (times: number) => {
-      const delay = Math.min(times * 50, 2000);
+      const delay = Math.min(times * 200, 2000);
       return delay;
     }
   },