@sylphx/sdk 0.10.4 → 0.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/health/index.d.ts +681 -0
- package/dist/index.d.ts +17 -17
- package/dist/index.mjs +6 -6
- package/dist/index.mjs.map +1 -1
- package/dist/nextjs/index.d.ts +3 -0
- package/dist/nextjs/index.mjs +166 -3
- package/dist/nextjs/index.mjs.map +1 -1
- package/dist/react/index.mjs.map +1 -1
- package/dist/server/index.mjs.map +1 -1
- package/dist/web-analytics.mjs.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,681 @@
|
|
|
1
|
+
import { Effect } from 'effect';
|
|
2
|
+
import { monitorEventLoopDelay } from 'node:perf_hooks';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Type SSOT for `@sylphx/sdk/health` (ADR-111 §4).
|
|
6
|
+
*
|
|
7
|
+
* Pure runtime types — framework-free, no schema library required at the
|
|
8
|
+
* SDK boundary. Apps that want Standard Schema validation (per ADR-084)
|
|
9
|
+
* can wrap the wire-format `HealthSnapshot` in their own schema.
|
|
10
|
+
*
|
|
11
|
+
* The wire shape (`HealthSnapshot`) is the **stable contract** the
|
|
12
|
+
* `sylphx-health-agent` sidecar parses (see ADR-111 §3.2.4 +
|
|
13
|
+
* `apps/health-agent/src/app-poll.ts::parseAppHealthBody`). Do not break it.
|
|
14
|
+
*/
|
|
15
|
+
/**
|
|
16
|
+
* One reading produced by a `Signal.read()`. The aggregator turns each
|
|
17
|
+
* reading into a `factor` in `[0, 1]` then folds them into the score via
|
|
18
|
+
* the configured `ScoringStrategy`.
|
|
19
|
+
*
|
|
20
|
+
* `value` is the raw observation (ms, ratio, count, …) — surfaced verbatim
|
|
21
|
+
* in the wire snapshot so operators can debug from JSON without re-running
|
|
22
|
+
* the signal logic.
|
|
23
|
+
*
|
|
24
|
+
* `healthFactor` is the normalised health in `[0, 1]`:
|
|
25
|
+
* - `1` = fully healthy
|
|
26
|
+
* - `0` = dead (or "we don't know" if `unknown=true` and the policy says so)
|
|
27
|
+
*
|
|
28
|
+
* `unknown=true` means the signal **could not be measured this tick** (e.g.
|
|
29
|
+
* cgroup file unreadable). Scoring strategies treat unknown signals as
|
|
30
|
+
* `factor=1` (don't penalise an app for our own missing data).
|
|
31
|
+
*/
|
|
32
|
+
interface SignalReading {
|
|
33
|
+
readonly value: number | string | boolean;
|
|
34
|
+
readonly healthFactor: number;
|
|
35
|
+
readonly unknown?: boolean;
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* A health signal — one named, weighted measurement the score is built from.
|
|
39
|
+
*
|
|
40
|
+
* Two variants:
|
|
41
|
+
* - `SyncSignal` — `read(): SignalReading` (e.g. event-loop lag)
|
|
42
|
+
* - `AsyncSignal` — `read(): Promise<SignalReading>` (e.g. queue depth
|
|
43
|
+
* fetched over IPC)
|
|
44
|
+
*
|
|
45
|
+
* The `Signal` union accepts both variants (no tag field). The aggregator awaits
|
|
46
|
+
* each reading uniformly via `Promise.resolve(signal.read())`.
|
|
47
|
+
*
|
|
48
|
+
* Implementations are pure — no internal mutation outside the closure-
|
|
49
|
+
* captured monitor state (e.g. `monitorEventLoopDelay()`). Stop /
|
|
50
|
+
* cleanup are exposed via `dispose()` for tests and graceful shutdown.
|
|
51
|
+
*/
|
|
52
|
+
interface SignalBase {
|
|
53
|
+
/** Unique stable identifier; appears in the wire `signals.<name>` map. */
|
|
54
|
+
readonly name: string;
|
|
55
|
+
/**
|
|
56
|
+
* Weight in the weighted-product score. Strictly `> 0` for active
|
|
57
|
+
* signals; `0` is a no-op signal (kept for compatibility with
|
|
58
|
+
* conditional registration but skipped in scoring).
|
|
59
|
+
*/
|
|
60
|
+
readonly weight: number;
|
|
61
|
+
/**
|
|
62
|
+
* Tear down any background work (timers, monitors, file watchers).
|
|
63
|
+
* Called during graceful shutdown and from test cleanup.
|
|
64
|
+
*/
|
|
65
|
+
dispose?(): void;
|
|
66
|
+
}
|
|
67
|
+
interface SyncSignal extends SignalBase {
|
|
68
|
+
read(): SignalReading;
|
|
69
|
+
}
|
|
70
|
+
interface AsyncSignal extends SignalBase {
|
|
71
|
+
read(): Promise<SignalReading>;
|
|
72
|
+
}
|
|
73
|
+
type Signal = SyncSignal | AsyncSignal;
|
|
74
|
+
/**
|
|
75
|
+
* The complete health score + signal breakdown produced by `health.evaluate()`.
|
|
76
|
+
*
|
|
77
|
+
* `score` is the normalised aggregate in `[0, 1]`. Signal payload is
|
|
78
|
+
* verbatim values from each `SignalReading.value` so operators can
|
|
79
|
+
* cross-reference Grafana dashboards with the JSON.
|
|
80
|
+
*/
|
|
81
|
+
interface HealthScore {
|
|
82
|
+
readonly score: number;
|
|
83
|
+
readonly signals: Record<string, number | string | boolean>;
|
|
84
|
+
readonly lastTickAt: string;
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Wire format the sidecar's `app-poll.ts::parseAppHealthBody` consumes.
|
|
88
|
+
*
|
|
89
|
+
* Pinned by ADR-111 §3.2.4 — keep stable. The sidecar tolerates extra
|
|
90
|
+
* keys; do NOT remove existing ones without sequencing the sidecar update
|
|
91
|
+
* first.
|
|
92
|
+
*/
|
|
93
|
+
type HealthSnapshot = HealthScore;
|
|
94
|
+
/**
|
|
95
|
+
* Tagged error type for the Effect API. Promise consumers see the same
|
|
96
|
+
* `message` via `Error.message` — the tag is for `Effect.catchTag`.
|
|
97
|
+
*/
|
|
98
|
+
declare class HealthError extends Error {
|
|
99
|
+
readonly _tag: "HealthError";
|
|
100
|
+
readonly cause?: unknown;
|
|
101
|
+
constructor(message: string, cause?: unknown);
|
|
102
|
+
}
|
|
103
|
+
/**
|
|
104
|
+
* `ScoringStrategy` collapses N readings → one `score` in `[0, 1]`.
|
|
105
|
+
*
|
|
106
|
+
* Default is `weightedProduct` (see `scoring.ts`). Custom strategies can
|
|
107
|
+
* be plugged in via `sylphxHealth({ scoringStrategy: myStrategy })`.
|
|
108
|
+
*/
|
|
109
|
+
type ScoringStrategy = (readings: ReadonlyArray<{
|
|
110
|
+
signal: Signal;
|
|
111
|
+
reading: SignalReading;
|
|
112
|
+
}>) => number;
|
|
113
|
+
interface SylphxHealthOptions {
|
|
114
|
+
/**
|
|
115
|
+
* Signals to register. If omitted, defaults to a single
|
|
116
|
+
* `eventLoopLagSignal({ degradedMs: 5000, deadMs: 30000 })` per
|
|
117
|
+
* ADR-111 §4.6 ("sane out-of-box").
|
|
118
|
+
*/
|
|
119
|
+
readonly signals?: ReadonlyArray<Signal>;
|
|
120
|
+
/**
|
|
121
|
+
* Strategy used to fold readings into a score. Defaults to
|
|
122
|
+
* `weightedProduct` from `scoring.ts`.
|
|
123
|
+
*/
|
|
124
|
+
readonly scoringStrategy?: ScoringStrategy;
|
|
125
|
+
/**
|
|
126
|
+
* Optional injected clock — used by tests for deterministic
|
|
127
|
+
* `lastTickAt` timestamps.
|
|
128
|
+
*/
|
|
129
|
+
readonly now?: () => Date;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Effect TS integration for `@sylphx/sdk/health` (Rule 21 / ADR-058 Amendment).
|
|
134
|
+
*
|
|
135
|
+
* Effect-native services should NEVER call `Effect.runPromise` inside
|
|
136
|
+
* business logic — this module exposes `evaluateEffect` so they can fold
|
|
137
|
+
* the health computation into their own fiber graph.
|
|
138
|
+
*
|
|
139
|
+
* `effect` is an OPTIONAL peer dependency — apps that don't import this
|
|
140
|
+
* module never pull it in (sideEffects: false + tree-shaking guarantees
|
|
141
|
+
* this; verified in tests). The `Effect` type is imported from the
|
|
142
|
+
* peer-dep at runtime; consumers either provide it or never reach this
|
|
143
|
+
* code path.
|
|
144
|
+
*/
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Lift a `() => Promise<HealthScore>` evaluator into an Effect-native value.
|
|
148
|
+
*
|
|
149
|
+
* Errors thrown by the evaluator are tagged `HealthError` so callers can
|
|
150
|
+
* use `Effect.catchTag('HealthError', …)` for typed recovery.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* ```ts
|
|
154
|
+
* import { Effect } from 'effect'
|
|
155
|
+
* import { sylphxHealth } from '@sylphx/sdk/health'
|
|
156
|
+
*
|
|
157
|
+
* const health = sylphxHealth({ signals: [...] })
|
|
158
|
+
*
|
|
159
|
+
* const program = Effect.gen(function* () {
|
|
160
|
+
* const { score, signals } = yield* health.evaluateEffect
|
|
161
|
+
* yield* Effect.log(`health=${score.toFixed(2)} signals=${JSON.stringify(signals)}`)
|
|
162
|
+
* return score
|
|
163
|
+
* })
|
|
164
|
+
*
|
|
165
|
+
* // Only the entry point runs the Effect (Rule 21).
|
|
166
|
+
* const finalScore = await Effect.runPromise(program)
|
|
167
|
+
* ```
|
|
168
|
+
*/
|
|
169
|
+
declare function evaluateEffect(evaluator: () => Promise<HealthScore>): Effect.Effect<HealthScore, HealthError, never>;
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Universal HTTP handler for `@sylphx/sdk/health` (ADR-111 §4.2).
|
|
173
|
+
*
|
|
174
|
+
* Framework-agnostic: returns either a `Web Fetch API` `Response`
|
|
175
|
+
* (Hono / Bun.serve / itty-router / Next.js routes) or, via thin adapters,
|
|
176
|
+
* a Node `(req, res) => void` (Express / Fastify-as-classic-handler).
|
|
177
|
+
*
|
|
178
|
+
* The handler always returns **HTTP 200** unless the SDK itself failed to
|
|
179
|
+
* compute a score (rare; caught and returned as 500 for ops to triage).
|
|
180
|
+
* The three-tier 200 / 503 gate (ADR-111 §4.4) lives in the **sidecar**,
|
|
181
|
+
* not here — the SDK's job ends at exposing the score so the sidecar
|
|
182
|
+
* can decide. This separation is documented in the README ("apps don't
|
|
183
|
+
* need to think about probe semantics").
|
|
184
|
+
*/
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Minimal contract a `health` instance must expose for the handler to
|
|
188
|
+
* call. The full `health` object satisfies this implicitly via
|
|
189
|
+
* `evaluate()` from `./index.ts`.
|
|
190
|
+
*/
|
|
191
|
+
interface HealthEvaluator {
|
|
192
|
+
evaluate(): Promise<HealthScore>;
|
|
193
|
+
}
|
|
194
|
+
/**
|
|
195
|
+
* Build a Web-Fetch-style handler. Suitable for:
|
|
196
|
+
* - Hono: `app.get('/healthz', health.handler())`
|
|
197
|
+
* - Bun.serve: `fetch: health.handler()`
|
|
198
|
+
* - Next.js route.ts: `export const GET = health.handler()`
|
|
199
|
+
* - itty-router / Hattip / standard `(Request) => Response` runtimes
|
|
200
|
+
*/
|
|
201
|
+
declare function createWebHandler(source: HealthEvaluator): (req?: Request) => Promise<Response>;
|
|
202
|
+
/**
|
|
203
|
+
* Build a Node-style `(req, res)` handler — for Express / classic Fastify.
|
|
204
|
+
*
|
|
205
|
+
* Adapter that delegates to `createWebHandler()` so logic stays in one
|
|
206
|
+
* place. Imported lazily by callers that need it; doesn't drag any node
|
|
207
|
+
* types into Web-only code paths.
|
|
208
|
+
*/
|
|
209
|
+
interface NodeIncoming {
|
|
210
|
+
method?: string;
|
|
211
|
+
url?: string;
|
|
212
|
+
}
|
|
213
|
+
interface NodeOutgoing {
|
|
214
|
+
statusCode: number;
|
|
215
|
+
setHeader(name: string, value: string): void;
|
|
216
|
+
end(body?: string): void;
|
|
217
|
+
}
|
|
218
|
+
declare function createNodeHandler(source: HealthEvaluator): (req: NodeIncoming, res: NodeOutgoing) => Promise<void>;
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Unix-socket server for `@sylphx/sdk/health` (ADR-111 §3.2.4 + §4.2).
|
|
222
|
+
*
|
|
223
|
+
* The sidecar polls the app over a Unix socket — `/var/run/sylphx/health.sock`
|
|
224
|
+
* by default. The shared volume (`emptyDir` mount, see ADR-111 §3.2.2) is
|
|
225
|
+
* provisioned by the reconciler when the sidecar is injected. This server
|
|
226
|
+
* binds Bun's native `Bun.serve({ unix: <path> })` and serves the same
|
|
227
|
+
* JSON the HTTP handler does.
|
|
228
|
+
*
|
|
229
|
+
* Graceful shutdown is essential — the socket file lingers on disk if not
|
|
230
|
+
* cleaned, and the sidecar's first reconnect attempt after a redeploy
|
|
231
|
+
* would hit a stale inode. `shutdown()` calls `server.stop()` AND
|
|
232
|
+
* unlinks the socket file. Process signal ownership stays with the app
|
|
233
|
+
* entry point; SDK library code only returns an explicit shutdown handle.
|
|
234
|
+
*/
|
|
235
|
+
|
|
236
|
+
/** Bun-typed minimum for the server we need to control. */
|
|
237
|
+
interface BunServer {
|
|
238
|
+
stop(force?: boolean): void;
|
|
239
|
+
url?: {
|
|
240
|
+
href: string;
|
|
241
|
+
} | null;
|
|
242
|
+
}
|
|
243
|
+
interface UnixSocketServerOptions {
|
|
244
|
+
/** Absolute path to bind. Defaults to `/var/run/sylphx/health.sock`. */
|
|
245
|
+
readonly path?: string;
|
|
246
|
+
/**
|
|
247
|
+
* Override unlink behavior — useful for tests where the test runner
|
|
248
|
+
* already owns the socket cleanup.
|
|
249
|
+
*/
|
|
250
|
+
readonly unlinkOnShutdown?: boolean;
|
|
251
|
+
/**
|
|
252
|
+
* Override the Bun runtime resolver. Used by tests to verify the
|
|
253
|
+
* "no-Bun" error path without leaving a Bun-only test environment.
|
|
254
|
+
* Production code reads `globalThis.Bun` directly.
|
|
255
|
+
*/
|
|
256
|
+
readonly bunRuntime?: {
|
|
257
|
+
serve: (cfg: unknown) => BunServer;
|
|
258
|
+
} | null;
|
|
259
|
+
}
|
|
260
|
+
interface UnixSocketServerHandle {
|
|
261
|
+
readonly path: string;
|
|
262
|
+
readonly server: BunServer;
|
|
263
|
+
/** Stop the server AND unlink the socket file (unless `unlinkOnShutdown=false`). */
|
|
264
|
+
shutdown(): Promise<void>;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
/**
|
|
268
|
+
* Scoring strategies (ADR-111 §4 — three-tier health gate).
|
|
269
|
+
*
|
|
270
|
+
* The default `weightedProduct` strategy is multiplicative — a single bad
|
|
271
|
+
* signal can drag the whole score down (any factor of 0 → score 0). This is
|
|
272
|
+
* the right semantic for **liveness**: one critical subsystem dead means
|
|
273
|
+
* the pod is not ready, regardless of how healthy the rest is.
|
|
274
|
+
*
|
|
275
|
+
* Mathematics:
|
|
276
|
+
* score = ∏ factor_i ^ weight_i (over signals where weight > 0 AND
|
|
277
|
+
* reading is not `unknown`)
|
|
278
|
+
*
|
|
279
|
+
* Where weights are normalised to sum=1 first, so absolute weight values
|
|
280
|
+
* don't matter — only the ratios do. This keeps user expectations sane:
|
|
281
|
+
* `[w=2, w=2]` and `[w=10, w=10]` produce identical scores.
|
|
282
|
+
*
|
|
283
|
+
* Edge cases (deterministic, never throw):
|
|
284
|
+
* - empty input → 1 (perfect health, nothing to penalise)
|
|
285
|
+
* - all weights = 0 → 1 (no active signals)
|
|
286
|
+
* - all readings `unknown` → 1 (we can't see; cardinal-rule fallback
|
|
287
|
+
* lives at the sidecar boundary, not
|
|
288
|
+
* here — ADR-111 §3.2.5)
|
|
289
|
+
* - factor < 0 / NaN → clamped to 0
|
|
290
|
+
* - factor > 1 → clamped to 1
|
|
291
|
+
* - weight < 0 → clamped to 0 (ignored)
|
|
292
|
+
*
|
|
293
|
+
* The clamps make us **safe by construction** — a misconfigured signal
|
|
294
|
+
* cannot push the score outside `[0, 1]`. Tests exercise all branches.
|
|
295
|
+
*/
|
|
296
|
+
|
|
297
|
+
/**
|
|
298
|
+
* Weighted geometric mean — the default scoring strategy.
|
|
299
|
+
*
|
|
300
|
+
* @example
|
|
301
|
+
* weightedProduct([
|
|
302
|
+
* { signal: { name: 'lag', weight: 0.4 }, reading: { healthFactor: 0.4 } },
|
|
303
|
+
* { signal: { name: 'q', weight: 0.6 }, reading: { healthFactor: 1.0 } },
|
|
304
|
+
* ])
|
|
305
|
+
* // → 0.4^0.4 × 1.0^0.6 ≈ 0.693 — ADR-111 §4.5 worked example
|
|
306
|
+
*/
|
|
307
|
+
declare const weightedProduct: ScoringStrategy;
|
|
308
|
+
/**
|
|
309
|
+
* Convenience: build a default scoring strategy.
|
|
310
|
+
*
|
|
311
|
+
* Reserved for future: weighted-min, weighted-mean, etc. For now there's
|
|
312
|
+
* one strategy and `weightedProduct` is the only strategy exported — this keeps the
|
|
313
|
+
* surface narrow until a concrete second use-case appears.
|
|
314
|
+
*/
|
|
315
|
+
declare function defaultScoringStrategy(): ScoringStrategy;
|
|
316
|
+
|
|
317
|
+
/**
|
|
318
|
+
* `errorRateSignal` — request-error-rate over a sliding window (ADR-111 §4.3).
|
|
319
|
+
*
|
|
320
|
+
* Phase B uses an in-memory ring buffer of {timestamp, isError} samples.
|
|
321
|
+
* Phase C will replace this with an OTel collector subscription so the
|
|
322
|
+
* sidecar gets the same data without per-process bookkeeping; for now the
|
|
323
|
+
* in-memory path keeps the SDK self-contained.
|
|
324
|
+
*
|
|
325
|
+
* Mapping observed-rate → healthFactor:
|
|
326
|
+
*
|
|
327
|
+
* healthFactor
|
|
328
|
+
* ▲
|
|
329
|
+
* 1.0 ┤━━━━━━━━━━━━━━━━━━━┓
|
|
330
|
+
* │ ┃ linear interpolation
|
|
331
|
+
* 0.0 ┤ ┗━━━━━━━━━━
|
|
332
|
+
* ┼───────────────────┼────────► error rate (0..1)
|
|
333
|
+
* 0 degradedRate deadRate
|
|
334
|
+
*
|
|
335
|
+
* Default thresholds match ADR-111 §4.3 row 3:
|
|
336
|
+
* degradedRate = 0.05 (5 % errors → degraded)
|
|
337
|
+
* deadRate = 0.50 (50 % errors → dead)
|
|
338
|
+
*
|
|
339
|
+
* `recordSuccess()` / `recordError()` are pushed by the app on each
|
|
340
|
+
* request (or wired into Hono / Express middleware — the SDK provides
|
|
341
|
+
* primitives, not framework integrations). Zero-traffic windows produce
|
|
342
|
+
* `factor=1` (no requests = no errors = healthy by convention).
|
|
343
|
+
*/
|
|
344
|
+
|
|
345
|
+
interface ErrorRateOptions {
|
|
346
|
+
/** Sliding window length. Accepts ms (number) or '5s' / '1m' shorthand. */
|
|
347
|
+
readonly window: number | `${number}s` | `${number}m`;
|
|
348
|
+
/** Rate above this is considered degraded. Default 0.05 (5 %). */
|
|
349
|
+
readonly degradedRate?: number;
|
|
350
|
+
/** Rate at which the signal saturates to dead. Default 0.50 (50 %). */
|
|
351
|
+
readonly deadRate?: number;
|
|
352
|
+
/**
|
|
353
|
+
* Soft minimum sample count: until this many samples land, factor=1
|
|
354
|
+
* regardless of rate (avoids "1 error in 1 request → 100 %" panics).
|
|
355
|
+
* Default 10.
|
|
356
|
+
*/
|
|
357
|
+
readonly minSamples?: number;
|
|
358
|
+
/** Weight in the weighted-product score. Default 0.2 (ADR-111 §4.3). */
|
|
359
|
+
readonly weight?: number;
|
|
360
|
+
/** Optional injected clock for tests. */
|
|
361
|
+
readonly now?: () => number;
|
|
362
|
+
}
|
|
363
|
+
interface ErrorRateSignalHandle extends SyncSignal {
|
|
364
|
+
/** Record a successful request (call from app middleware). */
|
|
365
|
+
recordSuccess(): void;
|
|
366
|
+
/** Record a failed request (call from app middleware). */
|
|
367
|
+
recordError(): void;
|
|
368
|
+
/** Erase the window. Useful for tests. */
|
|
369
|
+
reset(): void;
|
|
370
|
+
}
|
|
371
|
+
declare function errorRateSignal(opts: ErrorRateOptions): ErrorRateSignalHandle;
|
|
372
|
+
|
|
373
|
+
/**
|
|
374
|
+
* `eventLoopLagSignal` — main-thread blocking detector (ADR-111 §4.3).
|
|
375
|
+
*
|
|
376
|
+
* Measures Node/Bun's libuv event-loop delay using `monitorEventLoopDelay()`
|
|
377
|
+
* from `node:perf_hooks`. The monitor is a histogram updated by libuv at a
|
|
378
|
+
* configurable resolution (default 10 ms here — every tick measures lag
|
|
379
|
+
* between expected wake and actual wake). We report the **max** observed
|
|
380
|
+
* since the last `read()`, then `reset()` the histogram — that's the worst-case stall
|
|
381
|
+
* the app suffered in the polling window.
|
|
382
|
+
*
|
|
383
|
+
* Mapping observed-lag-ms → healthFactor ∈ [0, 1]:
|
|
384
|
+
*
|
|
385
|
+
* healthFactor
|
|
386
|
+
* ▲
|
|
387
|
+
* 1.0 ┤━━━━━━━━━┓
|
|
388
|
+
* │ ┃ linear interpolation
|
|
389
|
+
* │ ┃
|
|
390
|
+
* 0.0 ┤ ┗━━━━━━━━━━━━━
|
|
391
|
+
* ┼─────────┼──────────┼──────► observed lag (ms)
|
|
392
|
+
* 0 degradedMs deadMs
|
|
393
|
+
*
|
|
394
|
+
* Below `degradedMs` → factor 1 (healthy). Above `deadMs` → factor 0 (dead).
|
|
395
|
+
* In-between → linear interpolation. ADR-111 §4.3 default thresholds:
|
|
396
|
+
* degradedMs = 5000 (5 s — same order as ADR-110's 10 s probe timeout)
|
|
397
|
+
* deadMs = 30000 (30 s — definitely wedged)
|
|
398
|
+
*
|
|
399
|
+
* Bun-compatibility: Bun ships `monitorEventLoopDelay()` with the same
|
|
400
|
+
* shape as Node 16+. Verified against `bun:1.3` in this repo's CI.
|
|
401
|
+
*/
|
|
402
|
+
|
|
403
|
+
interface EventLoopLagOptions {
|
|
404
|
+
/** Lag below this is fully healthy (factor=1). Default 5000 ms. */
|
|
405
|
+
readonly degradedMs?: number;
|
|
406
|
+
/** Lag above this is fully dead (factor=0). Default 30000 ms. */
|
|
407
|
+
readonly deadMs?: number;
|
|
408
|
+
/**
|
|
409
|
+
* Histogram resolution (ms). Default 10 ms — the smaller, the more
|
|
410
|
+
* accurate the max but the higher the libuv accounting overhead.
|
|
411
|
+
* Node defaults are also 10 ms.
|
|
412
|
+
*/
|
|
413
|
+
readonly resolutionMs?: number;
|
|
414
|
+
/** Weight in the weighted-product score. Default 0.4 (ADR-111 §4.3). */
|
|
415
|
+
readonly weight?: number;
|
|
416
|
+
/**
|
|
417
|
+
* Optional injected monitor (tests). Defaults to a fresh
|
|
418
|
+
* `monitorEventLoopDelay()` instance.
|
|
419
|
+
*/
|
|
420
|
+
readonly monitor?: ReturnType<typeof monitorEventLoopDelay>;
|
|
421
|
+
}
|
|
422
|
+
/**
|
|
423
|
+
* Build an `event-loop-lag` signal. The returned object owns a started
|
|
424
|
+
* histogram monitor; call `dispose()` during shutdown / between tests.
|
|
425
|
+
*/
|
|
426
|
+
declare function eventLoopLagSignal(opts?: EventLoopLagOptions): SyncSignal;
|
|
427
|
+
|
|
428
|
+
/**
|
|
429
|
+
* `memoryPressureSignal` — RSS / cgroup memory.max ratio (ADR-111 §4.3).
|
|
430
|
+
*
|
|
431
|
+
* On Linux containers we read the cgroup v2 memory limit from
|
|
432
|
+
* `/sys/fs/cgroup/memory.max`. Then `pressure = process.memoryUsage().rss /
|
|
433
|
+
* limit`. ADR-111 §4.3 default thresholds:
|
|
434
|
+
* degradedRatio = 0.85 (85 % → degraded)
|
|
435
|
+
* deadRatio = 0.95 (95 % → dead)
|
|
436
|
+
*
|
|
437
|
+
* Mapping observed-pressure → healthFactor:
|
|
438
|
+
*
|
|
439
|
+
* healthFactor
|
|
440
|
+
* ▲
|
|
441
|
+
* 1.0 ┤━━━━━━━━━━━━━━━━━━┓
|
|
442
|
+
* │ ┃
|
|
443
|
+
* 0.0 ┤ ┗━━━━━━━━━
|
|
444
|
+
* ┼──────────────────┼─────────► pressure (0..1)
|
|
445
|
+
* 0 degradedRatio deadRatio
|
|
446
|
+
*
|
|
447
|
+
* Graceful fallback (the cardinal-rule deference):
|
|
448
|
+
* - cgroup file missing → unknown=true (signal ignored)
|
|
449
|
+
* - cgroup file has 'max' → unlimited container, ratio undefined → unknown
|
|
450
|
+
* - file unreadable / parse → unknown=true
|
|
451
|
+
*
|
|
452
|
+
* `unknown=true` makes the scoring strategy ignore the signal — we never
|
|
453
|
+
* pretend to know memory pressure on a host where we can't measure it.
|
|
454
|
+
*
|
|
455
|
+
* cgroup v1 (legacy) lives at `/sys/fs/cgroup/memory/memory.limit_in_bytes`
|
|
456
|
+
* — supported via `cgroupV1Path` for operators on older kernels.
|
|
457
|
+
*/
|
|
458
|
+
|
|
459
|
+
interface MemoryPressureOptions {
|
|
460
|
+
/** Pressure below this is fully healthy (factor=1). Default 0.85. */
|
|
461
|
+
readonly degradedRatio?: number;
|
|
462
|
+
/** Pressure above this is fully dead (factor=0). Default 0.95. */
|
|
463
|
+
readonly deadRatio?: number;
|
|
464
|
+
/**
|
|
465
|
+
* Custom cgroup v2 memory limit path. Default `/sys/fs/cgroup/memory.max`.
|
|
466
|
+
*/
|
|
467
|
+
readonly cgroupV2Path?: string;
|
|
468
|
+
/**
|
|
469
|
+
* Optional cgroup v1 fallback path. Default
|
|
470
|
+
* `/sys/fs/cgroup/memory/memory.limit_in_bytes`. Used when v2 path is
|
|
471
|
+
* unreadable AND `cgroupV1Path` is set or v2 doesn't exist.
|
|
472
|
+
*/
|
|
473
|
+
readonly cgroupV1Path?: string;
|
|
474
|
+
/** Weight in the weighted-product score. Default 0.2 (ADR-111 §4.3). */
|
|
475
|
+
readonly weight?: number;
|
|
476
|
+
/**
|
|
477
|
+
* Optional injected `process.memoryUsage` for tests.
|
|
478
|
+
* Default: `process.memoryUsage`.
|
|
479
|
+
*/
|
|
480
|
+
readonly memoryUsage?: () => {
|
|
481
|
+
rss: number;
|
|
482
|
+
};
|
|
483
|
+
/** Optional injected file reader for tests. */
|
|
484
|
+
readonly readFile?: (path: string) => string;
|
|
485
|
+
}
|
|
486
|
+
declare function memoryPressureSignal(opts?: MemoryPressureOptions): SyncSignal;
|
|
487
|
+
|
|
488
|
+
/**
|
|
489
|
+
* `queueDepthSignal` — backpressure indicator (ADR-111 §4.3).
|
|
490
|
+
*
|
|
491
|
+
* Generic signal: the app provides a `getter` that returns the current
|
|
492
|
+
* length of whatever queue the operator wants probed (BullMQ, RabbitMQ,
|
|
493
|
+
* in-memory work pool, …). The signal does NOT own the queue — the app
|
|
494
|
+
* does. We just measure.
|
|
495
|
+
*
|
|
496
|
+
* Mapping observed-depth → healthFactor:
|
|
497
|
+
*
|
|
498
|
+
* healthFactor
|
|
499
|
+
* ▲
|
|
500
|
+
* 1.0 ┤━━━━━━━━━━━━━━━━━━┓
|
|
501
|
+
* │ ┃ linear interpolation
|
|
502
|
+
* 0.0 ┤ ┗━━━━━━━━━━━━
|
|
503
|
+
* ┼──────────────────┼─────────► depth
|
|
504
|
+
* 0 fullThreshold
|
|
505
|
+
*
|
|
506
|
+
* At or below `healthyBelow` (default 0) → factor 1, then linear interpolation down to factor 0 at `fullThreshold`. Above
|
|
507
|
+
* → factor 0. The implicit "degraded" zone is `[0.5 × fullThreshold,
|
|
508
|
+
* fullThreshold]` — depth at half-full produces factor 0.5. Operators
|
|
509
|
+
* tune `fullThreshold` to whatever their queue reasonably hits at peak
|
|
510
|
+
* load; "100% full" means "drain new traffic", not "kill the pod" (the
|
|
511
|
+
* three-tier gate at the sidecar handles the kill decision).
|
|
512
|
+
*
|
|
513
|
+
* `getter` errors are swallowed — a thrown getter produces `unknown=true`
|
|
514
|
+
* (so the scoring strategy ignores this signal, instead of falsely
|
|
515
|
+
* reporting score=0). The app's bug shouldn't masquerade as a sidecar
|
|
516
|
+
* decision.
|
|
517
|
+
*/
|
|
518
|
+
|
|
519
|
+
interface QueueDepthOptions {
|
|
520
|
+
/**
|
|
521
|
+
* Sync or async getter the SDK calls every poll tick. Must return a
|
|
522
|
+
* non-negative integer. Throws → reading marked `unknown=true`.
|
|
523
|
+
*/
|
|
524
|
+
readonly getter: () => number | Promise<number>;
|
|
525
|
+
/**
|
|
526
|
+
* Depth at which the queue is considered "full" (factor=0). Linear
|
|
527
|
+
* interp from 0..fullThreshold. No default — operator-specific.
|
|
528
|
+
*/
|
|
529
|
+
readonly fullThreshold: number;
|
|
530
|
+
/**
|
|
531
|
+
* Optional below-which factor is always 1 (a "soft floor"). Default 0
|
|
532
|
+
* — the linear interp starts at depth 0.
|
|
533
|
+
*/
|
|
534
|
+
readonly healthyBelow?: number;
|
|
535
|
+
/** Weight in the weighted-product score. Default 0.2 (ADR-111 §4.3). */
|
|
536
|
+
readonly weight?: number;
|
|
537
|
+
/** Custom signal name. Default `queueDepth`. */
|
|
538
|
+
readonly name?: string;
|
|
539
|
+
}
|
|
540
|
+
declare function queueDepthSignal(opts: QueueDepthOptions): AsyncSignal;
|
|
541
|
+
|
|
542
|
+
/**
|
|
543
|
+
* `@sylphx/sdk/health` — Phase B multi-signal health score (ADR-111 §4).
|
|
544
|
+
*
|
|
545
|
+
* Apps register signals (event-loop lag, queue depth, error rate, memory
|
|
546
|
+
* pressure, …); the SDK folds them into a continuous score in `[0, 1]`;
|
|
547
|
+
* the **sidecar** maps the score to liveness / readiness / drain via the
|
|
548
|
+
* three-tier gate. The SDK's responsibility ends at exposing the score.
|
|
549
|
+
*
|
|
550
|
+
* The wire format the sidecar parses (ADR-111 §3.2.4):
|
|
551
|
+
*
|
|
552
|
+
* ```json
|
|
553
|
+
* {
|
|
554
|
+
* "score": 0.92,
|
|
555
|
+
* "signals": {
|
|
556
|
+
* "eventLoopLagMs": 12,
|
|
557
|
+
* "queueDepth": 3,
|
|
558
|
+
* "recent5sErrorRate": 0.001,
|
|
559
|
+
* "memoryPressure": 0.45
|
|
560
|
+
* },
|
|
561
|
+
* "lastTickAt": "2026-05-03T12:34:56.789Z"
|
|
562
|
+
* }
|
|
563
|
+
* ```
|
|
564
|
+
*
|
|
565
|
+
* Default signals (if `sylphxHealth()` called with no `signals`):
|
|
566
|
+
* - `eventLoopLagSignal({ degradedMs: 5000, deadMs: 30000 })` (weight 1.0)
|
|
567
|
+
*
|
|
568
|
+
* Three-tier gate (decided by **sidecar**, not SDK; ADR-111 §4.4):
|
|
569
|
+
*
|
|
570
|
+
* | Score | Liveness | Readiness | Effect |
|
|
571
|
+
* | -------------- | -------- | --------- | --------------------- |
|
|
572
|
+
* | > 0.8 | 200 | 200 | Normal traffic |
|
|
573
|
+
* | (0.5, 0.8] | 200 | 503 | Drain new, no kill |
|
|
574
|
+
* | <= 0.5 | 503 | 503 | Kill after threshold |
|
|
575
|
+
*
|
|
576
|
+
* Apps don't need to think about probe semantics — that's the sidecar's
|
|
577
|
+
* job. The app just exposes the score. See `apps/health-agent/` for the
|
|
578
|
+
* sidecar implementation.
|
|
579
|
+
*
|
|
580
|
+
* @example Worked example — OpenClaw under PDF-extract load (ADR-111 §4.5):
|
|
581
|
+
*
|
|
582
|
+
* ```text
|
|
583
|
+
* eventLoopLagMs = 20000 → factor 0.4 (1 − (20000 − 5000) / (30000 − 5000) with the default thresholds)
|
|
584
|
+
* queueDepth = 12 → factor 1.0
|
|
585
|
+
* errorRate = 0.002 → factor 1.0
|
|
586
|
+
* memoryPressure = 0.55 → factor 1.0
|
|
587
|
+
* score = 0.4^0.4 × 1.0^0.6 ≈ 0.69
|
|
588
|
+
*
|
|
589
|
+
* → falls in (0.5, 0.8] → sidecar drains traffic, doesn't kill.
|
|
590
|
+
* Pod gets to finish PDF extraction.
|
|
591
|
+
* ```
|
|
592
|
+
*/
|
|
593
|
+
|
|
594
|
+
/**
|
|
595
|
+
* The handle returned by `sylphxHealth()`. Owns the registered signals,
|
|
596
|
+
* exposes evaluation in both Promise + Effect form, and produces an HTTP
|
|
597
|
+
* handler / Unix-socket server.
|
|
598
|
+
*
|
|
599
|
+
* Call `dispose()` during graceful shutdown to release per-signal
|
|
600
|
+
* resources (e.g. the `monitorEventLoopDelay()` histogram).
|
|
601
|
+
*/
|
|
602
|
+
interface SylphxHealth extends HealthEvaluator {
|
|
603
|
+
/** All signals registered (read-only). */
|
|
604
|
+
readonly signals: ReadonlyArray<Signal>;
|
|
605
|
+
/** Snapshot evaluation as a Promise. */
|
|
606
|
+
evaluate(): Promise<HealthScore>;
|
|
607
|
+
/** Snapshot evaluation as an Effect (per Rule 21 / ADR-058 Amendment). */
|
|
608
|
+
readonly evaluateEffect: ReturnType<typeof evaluateEffect>;
|
|
609
|
+
/**
|
|
610
|
+
* Web Fetch API HTTP handler — works under Hono, Bun.serve, Next.js
|
|
611
|
+
* route.ts, itty-router, Hattip. Returns 200 + JSON (500 only if the SDK itself fails to compute a score); the
|
|
612
|
+
* sidecar applies the three-tier 200/503 gate.
|
|
613
|
+
*/
|
|
614
|
+
handler(): (req?: Request) => Promise<Response>;
|
|
615
|
+
/** Node.js classic `(req, res)` handler — for Express / classic Fastify. */
|
|
616
|
+
nodeHandler(): ReturnType<typeof createNodeHandler>;
|
|
617
|
+
/**
|
|
618
|
+
* Bind a Bun Unix-domain socket and serve the same JSON. The sidecar
|
|
619
|
+
* polls `/var/run/sylphx/health.sock` by default (ADR-111 §3.2.4).
|
|
620
|
+
*/
|
|
621
|
+
serveUnixSocket(opts?: UnixSocketServerOptions): UnixSocketServerHandle;
|
|
622
|
+
/**
|
|
623
|
+
* Tear down all registered signals. Idempotent. Call during graceful
|
|
624
|
+
* shutdown to release histograms, file watchers, etc.
|
|
625
|
+
*/
|
|
626
|
+
dispose(): void;
|
|
627
|
+
}
|
|
628
|
+
/**
|
|
629
|
+
* Build a `SylphxHealth` instance.
|
|
630
|
+
*
|
|
631
|
+
* @example Hono integration:
|
|
632
|
+
* ```ts
|
|
633
|
+
* import { Hono } from 'hono'
|
|
634
|
+
* import {
|
|
635
|
+
* sylphxHealth,
|
|
636
|
+
* eventLoopLagSignal,
|
|
637
|
+
* queueDepthSignal,
|
|
638
|
+
* errorRateSignal,
|
|
639
|
+
* memoryPressureSignal,
|
|
640
|
+
* } from '@sylphx/sdk/health'
|
|
641
|
+
*
|
|
642
|
+
* const errors = errorRateSignal({ window: '5s', degradedRate: 0.05 })
|
|
643
|
+
*
|
|
644
|
+
* const health = sylphxHealth({
|
|
645
|
+
* signals: [
|
|
646
|
+
* eventLoopLagSignal({ degradedMs: 5000, deadMs: 30000 }),
|
|
647
|
+
* queueDepthSignal({ getter: () => queue.size, fullThreshold: 1000 }),
|
|
648
|
+
* errors,
|
|
649
|
+
* memoryPressureSignal({ degradedRatio: 0.85 }),
|
|
650
|
+
* ],
|
|
651
|
+
* })
|
|
652
|
+
*
|
|
653
|
+
* const app = new Hono()
|
|
654
|
+
* app.get('/healthz', health.handler())
|
|
655
|
+
*
|
|
656
|
+
* // Track requests for the error-rate signal
|
|
657
|
+
* app.use(async (c, next) => {
|
|
658
|
+
* try { await next(); errors.recordSuccess() }
|
|
659
|
+
* catch (err) { errors.recordError(); throw err }
|
|
660
|
+
* })
|
|
661
|
+
*
|
|
662
|
+
* // Or — primary transport for the sidecar:
|
|
663
|
+
* health.serveUnixSocket() // → /var/run/sylphx/health.sock
|
|
664
|
+
* ```
|
|
665
|
+
*
|
|
666
|
+
* @example Worked example — OpenClaw under PDF-extract load (ADR-111 §4.5):
|
|
667
|
+
*
|
|
668
|
+
* ```text
|
|
669
|
+
* eventLoopLagMs = 20000 → factor 0.4 (1 − (20000 − 5000) / (30000 − 5000) with the default thresholds)
|
|
670
|
+
* queueDepth = 12 → factor 1.0
|
|
671
|
+
* errorRate = 0.002 → factor 1.0
|
|
672
|
+
* memoryPressure = 0.55 → factor 1.0
|
|
673
|
+
* score = 0.4^0.4 × 1.0^0.6 ≈ 0.69
|
|
674
|
+
*
|
|
675
|
+
* → falls in (0.5, 0.8] → sidecar drains traffic, doesn't kill.
|
|
676
|
+
* Pod gets to finish PDF extraction.
|
|
677
|
+
* ```
|
|
678
|
+
*/
|
|
679
|
+
declare function sylphxHealth(opts?: SylphxHealthOptions): SylphxHealth;
|
|
680
|
+
|
|
681
|
+
export { type AsyncSignal, type ErrorRateOptions, type ErrorRateSignalHandle, type EventLoopLagOptions, HealthError, type HealthEvaluator, type HealthScore, type HealthSnapshot, type MemoryPressureOptions, type QueueDepthOptions, type ScoringStrategy, type Signal, type SignalBase, type SignalReading, type SylphxHealth, type SylphxHealthOptions, type SyncSignal, type UnixSocketServerHandle, type UnixSocketServerOptions, createNodeHandler, createWebHandler, defaultScoringStrategy, errorRateSignal, eventLoopLagSignal, memoryPressureSignal, queueDepthSignal, sylphxHealth, weightedProduct };
|