@malloy-publisher/server 0.0.199 → 0.0.201
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/app/api-doc.yaml +110 -118
- package/dist/app/assets/{EnvironmentPage-Dpee_Kn6.js → EnvironmentPage-KoP4wt8H.js} +1 -1
- package/dist/app/assets/HomePage-HbPwKL84.js +1 -0
- package/dist/app/assets/MainPage-DfK4zDYO.js +2 -0
- package/dist/app/assets/{ModelPage-AwAugZ37.js → ModelPage-CUgSwGXg.js} +1 -1
- package/dist/app/assets/{PackagePage-XQ-EWGTC.js → PackagePage-CUDQNL5k.js} +1 -1
- package/dist/app/assets/{RouteError-3Mv8JQw7.js → RouteError-sgmtBdg8.js} +1 -1
- package/dist/app/assets/{WorkbookPage-DHYYpcYc.js → WorkbookPage-tnWmLcrW.js} +1 -1
- package/dist/app/assets/{core-DfcpQGVP.es-DQggNOdX.js → core-B3IQNPBD.es-foBNuT8L.js} +10 -10
- package/dist/app/assets/{index-D1pdwrUW.js → index-B5We8x8r.js} +1 -1
- package/dist/app/assets/{index-BUp81Qdm.js → index-KIvi9k3F.js} +1 -1
- package/dist/app/assets/index-PNYovl3E.js +452 -0
- package/dist/app/assets/{index.umd-CQH4LZU8.js → index.umd-BXcsl2XW.js} +1 -1
- package/dist/app/index.html +1 -1
- package/dist/package_load_worker.mjs +1 -1
- package/dist/server.mjs +1556 -1018
- package/package.json +1 -1
- package/publisher.config.json +4 -0
- package/src/config.spec.ts +246 -0
- package/src/config.ts +121 -1
- package/src/constants.ts +84 -1
- package/src/controller/connection.controller.spec.ts +803 -0
- package/src/controller/connection.controller.ts +207 -20
- package/src/controller/model.controller.ts +16 -5
- package/src/controller/query.controller.ts +20 -7
- package/src/controller/watch-mode.controller.ts +11 -2
- package/src/errors.spec.ts +44 -0
- package/src/errors.ts +34 -0
- package/src/filter_deprecation.spec.ts +64 -0
- package/src/filter_deprecation.ts +42 -0
- package/src/heap_check.spec.ts +144 -0
- package/src/heap_check.ts +144 -0
- package/src/mcp/handler_utils.ts +14 -0
- package/src/mcp/tools/execute_query_tool.ts +44 -14
- package/src/oom_guards.integration.spec.ts +261 -0
- package/src/path_safety.ts +9 -3
- package/src/query_cap_metrics.spec.ts +89 -0
- package/src/query_cap_metrics.ts +115 -0
- package/src/query_concurrency.spec.ts +247 -0
- package/src/query_concurrency.ts +236 -0
- package/src/query_timeout.spec.ts +224 -0
- package/src/query_timeout.ts +178 -0
- package/src/server-old.ts +20 -0
- package/src/server.ts +57 -72
- package/src/service/connection.spec.ts +244 -0
- package/src/service/connection.ts +14 -4
- package/src/service/environment.ts +124 -4
- package/src/service/environment_admission.spec.ts +165 -1
- package/src/service/environment_store.spec.ts +103 -0
- package/src/service/environment_store.ts +74 -23
- package/src/service/filter_integration.spec.ts +69 -0
- package/src/service/model.spec.ts +193 -3
- package/src/service/model.ts +95 -14
- package/src/service/model_limits.spec.ts +181 -0
- package/src/service/model_limits.ts +110 -0
- package/src/service/package.spec.ts +2 -6
- package/src/service/package.ts +6 -1
- package/src/service/path_injection.spec.ts +39 -0
- package/src/stream_helpers.spec.ts +280 -0
- package/src/stream_helpers.ts +162 -0
- package/src/test_helpers/metrics_harness.ts +126 -0
- package/dist/app/assets/HomePage-DLRWTNoL.js +0 -1
- package/dist/app/assets/MainPage-DsVt5QGM.js +0 -2
- package/dist/app/assets/index-Dv5bF4Ii.js +0 -451
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import { metrics, type Counter } from "@opentelemetry/api";
|
|
2
|
+
import type { NextFunction, Request, RequestHandler, Response } from "express";
|
|
3
|
+
|
|
4
|
+
import { getMaxConcurrentQueries } from "./config";
|
|
5
|
+
import { ServiceUnavailableError } from "./errors";
|
|
6
|
+
import { logger } from "./logger";
|
|
7
|
+
|
|
8
|
+
/**
 * Number of gated queries currently holding a slot in this process.
 *
 * This is the single piece of mutable bookkeeping in the module:
 *   - `tryAcquireQuerySlot` adds one when it admits a caller;
 *   - the handle it returns subtracts one, at most once per handle;
 *   - `resetActiveQueryCountForTesting` forces it back to zero.
 *
 * It lives at module scope on purpose, so every gated surface draws
 * from the same pool and the cap limits total load rather than load
 * per route.
 */
let active = 0;
|
|
18
|
+
|
|
19
|
+
/**
 * Cached OpenTelemetry state for the concurrency gate.
 *
 * Nothing is created at import time. Instruments are built the first
 * time `ensureConcurrencyTelemetry` runs, so a MeterProvider installed
 * after this module was loaded (as unit tests do) is the one that
 * backs them.
 */
let queryConcurrencyRejectionsCounter: Counter | null = null;
let concurrencyTelemetryInitialized = false;

/**
 * Returns the rejection counter, creating the module's instruments on
 * first use.
 *
 * When the cache is cold this:
 *   1. creates `publisher_query_concurrency_rejections_total`;
 *   2. registers the `publisher_query_active_slots` observable gauge,
 *      which reports the live `active` value on every collection;
 *   3. registers the `publisher_query_max_slots` observable gauge,
 *      which re-reads `getMaxConcurrentQueries()` on every collection
 *      and reports `-1` if that read throws.
 *
 * @returns The counter used to record rejected acquisitions.
 */
function ensureConcurrencyTelemetry(): Counter {
  const cached = queryConcurrencyRejectionsCounter;
  if (cached !== null && concurrencyTelemetryInitialized) {
    return cached;
  }

  const publisherMeter = metrics.getMeter("publisher");

  const rejections = publisherMeter.createCounter(
    "publisher_query_concurrency_rejections_total",
    {
      description:
        "Queries rejected with 503 because the per-pod PUBLISHER_MAX_CONCURRENT_QUERIES cap was reached",
    },
  );
  // Publish the counter before touching the gauges, matching the
  // order in which module state has always been updated.
  queryConcurrencyRejectionsCounter = rejections;

  if (!concurrencyTelemetryInitialized) {
    // Gauge 1: slots in use right now. Being observable, a scrape
    // sees the current number rather than a value captured at the
    // last acquire/release.
    const activeSlotsGauge = publisherMeter.createObservableGauge(
      "publisher_query_active_slots",
      {
        description:
          "In-flight queries currently holding a per-pod concurrency slot",
      },
    );
    activeSlotsGauge.addCallback((result) => result.observe(active));

    // Gauge 2: the configured ceiling, so `active / max` can be
    // charted from metrics alone. `0` is reported as-is and means
    // the gate is switched off.
    const maxSlotsGauge = publisherMeter.createObservableGauge(
      "publisher_query_max_slots",
      {
        description:
          "Current effective PUBLISHER_MAX_CONCURRENT_QUERIES (0 = disabled)",
      },
    );
    maxSlotsGauge.addCallback((result) => {
      try {
        result.observe(getMaxConcurrentQueries());
      } catch {
        // A bad setting must not break metric collection; report a
        // sentinel that makes the misconfiguration visible instead.
        result.observe(-1);
      }
    });

    concurrencyTelemetryInitialized = true;
  }

  return rejections;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Test-only hook: forgets the cached counter and clears the
 * "initialized" flag, so the next `ensureConcurrencyTelemetry` call
 * builds its instruments again against whatever MeterProvider is
 * installed at that point.
 *
 * Not intended for production code paths.
 */
export function resetQueryConcurrencyTelemetryForTesting(): void {
  concurrencyTelemetryInitialized = false;
  queryConcurrencyRejectionsCounter = null;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Read-only view of the slot counter, exposed for tests and metrics.
 *
 * @returns How many slots are held at the moment of the call.
 */
export function getActiveQueryCount(): number {
  const inFlight = active;
  return inFlight;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Test-only hook: forces the slot counter back to zero so a test
 * case that ended without releasing its slot does not affect the
 * next one.
 *
 * Handles issued before the reset are not invalidated; this only
 * rewrites the counter.
 */
export function resetActiveQueryCountForTesting(): void {
  active = 0;
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Express middleware that bounds the number of concurrently
|
|
105
|
+
* in-flight query requests per pod.
|
|
106
|
+
*
|
|
107
|
+
* Defense-in-depth on top of the per-request caps from Steps 1–5:
|
|
108
|
+
* - Row/byte caps bound a single response.
|
|
109
|
+
* - Memory governor (Step 4) sheds load when RSS crosses the
|
|
110
|
+
* high-water mark.
|
|
111
|
+
* - Query timeout (Step 5) prevents one query from monopolising a
|
|
112
|
+
* slot indefinitely.
|
|
113
|
+
* This middleware caps the *number of slots in flight* at any one
|
|
114
|
+
* moment so a burst of well-behaved but expensive queries can't all
|
|
115
|
+
* land simultaneously and stampede aggregate memory.
|
|
116
|
+
*
|
|
117
|
+
* Behavior:
|
|
118
|
+
* - When `active >= limit`, the request is rejected with HTTP 503
|
|
119
|
+
* and the response body identifies the cap so an operator's
|
|
120
|
+
* grep finds the rationale immediately.
|
|
121
|
+
* - When admitted, a single-shot decrement is registered on both
|
|
122
|
+
* `finish` (normal completion) and `close` (client disconnect).
|
|
123
|
+
* The handler must release exactly once even if both events
|
|
124
|
+
* fire.
|
|
125
|
+
* - `limit === 0` opts out (the middleware becomes a pass-through);
|
|
126
|
+
* `getMaxConcurrentQueries()` is read per-request so config
|
|
127
|
+
* changes propagate without a server restart. The per-request
|
|
128
|
+
* read is a single env-var parse — cheap.
|
|
129
|
+
*
|
|
130
|
+
* Failure-mode notes:
|
|
131
|
+
* - If the response never emits `finish`/`close` for some reason
|
|
132
|
+
* (e.g. a runtime crash that bypasses Express' normal
|
|
133
|
+
* teardown), the slot leaks until process restart. This is the
|
|
134
|
+
* same failure mode as any active-request counter; in practice
|
|
135
|
+
* `close` always fires on socket teardown.
|
|
136
|
+
* - We do NOT queue. A backed-up queue would hide load and inflate
|
|
137
|
+
* p99 latency; failing fast lets the upstream LB retry against
|
|
138
|
+
* a less-loaded pod.
|
|
139
|
+
*/
|
|
140
|
+
/**
 * Token representing one held concurrency slot.
 *
 * Whoever receives a handle owns the slot and must give it back by
 * calling `release()` once the work has ended, whatever the outcome.
 * Handles produced by `tryAcquireQuerySlot` ignore repeated calls, so
 * wiring the same `release` to several completion events is safe.
 */
export interface QuerySlotHandle {
  /** Returns the slot to the pool; later calls do nothing. */
  release: () => void;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Claims one concurrency slot, or fails immediately if none is free.
 *
 * This is the synchronous core used by `queryConcurrencyMiddleware`;
 * callers that use it directly are responsible for invoking
 * `release()` themselves (typically in a `finally`).
 *
 * Behaviour, in order:
 *   1. Telemetry is ensured on every call, so the gauges exist even
 *      when no request is ever rejected.
 *   2. The cap is re-read from `getMaxConcurrentQueries()` each time.
 *   3. A cap of zero or less disables the gate: a handle whose
 *      `release` does nothing is returned and the counter is not
 *      touched.
 *   4. If the counter has reached the cap, the rejection counter is
 *      incremented, a warning is logged, and the call throws.
 *   5. Otherwise the counter is incremented and a handle is returned
 *      whose `release` decrements it at most once and never below 0.
 *
 * @param routeLabel Attached to the rejection counter as `http.route`
 *   and to the warning log. Callers should pass a value with bounded
 *   cardinality (a route pattern or a fixed string).
 * @returns A handle for the acquired slot.
 * @throws ServiceUnavailableError when the cap has been reached.
 */
export function tryAcquireQuerySlot(routeLabel: string): QuerySlotHandle {
  const rejections = ensureConcurrencyTelemetry();
  const cap = getMaxConcurrentQueries();

  // Gate disabled: hand out a handle that has nothing to give back.
  if (cap <= 0) {
    const noop = (): undefined => undefined;
    return { release: noop };
  }

  if (active >= cap) {
    rejections.add(1, { "http.route": routeLabel, limit: cap });
    logger.warn(
      `Rejecting query: ${active}/${cap} slots in use (PUBLISHER_MAX_CONCURRENT_QUERIES).`,
      { route: routeLabel },
    );
    throw new ServiceUnavailableError(
      `Publisher pod is at its maximum concurrent query cap (${cap}). Retry after in-flight queries complete, or raise PUBLISHER_MAX_CONCURRENT_QUERIES.`,
    );
  }

  active += 1;

  // Per-handle latch: the first `release` flips it, later ones bail.
  let held = true;
  const release = (): void => {
    if (!held) {
      return;
    }
    held = false;
    active = Math.max(0, active - 1);
  };
  return { release };
}
|
|
202
|
+
|
|
203
|
+
/**
 * Express middleware that makes a request hold a concurrency slot for
 * the lifetime of its response.
 *
 * A slot is requested through `tryAcquireQuerySlot`. If that throws,
 * the error is passed to `next(error)` and the request goes no
 * further through this middleware. Otherwise the slot's `release` is
 * attached to the response's `finish` and `close` events and control
 * passes on via `next()`.
 *
 * The label handed to the slot pool is the registered route pattern
 * (`req.route.path`) when Express has set it, falling back to the
 * concrete `req.path` otherwise.
 *
 * NOTE(review): both listeners are attached after acquisition. If the
 * response could already have emitted `close` before this middleware
 * runs, neither listener would fire for that event — confirm whether
 * upstream async middleware makes that reachable.
 *
 * @param req Incoming request; only `route` and `path` are read.
 * @param res Response whose completion events free the slot.
 * @param next Express continuation.
 */
export function queryConcurrencyMiddleware(
  req: Request,
  res: Response,
  next: NextFunction,
): void {
  let slot: QuerySlotHandle;
  try {
    // Prefer the pattern over the concrete URL so the metric label
    // does not grow with every distinct path parameter.
    const routeLabel = req.route?.path ?? req.path;
    slot = tryAcquireQuerySlot(routeLabel);
  } catch (rejection) {
    next(rejection);
    return;
  }

  // `finish` covers a fully flushed response, `close` covers the
  // connection going away. Both may fire for one response; the
  // handle ignores whichever arrives second.
  const { release } = slot;
  res.on("finish", release);
  res.on("close", release);

  next();
}
|
|
228
|
+
|
|
229
|
+
/**
 * Factory-style accessor for the concurrency gate, meant for route
 * registration call sites.
 *
 * @returns `queryConcurrencyMiddleware` itself, typed as an Express
 *   `RequestHandler`. No new function is created per call.
 */
export function queryConcurrency(): RequestHandler {
  const handler: RequestHandler = queryConcurrencyMiddleware;
  return handler;
}
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import { afterEach, beforeEach, describe, expect, it } from "bun:test";
|
|
2
|
+
|
|
3
|
+
import { QueryTimeoutError } from "./errors";
|
|
4
|
+
import {
|
|
5
|
+
PUBLISHER_QUERY_TIMEOUT_REASON,
|
|
6
|
+
resetQueryTimeoutTelemetryForTesting,
|
|
7
|
+
runWithQueryTimeout,
|
|
8
|
+
} from "./query_timeout";
|
|
9
|
+
import {
|
|
10
|
+
startMetricsHarness,
|
|
11
|
+
type MetricsHarness,
|
|
12
|
+
} from "./test_helpers/metrics_harness";
|
|
13
|
+
|
|
14
|
+
describe("runWithQueryTimeout", () => {
  /**
   * Stand-in for a slow driver call: the returned promise never
   * resolves and rejects with `new Error(message)` as soon as
   * `signal` emits `abort`. Keeps timeout tests from hanging.
   */
  const rejectWhenAborted = (
    signal: AbortSignal,
    message: string,
  ): Promise<void> =>
    new Promise<void>((_resolve, reject) => {
      signal.addEventListener("abort", () => reject(new Error(message)));
    });

  it("returns the inner result when fn finishes before the timeout", async () => {
    const outcome = await runWithQueryTimeout(async (signal) => {
      expect(signal).toBeInstanceOf(AbortSignal);
      expect(signal.aborted).toBe(false);
      return "ok";
    }, 1000);

    expect(outcome).toBe("ok");
  });

  it("hands fn a signal even when the timeout is disabled (uniform contract)", async () => {
    // A timeout of 0 is the opt-out path. Callers should still be
    // able to forward the signal blindly, so one must be supplied,
    // must be a real AbortSignal, and must stay un-aborted.
    let seen: AbortSignal | undefined;

    const outcome = await runWithQueryTimeout(async (signal) => {
      seen = signal;
      await new Promise((wake) => setTimeout(wake, 10));
      return "still-here";
    }, 0);

    expect(outcome).toBe("still-here");
    expect(seen).toBeInstanceOf(AbortSignal);
    expect(seen?.aborted).toBe(false);
  });

  it("aborts the signal and throws QueryTimeoutError when fn exceeds the budget", async () => {
    let seen: AbortSignal | undefined;

    // The error the fake driver raises is deliberately ignored by
    // the assertion: once the timer has fired, the wrapper decides
    // what the caller sees.
    const pending = runWithQueryTimeout(async (signal) => {
      seen = signal;
      await rejectWhenAborted(signal, "driver: aborted by AbortSignal");
    }, 25);

    await expect(pending).rejects.toBeInstanceOf(QueryTimeoutError);
    expect(seen?.aborted).toBe(true);
    // The abort reason is a sentinel, so code that composes this
    // signal with others can recognise a publisher timeout without
    // inspecting message text.
    expect(seen?.reason).toBe(PUBLISHER_QUERY_TIMEOUT_REASON);
  });

  it("includes the configured timeout in the QueryTimeoutError message (operator can grep logs)", async () => {
    let thrown: unknown;

    try {
      await runWithQueryTimeout(async (signal) => {
        await rejectWhenAborted(signal, "driver aborted");
      }, 17);
    } catch (err) {
      thrown = err;
    }

    expect(thrown).toBeInstanceOf(QueryTimeoutError);
    const { message } = thrown as Error;
    expect(message).toContain("17ms");
    expect(message).toContain("PUBLISHER_QUERY_TIMEOUT_MS");
  });

  it("re-throws non-timeout errors verbatim (does not mask driver failures)", async () => {
    // The callback fails well before the deadline. The very same
    // error object must come out, so upstream error mapping still
    // sees the real failure rather than a timeout.
    const driverError = new Error("upstream connection refused");

    const pending = runWithQueryTimeout(async () => {
      throw driverError;
    }, 1000);

    await expect(pending).rejects.toBe(driverError);
  });

  it("does not wrap an error that happens to mention 'abort' if the timer never fired", async () => {
    // An AbortError can come from somewhere other than our timer.
    // When the timer has not fired, such an error is not a publisher
    // timeout and has to pass through untouched.
    const fakeAbort = Object.assign(new Error("aborted by something else"), {
      name: "AbortError",
    });

    const pending = runWithQueryTimeout(async () => {
      throw fakeAbort;
    }, 1000);

    await expect(pending).rejects.toBe(fakeAbort);
  });

  describe("telemetry", () => {
    let harness: MetricsHarness;

    beforeEach(async () => {
      harness = await startMetricsHarness();
      // Forget instruments cached by earlier cases so they are
      // rebuilt against the provider this harness just installed.
      resetQueryTimeoutTelemetryForTesting();
    });

    afterEach(async () => {
      delete process.env.PUBLISHER_QUERY_TIMEOUT_MS;
      await harness.shutdown();
      resetQueryTimeoutTelemetryForTesting();
    });

    /**
     * Triggers telemetry registration without a timeout: one quick,
     * successful call is sufficient because instruments are ensured
     * at the top of every call.
     */
    const primeTelemetry = async (): Promise<void> => {
      await runWithQueryTimeout(async () => 0, 10_000);
    };

    /** Current value of the timeout counter as seen by the harness. */
    const readTimeoutCounter = () =>
      harness.collectCounter("publisher_query_timeout_total");

    /** Current value of the configured-timeout gauge. */
    const readTimeoutGauge = () =>
      harness.collectGauge("publisher_query_timeout_ms");

    it("publisher_query_timeout_total ticks each time the timer fires", async () => {
      // Pin the starting point so the assertion below measures only
      // what this case does.
      expect(await readTimeoutCounter()).toBe(0);

      const pending = runWithQueryTimeout(async (signal) => {
        await rejectWhenAborted(signal, "driver aborted");
      }, 15);
      await expect(pending).rejects.toBeInstanceOf(QueryTimeoutError);

      expect(await readTimeoutCounter()).toBe(1);
    });

    it("does NOT tick the counter on non-timeout errors (driver failed before deadline)", async () => {
      const pending = runWithQueryTimeout(async () => {
        throw new Error("upstream broken");
      }, 1000);
      await expect(pending).rejects.toThrow("upstream broken");

      // A failure that is not a timeout must leave the counter
      // untouched, otherwise the metric reports timeouts that never
      // happened.
      expect(await readTimeoutCounter()).toBe(0);
    });

    it("publisher_query_timeout_ms gauge is registered after the FIRST call, not just after a timeout fires", async () => {
      // Guards against registering the gauge only on the timeout
      // branch, which would hide the configured value from metrics
      // until the first timeout occurred.
      process.env.PUBLISHER_QUERY_TIMEOUT_MS = "30000";

      await runWithQueryTimeout(async () => "ok", 60_000);

      expect(await readTimeoutGauge()).toBe(30000);
    });

    it("publisher_query_timeout_ms gauge reports the current config", async () => {
      process.env.PUBLISHER_QUERY_TIMEOUT_MS = "42000";
      await primeTelemetry();

      expect(await readTimeoutGauge()).toBe(42000);
    });

    it("publisher_query_timeout_ms gauge reports 0 when the timeout is opted out", async () => {
      process.env.PUBLISHER_QUERY_TIMEOUT_MS = "0";
      await primeTelemetry();

      expect(await readTimeoutGauge()).toBe(0);
    });

    it("publisher_query_timeout_ms gauge surfaces -1 on misconfig instead of crashing the scrape", async () => {
      process.env.PUBLISHER_QUERY_TIMEOUT_MS = "garbage";
      await primeTelemetry();

      // A visible sentinel is preferred over a missing sample, so a
      // bad setting shows up on a dashboard straight away.
      expect(await readTimeoutGauge()).toBe(-1);
    });
  });

  it("clears the timer on success so the event loop can exit promptly", async () => {
    // Timer cleanup is not observable directly without reaching into
    // internals, so this is a smoke check: two quick calls in a row
    // both succeed, and the second one starts with a fresh signal.
    const first = await runWithQueryTimeout(async () => "fast", 5000);
    expect(first).toBe("fast");

    const second = await runWithQueryTimeout(async (signal) => {
      expect(signal.aborted).toBe(false);
      return "also-fast";
    }, 5000);
    expect(second).toBe("also-fast");
  });
});
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import { metrics, type Counter } from "@opentelemetry/api";
|
|
2
|
+
|
|
3
|
+
import { getQueryTimeoutMs } from "./config";
|
|
4
|
+
import { QueryTimeoutError } from "./errors";
|
|
5
|
+
|
|
6
|
+
/**
 * Cached OpenTelemetry state for the query-timeout guard.
 *
 * Instruments are built on first use instead of at import time, so a
 * MeterProvider that is installed after this module has been loaded
 * (unit tests do this) is the one they are bound to. See
 * https://github.com/open-telemetry/opentelemetry-js/issues/3505 for
 * the upstream discussion referenced by this design.
 */
let queryTimeoutCounter: Counter | null = null;
let timeoutTelemetryInitialized = false;

/**
 * Makes sure the timeout counter and the configured-timeout gauge
 * exist, then returns the counter. Safe to call repeatedly: once both
 * pieces are in place it returns the cached counter straight away.
 *
 *   - `publisher_query_timeout_total` is created only when no counter
 *     is cached.
 *   - `publisher_query_timeout_ms` is an observable gauge registered
 *     only while the "initialized" flag is unset. Its callback reads
 *     `getQueryTimeoutMs()` on every collection and reports `-1` if
 *     that read throws.
 *
 * @returns The counter used to record fired timeouts.
 */
function ensureTimeoutTelemetry(): Counter {
  if (queryTimeoutCounter !== null && timeoutTelemetryInitialized) {
    return queryTimeoutCounter;
  }

  const publisherMeter = metrics.getMeter("publisher");

  const counter =
    queryTimeoutCounter ??
    publisherMeter.createCounter("publisher_query_timeout_total", {
      description:
        "Queries aborted because PUBLISHER_QUERY_TIMEOUT_MS elapsed before the underlying SDK call completed",
    });
  queryTimeoutCounter = counter;

  if (!timeoutTelemetryInitialized) {
    // Exposes the configured limit next to measured durations. The
    // value is re-read on each collection, so a changed setting is
    // reflected without restarting the process.
    const timeoutGauge = publisherMeter.createObservableGauge(
      "publisher_query_timeout_ms",
      {
        description:
          "Current effective PUBLISHER_QUERY_TIMEOUT_MS (0 = disabled)",
        unit: "ms",
      },
    );
    timeoutGauge.addCallback((result) => {
      try {
        result.observe(getQueryTimeoutMs());
      } catch {
        // Never let a bad setting break metric collection; report a
        // sentinel so the problem is visible rather than silent.
        result.observe(-1);
      }
    });
    timeoutTelemetryInitialized = true;
  }

  return counter;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Test-only hook: drops the cached timeout counter and clears the
 * "initialized" flag, so the next `ensureTimeoutTelemetry` call
 * recreates both instruments against the MeterProvider installed at
 * that time.
 *
 * Not intended for production code paths.
 */
export function resetQueryTimeoutTelemetryForTesting(): void {
  timeoutTelemetryInitialized = false;
  queryTimeoutCounter = null;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Runs `fn` under a wall-clock budget.
 *
 * `fn` receives an {@link AbortSignal}. When `timeoutMs` elapses the
 * signal is aborted with `PUBLISHER_QUERY_TIMEOUT_REASON` as its
 * `reason`, so downstream code can tell a publisher-issued timeout
 * apart from any other abort by comparing `signal.reason`.
 *
 * Contract:
 *   - The timeout is cooperative. This function only awaits `fn`; it
 *     does not race it against the timer. `fn` therefore MUST forward
 *     `signal` to the underlying call so that the abort makes that
 *     call reject. If `fn` ignores the signal, nothing interrupts it
 *     and this function settles only when `fn` does.
 *   - `timeoutMs <= 0` opts out: no timer is armed and `fn` receives
 *     a signal that this function never aborts.
 *   - If the timer has fired and `fn` then rejects, the rejection is
 *     replaced by {@link QueryTimeoutError} and the timeout counter
 *     is incremented (labelled with `timeout_ms`).
 *   - If the timer has fired but `fn` still resolves, the resolved
 *     value is returned and the counter is not incremented.
 *   - If `fn` rejects before the timer fires, that error is re-thrown
 *     unchanged.
 *   - Budgets larger than 2^31 - 1 ms are armed at 2^31 - 1 ms. Node's
 *     `setTimeout` would otherwise coerce such a delay to 1 ms and
 *     abort the query almost immediately.
 *
 * @param fn Work to run; receives the abort signal.
 * @param timeoutMs Budget in milliseconds; zero or negative disables
 *   the timer.
 * @returns Whatever `fn` resolves with.
 * @throws QueryTimeoutError when `fn` rejects after the timer fired.
 */
export async function runWithQueryTimeout<T>(
  fn: (signal: AbortSignal) => Promise<T>,
  timeoutMs: number,
): Promise<T> {
  // Largest delay `setTimeout` honours; anything above it is coerced
  // to 1 ms by Node, which would invert the caller's intent.
  const maxTimerDelayMs = 2 ** 31 - 1;

  // Ensure instruments on every call (cheap once cached) so the
  // configured-timeout gauge is registered by the first query, not
  // by the first timeout.
  ensureTimeoutTelemetry();

  if (timeoutMs <= 0) {
    // Opt-out: still hand over a signal so `fn` can forward it
    // unconditionally; nothing here will ever abort it.
    const ac = new AbortController();
    return fn(ac.signal);
  }

  const ac = new AbortController();
  const reason = PUBLISHER_QUERY_TIMEOUT_REASON;
  let timedOut = false;
  const timer = setTimeout(
    () => {
      timedOut = true;
      // Passing `reason` makes it available as `signal.reason`, so
      // consumers never need to match on error-message text.
      ac.abort(reason);
    },
    Math.min(timeoutMs, maxTimerDelayMs),
  );
  // Do not let a pending guard timer keep the process alive on its
  // own (e.g. during shutdown).
  timer.unref?.();

  try {
    return await fn(ac.signal);
  } catch (error) {
    if (timedOut) {
      // Count before throwing so the metric is recorded even when a
      // caller catches the error and reports it some other way. The
      // label carries the configured budget, not the clamped delay.
      ensureTimeoutTelemetry().add(1, { timeout_ms: timeoutMs });
      throw new QueryTimeoutError(
        `Query exceeded PUBLISHER_QUERY_TIMEOUT_MS (${timeoutMs}ms) and was aborted. Refine the query (add a more selective WHERE, lower LIMIT, or simplify joins) or raise the timeout.`,
      );
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}
|
|
163
|
+
|
|
164
|
+
/**
 * Abort reason used when the publisher's own query timeout fires.
 *
 * `runWithQueryTimeout` passes this value to `AbortController.abort`,
 * so it shows up as `signal.reason` on the signal given to the
 * callback. Code that combines that signal with other abort sources
 * can test `signal.reason === PUBLISHER_QUERY_TIMEOUT_REASON` instead
 * of matching on error-message text.
 *
 * `runWithQueryTimeout` itself decides "timeout or other error" with
 * a local flag, not by inspecting this symbol.
 *
 * Created with `Symbol.for`, so the same key resolves to the same
 * symbol everywhere in the runtime.
 */
export const PUBLISHER_QUERY_TIMEOUT_REASON = Symbol.for("publisher.queryTimeout");
|