@malloy-publisher/server 0.0.199 → 0.0.201

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/dist/app/api-doc.yaml +110 -118
  2. package/dist/app/assets/{EnvironmentPage-Dpee_Kn6.js → EnvironmentPage-KoP4wt8H.js} +1 -1
  3. package/dist/app/assets/HomePage-HbPwKL84.js +1 -0
  4. package/dist/app/assets/MainPage-DfK4zDYO.js +2 -0
  5. package/dist/app/assets/{ModelPage-AwAugZ37.js → ModelPage-CUgSwGXg.js} +1 -1
  6. package/dist/app/assets/{PackagePage-XQ-EWGTC.js → PackagePage-CUDQNL5k.js} +1 -1
  7. package/dist/app/assets/{RouteError-3Mv8JQw7.js → RouteError-sgmtBdg8.js} +1 -1
  8. package/dist/app/assets/{WorkbookPage-DHYYpcYc.js → WorkbookPage-tnWmLcrW.js} +1 -1
  9. package/dist/app/assets/{core-DfcpQGVP.es-DQggNOdX.js → core-B3IQNPBD.es-foBNuT8L.js} +10 -10
  10. package/dist/app/assets/{index-D1pdwrUW.js → index-B5We8x8r.js} +1 -1
  11. package/dist/app/assets/{index-BUp81Qdm.js → index-KIvi9k3F.js} +1 -1
  12. package/dist/app/assets/index-PNYovl3E.js +452 -0
  13. package/dist/app/assets/{index.umd-CQH4LZU8.js → index.umd-BXcsl2XW.js} +1 -1
  14. package/dist/app/index.html +1 -1
  15. package/dist/package_load_worker.mjs +1 -1
  16. package/dist/server.mjs +1556 -1018
  17. package/package.json +1 -1
  18. package/publisher.config.json +4 -0
  19. package/src/config.spec.ts +246 -0
  20. package/src/config.ts +121 -1
  21. package/src/constants.ts +84 -1
  22. package/src/controller/connection.controller.spec.ts +803 -0
  23. package/src/controller/connection.controller.ts +207 -20
  24. package/src/controller/model.controller.ts +16 -5
  25. package/src/controller/query.controller.ts +20 -7
  26. package/src/controller/watch-mode.controller.ts +11 -2
  27. package/src/errors.spec.ts +44 -0
  28. package/src/errors.ts +34 -0
  29. package/src/filter_deprecation.spec.ts +64 -0
  30. package/src/filter_deprecation.ts +42 -0
  31. package/src/heap_check.spec.ts +144 -0
  32. package/src/heap_check.ts +144 -0
  33. package/src/mcp/handler_utils.ts +14 -0
  34. package/src/mcp/tools/execute_query_tool.ts +44 -14
  35. package/src/oom_guards.integration.spec.ts +261 -0
  36. package/src/path_safety.ts +9 -3
  37. package/src/query_cap_metrics.spec.ts +89 -0
  38. package/src/query_cap_metrics.ts +115 -0
  39. package/src/query_concurrency.spec.ts +247 -0
  40. package/src/query_concurrency.ts +236 -0
  41. package/src/query_timeout.spec.ts +224 -0
  42. package/src/query_timeout.ts +178 -0
  43. package/src/server-old.ts +20 -0
  44. package/src/server.ts +57 -72
  45. package/src/service/connection.spec.ts +244 -0
  46. package/src/service/connection.ts +14 -4
  47. package/src/service/environment.ts +124 -4
  48. package/src/service/environment_admission.spec.ts +165 -1
  49. package/src/service/environment_store.spec.ts +103 -0
  50. package/src/service/environment_store.ts +74 -23
  51. package/src/service/filter_integration.spec.ts +69 -0
  52. package/src/service/model.spec.ts +193 -3
  53. package/src/service/model.ts +95 -14
  54. package/src/service/model_limits.spec.ts +181 -0
  55. package/src/service/model_limits.ts +110 -0
  56. package/src/service/package.spec.ts +2 -6
  57. package/src/service/package.ts +6 -1
  58. package/src/service/path_injection.spec.ts +39 -0
  59. package/src/stream_helpers.spec.ts +280 -0
  60. package/src/stream_helpers.ts +162 -0
  61. package/src/test_helpers/metrics_harness.ts +126 -0
  62. package/dist/app/assets/HomePage-DLRWTNoL.js +0 -1
  63. package/dist/app/assets/MainPage-DsVt5QGM.js +0 -2
  64. package/dist/app/assets/index-Dv5bF4Ii.js +0 -451
@@ -0,0 +1,236 @@
1
+ import { metrics, type Counter } from "@opentelemetry/api";
2
+ import type { NextFunction, Request, RequestHandler, Response } from "express";
3
+
4
+ import { getMaxConcurrentQueries } from "./config";
5
+ import { ServiceUnavailableError } from "./errors";
6
+ import { logger } from "./logger";
7
+
8
+ /**
9
+ * Process-wide query gauge — the only state owned by this module.
10
+ * Incremented when a gated handler accepts a request; decremented
11
+ * exactly once when the response finishes (whether the handler
12
+ * succeeded, errored, or the client disconnected mid-flight).
13
+ *
14
+ * Module-scoped so all gated routes share one slot pool: the
15
+ * concurrency cap bounds aggregate query load, not per-route load.
16
+ */
17
+ let active = 0;
18
+
19
/**
 * OpenTelemetry instruments for the concurrency gate. They are
 * created lazily on first use so unit tests that install a real
 * MeterProvider AFTER this module is imported still receive their
 * data — OTel JS's `ProxyMeter` binds counters created before
 * `setGlobalMeterProvider` to NoOp instruments. The state is
 * module-scoped so all gated routes share one counter.
 */
let queryConcurrencyRejectionsCounter: Counter | null = null;
let concurrencyTelemetryInitialized = false;

/**
 * Idempotent installer for the rejection counter and the two
 * observable gauges (`publisher_query_active_slots`,
 * `publisher_query_max_slots`).
 *
 * The counter is only created when no cached instance exists, and
 * the gauges are only registered while the initialized flag is
 * unset, so a call that failed part-way can be retried without
 * re-creating whatever was already installed.
 *
 * @returns the cached rejection counter.
 */
function ensureConcurrencyTelemetry(): Counter {
   if (queryConcurrencyRejectionsCounter && concurrencyTelemetryInitialized) {
      return queryConcurrencyRejectionsCounter;
   }
   const meter = metrics.getMeter("publisher");
   const counter =
      queryConcurrencyRejectionsCounter ??
      meter.createCounter("publisher_query_concurrency_rejections_total", {
         description:
            "Queries rejected with 503 because the per-pod PUBLISHER_MAX_CONCURRENT_QUERIES cap was reached",
      });
   queryConcurrencyRejectionsCounter = counter;

   if (!concurrencyTelemetryInitialized) {
      // Live count of in-flight queries holding a slot. Observable
      // gauge so a scrape always sees the current value, not the
      // value at the last admission/release event.
      meter
         .createObservableGauge("publisher_query_active_slots", {
            description:
               "In-flight queries currently holding a per-pod concurrency slot",
         })
         .addCallback((observation) => observation.observe(active));

      // Configured cap, exposed so dashboards can render utilization
      // (`active / max`) without a separate config feed. Read on
      // every scrape so a runtime env-var change is reflected
      // immediately. A `0` cap means "opt-out" and is reported
      // verbatim.
      meter
         .createObservableGauge("publisher_query_max_slots", {
            description:
               "Current effective PUBLISHER_MAX_CONCURRENT_QUERIES (0 = disabled)",
         })
         .addCallback((observation) => {
            try {
               observation.observe(getMaxConcurrentQueries());
            } catch {
               // A misconfigured env var should fail the next request
               // that observes it, not the metric scrape. Surface as
               // -1 so the misconfig is visible.
               observation.observe(-1);
            }
         });
      concurrencyTelemetryInitialized = true;
   }
   return counter;
}
77
+
78
/**
 * Test-only hook. Forgets the cached rejection counter and clears
 * the "initialized" flag so the next gated request re-creates the
 * instruments against whichever MeterProvider is installed at that
 * point. Must not be called from production code.
 */
export function resetQueryConcurrencyTelemetryForTesting(): void {
   concurrencyTelemetryInitialized = false;
   queryConcurrencyRejectionsCounter = null;
}
87
+
88
/**
 * Read-only view of the module's in-flight slot counter, exposed
 * for tests and metrics. Callers must not try to change the count
 * through any other route than acquiring/releasing a slot.
 *
 * @returns the number of slots currently held.
 */
export function getActiveQueryCount(): number {
   return active;
}
94
+
95
/**
 * Test-only hook. Forces the in-flight slot counter back to zero so
 * a test case that crashed mid-handler does not leak held slots
 * into the next case, without needing a fresh module instance.
 */
export function resetActiveQueryCountForTesting(): void {
   active = 0;
}
102
+
103
+ /**
104
+ * `queryConcurrencyMiddleware` (defined further down): Express middleware that bounds the number of concurrently
105
+ * in-flight query requests per pod.
106
+ *
107
+ * Defense-in-depth on top of the per-request caps from Steps 1–5:
108
+ * - Row/byte caps bound a single response.
109
+ * - Memory governor (Step 4) sheds load when RSS crosses the
110
+ * high-water mark.
111
+ * - Query timeout (Step 5) prevents one query from monopolising a
112
+ * slot indefinitely.
113
+ * This middleware caps the *number of slots in flight* at any one
114
+ * moment so a burst of well-behaved but expensive queries can't all
115
+ * land simultaneously and stampede aggregate memory.
116
+ *
117
+ * Behavior:
118
+ * - When `active >= limit`, the request is rejected with HTTP 503
119
+ * and the response body identifies the cap so an operator's
120
+ * grep finds the rationale immediately.
121
+ * - When admitted, a single-shot decrement is registered on both
122
+ * `finish` (normal completion) and `close` (client disconnect).
123
+ * The handler must release exactly once even if both events
124
+ * fire.
125
+ * - `limit === 0` opts out (the middleware becomes a pass-through);
126
+ * `getMaxConcurrentQueries()` is read per-request so config
127
+ * changes propagate without a server restart. The per-request
128
+ * read is a single env-var parse — cheap.
129
+ *
130
+ * Failure-mode notes:
131
+ * - If the response never emits `finish`/`close` for some reason
132
+ * (e.g. a runtime crash that bypasses Express' normal
133
+ * teardown), the slot leaks until process restart. This is the
134
+ * same failure mode as any active-request counter; in practice
135
+ * `close` always fires on socket teardown.
136
+ * - We do NOT queue. A backed-up queue would hide load and inflate
137
+ * p99 latency; failing fast lets the upstream LB retry against
138
+ * a less-loaded pod.
139
+ */
140
/**
 * Token representing one acquired concurrency slot.
 *
 * Whoever holds the handle is responsible for calling `release()`
 * once the work is over, regardless of outcome (success, error, or
 * cancellation). Handles produced by this module make `release()`
 * idempotent: a second call does nothing instead of decrementing
 * twice, so it is safe to wire the same handle to both the `finish`
 * and `close` response events.
 */
export interface QuerySlotHandle {
   release: () => void;
}
150
+
151
/**
 * Synchronously claims one slot from the process-wide pool. Shared
 * by the HTTP middleware and the MCP `executeQuery` tool.
 *
 * HTTP routes should normally go through
 * {@link queryConcurrencyMiddleware}, which ties the release to the
 * response's `finish`/`close` events. Direct callers (MCP) own the
 * release themselves, typically in a try/finally.
 *
 * @param routeLabel - Attached to the rejection counter
 *   (`publisher_query_concurrency_rejections_total`) and the warning
 *   log only. Keep its cardinality bounded: Express route patterns
 *   or fixed strings such as `mcp:executeQuery`.
 * @returns a {@link QuerySlotHandle} whose `release()` gives the
 *   slot back exactly once. When the configured cap is `<= 0` the
 *   gate is disabled and the returned handle is a no-op.
 * @throws {@link ServiceUnavailableError} (mapped to HTTP 503 by
 *   controllers) when the number of held slots has reached the cap.
 */
export function tryAcquireQuerySlot(routeLabel: string): QuerySlotHandle {
   // Runs on every call (not only on rejection) so the active/max
   // gauges are registered even on pods that never hit the cap.
   const rejections = ensureConcurrencyTelemetry();

   const cap = getMaxConcurrentQueries();
   if (cap <= 0) {
      // Opt-out: no slot bookkeeping. Useful for OSS deployments
      // that already have an upstream concurrency bound.
      return { release: () => undefined };
   }

   if (active >= cap) {
      rejections.add(1, {
         "http.route": routeLabel,
         limit: cap,
      });
      logger.warn(
         `Rejecting query: ${active}/${cap} slots in use (PUBLISHER_MAX_CONCURRENT_QUERIES).`,
         { route: routeLabel },
      );
      throw new ServiceUnavailableError(
         `Publisher pod is at its maximum concurrent query cap (${cap}). Retry after in-flight queries complete, or raise PUBLISHER_MAX_CONCURRENT_QUERIES.`,
      );
   }

   active += 1;
   let alreadyReleased = false;
   const release = (): void => {
      if (alreadyReleased) {
         return;
      }
      alreadyReleased = true;
      // Clamp at zero: a test reset may have zeroed the counter
      // while this handle was still outstanding.
      active = Math.max(0, active - 1);
   };
   return { release };
}
202
+
203
/**
 * Express middleware that admits a request only if a concurrency
 * slot is available.
 *
 * On rejection the error thrown by {@link tryAcquireQuerySlot} is
 * forwarded to `next(error)` and no listeners are attached. On
 * admission the slot's `release` is attached to both response
 * events, then control passes to the next handler:
 *   - `finish`: normal completion (response fully flushed).
 *   - `close`: client disconnected before completion (or after, in
 *     some Express/Node versions).
 * Whichever fires first frees the slot; the handle's idempotent
 * `release` turns the second into a no-op.
 */
export function queryConcurrencyMiddleware(
   req: Request,
   res: Response,
   next: NextFunction,
): void {
   let slot: QuerySlotHandle;
   try {
      // Prefer the Express-registered pattern (`req.route?.path`,
      // e.g. `/api/v0/environments/:environmentName/.../sqlQuery`)
      // over the concrete URL so label cardinality stays bounded.
      slot = tryAcquireQuerySlot(req.route?.path ?? req.path);
   } catch (rejection) {
      next(rejection);
      return;
   }
   const { release } = slot;
   res.on("finish", release).on("close", release);
   next();
}
228
+
229
/**
 * Factory-style accessor for route registrations: returns the
 * shared {@link queryConcurrencyMiddleware} reference typed as an
 * Express `RequestHandler`, so call sites read as
 * `queryConcurrency()` and Express' overloads resolve cleanly.
 */
export function queryConcurrency(): RequestHandler {
   const handler: RequestHandler = queryConcurrencyMiddleware;
   return handler;
}
@@ -0,0 +1,224 @@
1
+ import { afterEach, beforeEach, describe, expect, it } from "bun:test";
2
+
3
+ import { QueryTimeoutError } from "./errors";
4
+ import {
5
+ PUBLISHER_QUERY_TIMEOUT_REASON,
6
+ resetQueryTimeoutTelemetryForTesting,
7
+ runWithQueryTimeout,
8
+ } from "./query_timeout";
9
+ import {
10
+ startMetricsHarness,
11
+ type MetricsHarness,
12
+ } from "./test_helpers/metrics_harness";
13
+
14
describe("runWithQueryTimeout", () => {
   /**
    * Mimics a slow driver that honors the abort signal: the returned
    * promise never resolves and rejects with `message` only once
    * `signal` aborts, so a test using it cannot hang past the
    * timeout under test.
    */
   function rejectOnAbort(signal: AbortSignal, message: string): Promise<void> {
      return new Promise<void>((_resolve, reject) => {
         signal.addEventListener("abort", () => reject(new Error(message)));
      });
   }

   it("returns the inner result when fn finishes before the timeout", async () => {
      const result = await runWithQueryTimeout(async (signal) => {
         expect(signal).toBeInstanceOf(AbortSignal);
         expect(signal.aborted).toBe(false);
         return "ok";
      }, 1000);
      expect(result).toBe("ok");
   });

   it("hands fn a signal even when the timeout is disabled (uniform contract)", async () => {
      // timeoutMs=0 is the "operator opted out" path. We still want
      // callers to be able to forward `signal` unconditionally; the
      // signal must exist, be an AbortSignal, and never fire.
      let observed: AbortSignal | undefined;
      const result = await runWithQueryTimeout(async (signal) => {
         observed = signal;
         await new Promise((r) => setTimeout(r, 10));
         return "still-here";
      }, 0);
      expect(result).toBe("still-here");
      expect(observed).toBeInstanceOf(AbortSignal);
      expect(observed?.aborted).toBe(false);
   });

   it("aborts the signal and throws QueryTimeoutError when fn exceeds the budget", async () => {
      let observedSignal: AbortSignal | undefined;
      await expect(
         runWithQueryTimeout(async (signal) => {
            observedSignal = signal;
            // The error thrown by the driver is irrelevant —
            // runWithQueryTimeout owns the verdict once the timer
            // has fired.
            await rejectOnAbort(signal, "driver: aborted by AbortSignal");
         }, 25),
      ).rejects.toBeInstanceOf(QueryTimeoutError);
      expect(observedSignal?.aborted).toBe(true);
      // The reason sentinel lets composed helpers (e.g. streaming
      // cap-abort) distinguish "publisher timeout" from "their own
      // abort" without coupling to message strings.
      expect(observedSignal?.reason).toBe(PUBLISHER_QUERY_TIMEOUT_REASON);
   });

   it("includes the configured timeout in the QueryTimeoutError message (operator can grep logs)", async () => {
      let caught: unknown;
      try {
         await runWithQueryTimeout(async (signal) => {
            await rejectOnAbort(signal, "driver aborted");
         }, 17);
      } catch (err) {
         caught = err;
      }
      expect(caught).toBeInstanceOf(QueryTimeoutError);
      expect((caught as Error).message).toContain("17ms");
      expect((caught as Error).message).toContain("PUBLISHER_QUERY_TIMEOUT_MS");
   });

   it("re-throws non-timeout errors verbatim (does not mask driver failures)", async () => {
      // Driver fails *before* the timeout; we must surface its error
      // unchanged so the controller's normal error mapping kicks in
      // (502 ConnectionError, 400 BadRequestError, etc.) — wrapping
      // every failure in QueryTimeoutError would lie to clients.
      const driverError = new Error("upstream connection refused");
      await expect(
         runWithQueryTimeout(async () => {
            throw driverError;
         }, 1000),
      ).rejects.toBe(driverError);
   });

   it("does not wrap an error that happens to mention 'abort' if the timer never fired", async () => {
      // Edge case: a driver might surface its own AbortError for an
      // unrelated reason (e.g. caller canceled, transport reset). If
      // the publisher's timer never fired, the error is not ours and
      // must not be re-cast as QueryTimeoutError.
      const fakeAbort = Object.assign(new Error("aborted by something else"), {
         name: "AbortError",
      });
      await expect(
         runWithQueryTimeout(async () => {
            throw fakeAbort;
         }, 1000),
      ).rejects.toBe(fakeAbort);
   });

   describe("telemetry", () => {
      const TIMEOUT_ENV_KEY = "PUBLISHER_QUERY_TIMEOUT_MS";
      let harness: MetricsHarness;
      let savedTimeoutEnv: string | undefined;

      beforeEach(async () => {
         // Snapshot whatever the surrounding environment configured
         // so the cases below can overwrite it and afterEach can put
         // it back instead of leaving the variable deleted for
         // later specs.
         savedTimeoutEnv = process.env[TIMEOUT_ENV_KEY];
         harness = await startMetricsHarness();
         // Drop cached instruments so they re-init against the new
         // provider; otherwise this test's writes go to a counter
         // bound to the *previous* provider's reader.
         resetQueryTimeoutTelemetryForTesting();
      });

      afterEach(async () => {
         if (savedTimeoutEnv === undefined) {
            delete process.env[TIMEOUT_ENV_KEY];
         } else {
            process.env[TIMEOUT_ENV_KEY] = savedTimeoutEnv;
         }
         await harness.shutdown();
         resetQueryTimeoutTelemetryForTesting();
      });

      /**
       * Install telemetry without firing a timeout. Any successful
       * call is enough — both the counter and the gauge register on
       * every entry, not just the timeout branch.
       */
      async function primeTelemetry(): Promise<void> {
         await runWithQueryTimeout(async () => 0, 10_000);
      }

      it("publisher_query_timeout_total ticks each time the timer fires", async () => {
         // Establish baseline (0) before the trigger so this test
         // isn't sensitive to whatever else has happened earlier.
         expect(
            await harness.collectCounter("publisher_query_timeout_total"),
         ).toBe(0);
         await expect(
            runWithQueryTimeout(async (signal) => {
               await rejectOnAbort(signal, "driver aborted");
            }, 15),
         ).rejects.toBeInstanceOf(QueryTimeoutError);
         expect(
            await harness.collectCounter("publisher_query_timeout_total"),
         ).toBe(1);
      });

      it("does NOT tick the counter on non-timeout errors (driver failed before deadline)", async () => {
         await expect(
            runWithQueryTimeout(async () => {
               throw new Error("upstream broken");
            }, 1000),
         ).rejects.toThrow("upstream broken");
         // A driver failure is not a timeout — the counter must
         // stay at zero or operators will chase phantom timeouts.
         expect(
            await harness.collectCounter("publisher_query_timeout_total"),
         ).toBe(0);
      });

      it("publisher_query_timeout_ms gauge is registered after the FIRST call, not just after a timeout fires", async () => {
         // Regression test for the lazy-init bug where the gauge
         // installed only inside the timeout branch — leaving
         // `publisher_query_timeout_ms` absent from `/metrics`
         // until the first 504. Operators tuning the timeout
         // BEFORE getting paged need this visible.
         process.env.PUBLISHER_QUERY_TIMEOUT_MS = "30000";
         await runWithQueryTimeout(async () => "ok", 60_000);
         expect(await harness.collectGauge("publisher_query_timeout_ms")).toBe(
            30000,
         );
      });

      it("publisher_query_timeout_ms gauge reports the current config", async () => {
         process.env.PUBLISHER_QUERY_TIMEOUT_MS = "42000";
         await primeTelemetry();
         expect(await harness.collectGauge("publisher_query_timeout_ms")).toBe(
            42000,
         );
      });

      it("publisher_query_timeout_ms gauge reports 0 when the timeout is opted out", async () => {
         process.env.PUBLISHER_QUERY_TIMEOUT_MS = "0";
         await primeTelemetry();
         expect(await harness.collectGauge("publisher_query_timeout_ms")).toBe(
            0,
         );
      });

      it("publisher_query_timeout_ms gauge surfaces -1 on misconfig instead of crashing the scrape", async () => {
         process.env.PUBLISHER_QUERY_TIMEOUT_MS = "garbage";
         await primeTelemetry();
         // Operators must be able to *see* misconfig in dashboards
         // — silently dropping the data point would hide the
         // problem until a query timed out.
         expect(await harness.collectGauge("publisher_query_timeout_ms")).toBe(
            -1,
         );
      });
   });

   it("clears the timer on success so the event loop can exit promptly", async () => {
      // Hard to assert directly without leaking implementation
      // internals, so this is a smoke check: the call returns
      // quickly and a follow-up call is handed a fresh, un-aborted
      // signal.
      const result = await runWithQueryTimeout(async () => "fast", 5000);
      expect(result).toBe("fast");
      // Each call gets its own signal, so a leaked timer from the
      // first call could not abort this one; the assertion only
      // confirms the second call starts from a clean state.
      const result2 = await runWithQueryTimeout(async (signal) => {
         expect(signal.aborted).toBe(false);
         return "also-fast";
      }, 5000);
      expect(result2).toBe("also-fast");
   });
});
@@ -0,0 +1,178 @@
1
+ import { metrics, type Counter } from "@opentelemetry/api";
2
+
3
+ import { getQueryTimeoutMs } from "./config";
4
+ import { QueryTimeoutError } from "./errors";
5
+
6
+ /**
7
+ * Lazy-initialized telemetry. Instruments are created on first use
8
+ * rather than at module load so unit tests that install a real
9
+ * MeterProvider AFTER the module is imported still record their
10
+ * data — OTel JS's `ProxyMeter` binds counters created before
11
+ * `setGlobalMeterProvider` to NoOp instruments (see
12
+ * https://github.com/open-telemetry/opentelemetry-js/issues/3505).
13
+ * In production the first request triggers initialization, which
14
+ * is well after `server.ts` boot and any OTel SDK setup, so the
15
+ * lazy-init has no observable latency cost.
16
+ */
17
+ let queryTimeoutCounter: Counter | null = null;
18
+ let timeoutTelemetryInitialized = false;
19
/**
 * Installs (at most once) the timeout counter and the config gauge,
 * then hands back the counter.
 *
 * {@link runWithQueryTimeout} calls this on entry for every path,
 * opt-out included, so `publisher_query_timeout_ms` is registered
 * as soon as the first query is served rather than when the first
 * timeout fires.
 *
 * @returns the cached `publisher_query_timeout_total` counter.
 */
function ensureTimeoutTelemetry(): Counter {
   if (timeoutTelemetryInitialized && queryTimeoutCounter) {
      return queryTimeoutCounter;
   }

   const meter = metrics.getMeter("publisher");

   const counter =
      queryTimeoutCounter ??
      meter.createCounter("publisher_query_timeout_total", {
         description:
            "Queries aborted because PUBLISHER_QUERY_TIMEOUT_MS elapsed before the underlying SDK call completed",
      });
   queryTimeoutCounter = counter;

   if (!timeoutTelemetryInitialized) {
      // Observable gauge exposing the *configured* timeout so it can
      // be charted next to real query durations. The callback reads
      // the config on every scrape, so an env-var change shows up
      // without a restart.
      const configGauge = meter.createObservableGauge(
         "publisher_query_timeout_ms",
         {
            description:
               "Current effective PUBLISHER_QUERY_TIMEOUT_MS (0 = disabled)",
            unit: "ms",
         },
      );
      configGauge.addCallback((observation) => {
         try {
            observation.observe(getQueryTimeoutMs());
         } catch {
            // A bad env var should break the request that reads it,
            // not the scrape. Report -1 so the misconfiguration is
            // visible instead of the sample silently disappearing.
            observation.observe(-1);
         }
      });
      timeoutTelemetryInitialized = true;
   }

   return counter;
}
69
+
70
/**
 * Test-only hook. Discards the cached counter and clears the
 * "initialized" flag so the next call into the timeout helper
 * re-creates its instruments against a freshly installed
 * MeterProvider. Must not be called from production code.
 */
export function resetQueryTimeoutTelemetryForTesting(): void {
   timeoutTelemetryInitialized = false;
   queryTimeoutCounter = null;
}
79
+
80
/**
 * Per-query wall-clock guard. Hands an {@link AbortSignal} to `fn`
 * and arms a `setTimeout` for `timeoutMs`. When the timer fires the
 * signal is aborted with reason `Symbol.for("publisher.queryTimeout")`
 * so downstream catch blocks can distinguish a publisher-issued
 * timeout from a caller cancel or a driver-internal abort.
 *
 * Contract:
 *   - `fn` MUST forward `signal` into the underlying SDK call
 *     (`runSQLOptions.abortSignal`, `runnable.run({ abortSignal })`,
 *     etc.). This function only awaits `fn`; it does not race it
 *     against the timer. If `fn` ignores the signal, the abort has
 *     no effect and the call settles only when `fn` itself settles.
 *   - `timeoutMs <= 0` opts out (no timer is armed); a signal that
 *     never aborts is still passed so callers can forward it
 *     unconditionally. Use when an operator deliberately sets
 *     `PUBLISHER_QUERY_TIMEOUT_MS=0`.
 *   - Values above 2147483647 ms (including `Infinity`) are armed at
 *     that maximum. Node resets larger `setTimeout` delays to 1 ms,
 *     which would abort every query almost immediately.
 *   - If the timer has fired AND `fn` subsequently rejects, this
 *     throws {@link QueryTimeoutError} and ticks the timeout counter.
 *     If `fn` resolves even though the timer fired, the success
 *     value is returned and the counter is not ticked.
 *   - If `fn` rejects and the timer has not fired, the underlying
 *     error is re-thrown unmodified.
 *
 * @param fn - The work to guard; receives the abort signal.
 * @param timeoutMs - Budget in milliseconds; `<= 0` disables it.
 * @returns whatever `fn` resolves with.
 * @throws {@link QueryTimeoutError} as described above, otherwise
 *   whatever `fn` rejects with.
 */
export async function runWithQueryTimeout<T>(
   fn: (signal: AbortSignal) => Promise<T>,
   timeoutMs: number,
): Promise<T> {
   // Install telemetry on every call (idempotent) so the
   // `publisher_query_timeout_ms` gauge is registered as soon as the
   // first query is served, even if no timeout ever fires.
   ensureTimeoutTelemetry();

   const ac = new AbortController();

   if (timeoutMs <= 0) {
      // Opt-out path: no timer, so this controller never aborts.
      return fn(ac.signal);
   }

   // Largest delay Node's timers accept; anything bigger is reset to
   // 1 ms by `setTimeout`. NaN is left untouched by `Math.min`.
   const MAX_TIMER_DELAY_MS = 2_147_483_647;
   const armedDelayMs = Math.min(timeoutMs, MAX_TIMER_DELAY_MS);

   let timedOut = false;
   const timer = setTimeout(() => {
      timedOut = true;
      // `abort(reason)` exposes the sentinel via `signal.reason` so
      // downstream code can tell a publisher timeout from other
      // aborts without string-matching error messages.
      ac.abort(PUBLISHER_QUERY_TIMEOUT_REASON);
   }, armedDelayMs);
   // Don't keep the event loop alive just for this timer, so it
   // cannot hold up a graceful shutdown on its own.
   timer.unref?.();

   try {
      return await fn(ac.signal);
   } catch (error) {
      if (timedOut) {
         // Increment before throwing so the counter ticks even if
         // the caller swallows the error. The configured timeout is
         // carried as a label so dashboards can separate "the env
         // var was tuned down" from "queries got slower".
         ensureTimeoutTelemetry().add(1, { timeout_ms: timeoutMs });
         throw new QueryTimeoutError(
            `Query exceeded PUBLISHER_QUERY_TIMEOUT_MS (${timeoutMs}ms) and was aborted. Refine the query (add a more selective WHERE, lower LIMIT, or simplify joins) or raise the timeout.`,
         );
      }
      throw error;
   } finally {
      clearTimeout(timer);
   }
}
163
+
164
/**
 * Value placed on `AbortSignal.reason` when the publisher's own
 * query timeout fires.
 *
 * It is a registry symbol (`Symbol.for`), so any code can compare
 * `signal.reason === PUBLISHER_QUERY_TIMEOUT_REASON` to recognise a
 * publisher-issued timeout without matching on error-message text.
 * `runWithQueryTimeout` attaches it when aborting but decides
 * "timeout vs. other error" with its own local flag, not by
 * inspecting this symbol.
 */
export const PUBLISHER_QUERY_TIMEOUT_REASON = Symbol.for(
   "publisher.queryTimeout",
);