@malloy-publisher/server 0.0.198-dev3 → 0.0.198-dev6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,196 @@
1
+ /**
2
+ * Integration test: exercise `Package.create` with the package-load
3
+ * worker pool enabled (PACKAGE_LOAD_WORKERS=1).
4
+ *
5
+ * Validates that the worker-load path:
6
+ * - reads the manifest, probes embedded databases, and compiles
7
+ * every model in a single off-thread job
8
+ * - produces a live `Package` whose `Model`s have populated
9
+ * `modelDef` / `sources` / `queries`
10
+ * - hydrates the `ModelMaterializer` from `modelDef` on first
11
+ * query (no recompile) — verified end-to-end by running a
12
+ * query through the resulting Model and getting a result
13
+ *
14
+ * Kept separate from `package.spec.ts` so the existing tests keep
15
+ * running on the in-process path without paying worker startup cost.
16
+ *
17
+ * Pool reuse strategy: one `PackageLoadPool` shared across all
18
+ * cases in this file. Spawning a fresh worker per test crashes Bun
19
+ * (segfault) because DuckDB's native bindings don't tolerate being
20
+ * loaded concurrently into multiple worker isolates of the same Bun
21
+ * process. Production uses one pool; this matches.
22
+ */
23
+ import {
24
+ afterAll,
25
+ afterEach,
26
+ beforeAll,
27
+ beforeEach,
28
+ describe,
29
+ expect,
30
+ it,
31
+ } from "bun:test";
32
+ import * as fs from "fs";
33
+ import * as os from "os";
34
+ import * as path from "path";
35
+ import {
36
+ PackageLoadPool,
37
+ __setPackageLoadPoolForTests,
38
+ } from "../package_load/package_load_pool";
39
+ import { Package } from "./package";
40
+
41
+ const ORIGINAL_ENV = process.env.PACKAGE_LOAD_WORKERS;
42
+
43
+ describe("Package.create via worker pool", () => {
44
+ let tempDir: string;
45
+ let pool: PackageLoadPool;
46
+
47
+ beforeAll(async () => {
48
+ process.env.PACKAGE_LOAD_WORKERS = "1";
49
+ pool = new PackageLoadPool(1);
50
+ // Wire our pool into the module-level singleton so Package.create
51
+ // picks it up via getPackageLoadPool().
52
+ await __setPackageLoadPoolForTests(pool);
53
+ });
54
+
55
+ afterAll(async () => {
56
+ await __setPackageLoadPoolForTests(null);
57
+ if (ORIGINAL_ENV === undefined) {
58
+ delete process.env.PACKAGE_LOAD_WORKERS;
59
+ } else {
60
+ process.env.PACKAGE_LOAD_WORKERS = ORIGINAL_ENV;
61
+ }
62
+ });
63
+
64
+ beforeEach(() => {
65
+ tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "publisher-pkg-worker-"));
66
+ });
67
+
68
+ afterEach(() => {
69
+ if (tempDir) {
70
+ // tempDir gets wiped by Package.create on failure (it's the
71
+ // staging-cleanup path); ignore ENOENT here.
72
+ try {
73
+ fs.rmSync(tempDir, { recursive: true, force: true });
74
+ } catch {
75
+ /* already gone */
76
+ }
77
+ tempDir = "";
78
+ }
79
+ });
80
+
81
+ async function makeMalloyConfig(): Promise<{
82
+ malloyConfig: import("@malloydata/malloy").MalloyConfig;
83
+ duckdb: { close: () => Promise<void> };
84
+ }> {
85
+ const { MalloyConfig, FixedConnectionMap } = await import(
86
+ "@malloydata/malloy"
87
+ );
88
+ const { DuckDBConnection } = await import("@malloydata/db-duckdb");
89
+ const duckdb = new DuckDBConnection("duckdb", ":memory:");
90
+ const connections = new FixedConnectionMap(
91
+ new Map([["duckdb", duckdb]]),
92
+ "duckdb",
93
+ );
94
+ const malloyConfig = new MalloyConfig({ connections: {} });
95
+ malloyConfig.wrapConnections(() => connections);
96
+ return { malloyConfig, duckdb };
97
+ }
98
+
99
+ function writeManifest(): void {
100
+ fs.writeFileSync(
101
+ path.join(tempDir, "publisher.json"),
102
+ JSON.stringify({ name: "pkg", description: "test package" }),
103
+ );
104
+ }
105
+
106
+ it("loads a package end-to-end and serves a query through the hydrated materializer", async () => {
107
+ writeManifest();
108
+ // Define `total_v` as a *view* on the source so the query
109
+ // builder's `run: nums -> total_v` form resolves. (Top-level
110
+ // queries take the `run: total_q` form — orthogonal path that
111
+ // the in-process tests already cover.)
112
+ fs.writeFileSync(
113
+ path.join(tempDir, "trivial.malloy"),
114
+ `source: nums is duckdb.sql("select 1 as a, 2 as b") extend {
115
+ measure: total is a.sum()
116
+ view: total_v is { aggregate: total }
117
+ }`,
118
+ );
119
+
120
+ const { malloyConfig, duckdb } = await makeMalloyConfig();
121
+ try {
122
+ const pkg = await Package.create("env", "pkg", tempDir, malloyConfig);
123
+ expect(pkg).toBeInstanceOf(Package);
124
+ expect(pkg.getModelPaths()).toEqual(["trivial.malloy"]);
125
+
126
+ const model = pkg.getModel("trivial.malloy");
127
+ expect(model).toBeDefined();
128
+ const apiModel = (await model!.getModel()) as {
129
+ modelDef?: string;
130
+ sources?: { name?: string }[];
131
+ };
132
+ expect(apiModel.modelDef).toBeDefined();
133
+ expect(apiModel.sources?.[0]?.name).toBe("nums");
134
+
135
+ // First query against the package — hydrates the
136
+ // ModelMaterializer from the worker's modelDef without a
137
+ // recompile, then runs the SQL against the *main thread's*
138
+ // DuckDB connection (the only one with the in-memory `nums`
139
+ // source loaded via duckdb.sql()).
140
+ const { result } = await model!.getQueryResults(
141
+ "nums",
142
+ "total_v",
143
+ undefined,
144
+ );
145
+ expect(result.data).toBeDefined();
146
+ } finally {
147
+ await duckdb.close();
148
+ }
149
+ });
150
+
151
+ it("propagates a per-model compile failure as a thrown error from Package.create", async () => {
152
+ writeManifest();
153
+ fs.writeFileSync(
154
+ path.join(tempDir, "broken.malloy"),
155
+ `source: bad is duckdb.sql("select 1 as a") extend {
156
+ measure: oops is THIS_FUNC_DOES_NOT_EXIST(a)
157
+ }`,
158
+ );
159
+
160
+ const { malloyConfig, duckdb } = await makeMalloyConfig();
161
+ try {
162
+ await expect(
163
+ Package.create("env", "pkg", tempDir, malloyConfig),
164
+ ).rejects.toBeInstanceOf(Error);
165
+ } finally {
166
+ await duckdb.close();
167
+ }
168
+ });
169
+
170
+ // NB: kept last in this describe — swapping the singleton for a
171
+ // pre-shutdown pool also tears down the shared `pool` (the swap
172
+ // implementation shuts down the outgoing singleton). Subsequent
173
+ // tests in this describe would see a dead pool. afterAll only
174
+ // resets the singleton to null, so this is safe at the tail.
175
+ it("rewraps pool-infrastructure failures as ServiceUnavailableError (HTTP 503)", async () => {
176
+ writeManifest();
177
+ fs.writeFileSync(
178
+ path.join(tempDir, "trivial.malloy"),
179
+ `source: nums is duckdb.sql("select 1 as a")`,
180
+ );
181
+
182
+ const deadPool = new PackageLoadPool(1);
183
+ await deadPool.shutdown();
184
+ await __setPackageLoadPoolForTests(deadPool);
185
+
186
+ const { ServiceUnavailableError } = await import("../errors");
187
+ const { malloyConfig, duckdb } = await makeMalloyConfig();
188
+ try {
189
+ await expect(
190
+ Package.create("env", "pkg", tempDir, malloyConfig),
191
+ ).rejects.toBeInstanceOf(ServiceUnavailableError);
192
+ } finally {
193
+ await duckdb.close();
194
+ }
195
+ });
196
+ });
@@ -1,61 +0,0 @@
1
- // src/service/schema_worker.ts
2
- import { DuckDBConnection } from "@malloydata/db-duckdb";
3
- import"@malloydata/db-duckdb/native";
4
- import {
5
- ConnectionRuntime,
6
- EmptyURLReader
7
- } from "@malloydata/malloy";
8
- import * as path from "path";
9
- import { parentPort } from "worker_threads";
10
- if (!parentPort) {
11
- throw new Error("schema_worker.ts loaded outside a worker thread");
12
- }
13
- var connection = new DuckDBConnection({
14
- name: "duckdb",
15
- databasePath: ":memory:",
16
- threads: 1,
17
- memoryLimit: "256MB"
18
- });
19
- async function handleRequest(req) {
20
- try {
21
- const fullPath = path.join(req.packagePath, req.databasePath);
22
- const normalizedPath = fullPath.replace(/\\/g, "/");
23
- const runtime = new ConnectionRuntime({
24
- urlReader: new EmptyURLReader,
25
- connections: [connection]
26
- });
27
- const model = runtime.loadModel(`source: temp is duckdb.table('${normalizedPath}')`);
28
- const modelDef = await model.getModel();
29
- const fields = modelDef._modelDef.contents["temp"].fields;
30
- const columns = fields.map((field) => ({
31
- type: String(field.type),
32
- name: field.name
33
- }));
34
- const runner = model.loadQuery("run: temp->{aggregate: row_count is count()}");
35
- const result = await runner.run();
36
- const rowCount = result.data.value[0].row_count?.valueOf();
37
- return {
38
- id: req.id,
39
- ok: true,
40
- result: { name: req.databasePath, rowCount, columns }
41
- };
42
- } catch (err) {
43
- const error = err instanceof Error ? err : new Error(String(err));
44
- return {
45
- id: req.id,
46
- ok: false,
47
- error: { message: error.message, stack: error.stack }
48
- };
49
- }
50
- }
51
- parentPort.on("message", async (msg) => {
52
- const response = await handleRequest(msg);
53
- parentPort.postMessage(response);
54
- });
55
- var shutdown = async () => {
56
- try {
57
- await connection.close();
58
- } catch {}
59
- process.exit(0);
60
- };
61
- parentPort.on("close", () => void shutdown());
@@ -1,169 +0,0 @@
1
- import * as fs from "fs";
2
-
3
- import { logger } from "../logger";
4
- import type { PackageMemoryGovernor } from "./package_memory_governor";
5
-
6
- const DEFAULT_INTERVAL_MS = 30_000;
7
-
8
- interface LinuxProcStatus {
9
- threads?: number;
10
- vmRssBytes?: number;
11
- vmSizeBytes?: number;
12
- vmPeakBytes?: number;
13
- vmDataBytes?: number;
14
- voluntaryCtxSwitches?: number;
15
- nonvoluntaryCtxSwitches?: number;
16
- }
17
-
18
- /**
19
- * Parse the subset of `/proc/self/status` that matters for diagnosing
20
- * thread / virtual-memory leaks. The file is small (<5KB), so reading
21
- * it synchronously here is cheap and avoids fs-promise queueing.
22
- *
23
- * Format is `Key:\t<value> [unit]` per line. Sizes are reported in kB;
24
- * we normalize to bytes so log output matches `process.memoryUsage()`.
25
- */
26
- function readLinuxProcStatus(): LinuxProcStatus | null {
27
- try {
28
- const raw = fs.readFileSync("/proc/self/status", "utf8");
29
- const out: LinuxProcStatus = {};
30
- for (const line of raw.split("\n")) {
31
- const [keyRaw, valueRaw] = line.split(":");
32
- if (!keyRaw || !valueRaw) continue;
33
- const key = keyRaw.trim();
34
- const value = valueRaw.trim();
35
- switch (key) {
36
- case "Threads":
37
- out.threads = Number(value);
38
- break;
39
- case "VmRSS":
40
- out.vmRssBytes = kBToBytes(value);
41
- break;
42
- case "VmSize":
43
- out.vmSizeBytes = kBToBytes(value);
44
- break;
45
- case "VmPeak":
46
- out.vmPeakBytes = kBToBytes(value);
47
- break;
48
- case "VmData":
49
- out.vmDataBytes = kBToBytes(value);
50
- break;
51
- case "voluntary_ctxt_switches":
52
- out.voluntaryCtxSwitches = Number(value);
53
- break;
54
- case "nonvoluntary_ctxt_switches":
55
- out.nonvoluntaryCtxSwitches = Number(value);
56
- break;
57
- }
58
- }
59
- return out;
60
- } catch {
61
- return null;
62
- }
63
- }
64
-
65
- function kBToBytes(value: string): number | undefined {
66
- const num = Number(value.replace(/\s*kB$/, ""));
67
- if (!Number.isFinite(num)) return undefined;
68
- return num * 1024;
69
- }
70
-
71
- /**
72
- * Bun exposes JSC heap stats via the `bun:jsc` builtin. Optional —
73
- * absent under plain Node — and best-effort: failures are swallowed
74
- * so the reporter never crashes the process.
75
- */
76
- async function readBunJscStats(): Promise<Record<string, number> | null> {
77
- if (typeof (globalThis as { Bun?: unknown }).Bun === "undefined") {
78
- return null;
79
- }
80
- try {
81
- // Dynamic import so Node builds don't fail at parse time.
82
- const jsc = (await import("bun:jsc")) as unknown as {
83
- heapStats?: () => Record<string, number>;
84
- memoryUsage?: () => Record<string, number>;
85
- };
86
- const heap = jsc.heapStats?.();
87
- const mem = jsc.memoryUsage?.();
88
- if (!heap && !mem) return null;
89
- return { ...(heap ?? {}), ...(mem ?? {}) };
90
- } catch {
91
- return null;
92
- }
93
- }
94
-
95
- /**
96
- * Periodically logs process memory and thread counts to give ops a
97
- * cheap, always-on signal for the leak classes that have OOM-killed
98
- * prod (DuckDB connection thread pools, libuv worker pool, Malloy
99
- * compile heap, etc.).
100
- *
101
- * Logs at `info` so it shows up without flipping `LOG_LEVEL`. Volume
102
- * is low (~2 lines/minute by default). Pulls the memory governor's
103
- * snapshot too so RSS/back-pressure state appears in the same line as
104
- * Node/Bun heap.
105
- */
106
- export class ProcessStatsReporter {
107
- private timer: ReturnType<typeof setInterval> | null = null;
108
- private readonly intervalMs: number;
109
- private readonly memoryGovernor: PackageMemoryGovernor | null;
110
-
111
- constructor(
112
- memoryGovernor: PackageMemoryGovernor | null,
113
- intervalMs: number = DEFAULT_INTERVAL_MS,
114
- ) {
115
- this.memoryGovernor = memoryGovernor;
116
- this.intervalMs = intervalMs;
117
- }
118
-
119
- public start(): void {
120
- if (this.timer !== null) return;
121
- // Immediate first sample so a freshly-started pod logs its
122
- // baseline before the first 30s has elapsed.
123
- void this.tick();
124
- this.timer = setInterval(() => void this.tick(), this.intervalMs);
125
- // Don't keep the event loop alive on our account — if everything
126
- // else has shut down, the reporter shouldn't block exit.
127
- (
128
- this.timer as ReturnType<typeof setInterval> & {
129
- unref?: () => void;
130
- }
131
- ).unref?.();
132
- logger.info(
133
- `ProcessStatsReporter started (intervalMs=${this.intervalMs})`,
134
- );
135
- }
136
-
137
- public stop(): void {
138
- if (this.timer !== null) {
139
- clearInterval(this.timer);
140
- this.timer = null;
141
- }
142
- }
143
-
144
- private async tick(): Promise<void> {
145
- try {
146
- const mem = process.memoryUsage();
147
- const proc =
148
- process.platform === "linux" ? readLinuxProcStatus() : null;
149
- const bun = await readBunJscStats();
150
- const governor = this.memoryGovernor?.getStatus() ?? null;
151
-
152
- logger.info("process stats", {
153
- uptimeSeconds: Math.round(process.uptime()),
154
- nodeMemory: {
155
- rssBytes: mem.rss,
156
- heapTotalBytes: mem.heapTotal,
157
- heapUsedBytes: mem.heapUsed,
158
- externalBytes: mem.external,
159
- arrayBuffersBytes: mem.arrayBuffers,
160
- },
161
- linux: proc,
162
- bunJsc: bun,
163
- memoryGovernor: governor,
164
- });
165
- } catch (err) {
166
- logger.warn("ProcessStatsReporter tick failed", { error: err });
167
- }
168
- }
169
- }
@@ -1,123 +0,0 @@
1
- /**
2
- * Worker thread that owns one capped DuckDB connection and answers
3
- * schema-introspection requests for parquet/csv files. Running this
4
- * off the main thread isolates the native DuckDB thread pool — when
5
- * the worker exits, its threads die with it, which puts a hard
6
- * ceiling on the leak class that OOM-killed prod
7
- * (worker-76b49bdb89-8bsv4: 466 leaked Bun Pool threads).
8
- *
9
- * Protocol (parent ↔ worker):
10
- * parent → worker: { id, packagePath, databasePath }
11
- * worker → parent: { id, ok: true, result: SchemaResult }
12
- * | { id, ok: false, error: { message, stack? } }
13
- *
14
- * One request at a time per worker — the pool in the parent
15
- * (`schema_worker_pool.ts`) handles fan-out. Keeping the worker
16
- * single-threaded from the JS side matches DuckDB's behavior on a
17
- * single connection and avoids head-of-line blocking inside the
18
- * worker itself.
19
- */
20
- import { DuckDBConnection } from "@malloydata/db-duckdb";
21
- import "@malloydata/db-duckdb/native";
22
- import {
23
- ConnectionRuntime,
24
- EmptyURLReader,
25
- SourceDef,
26
- } from "@malloydata/malloy";
27
- import * as path from "path";
28
- import { parentPort } from "worker_threads";
29
-
30
- export interface SchemaRequest {
31
- id: number;
32
- packagePath: string;
33
- databasePath: string;
34
- }
35
-
36
- export interface SchemaResponse {
37
- id: number;
38
- ok: boolean;
39
- result?: {
40
- name: string;
41
- rowCount: number;
42
- columns: Array<{ type: string; name: string }>;
43
- };
44
- error?: { message: string; stack?: string };
45
- }
46
-
47
- if (!parentPort) {
48
- // Defensive: schema_worker.ts must only be loaded as a worker. If
49
- // someone accidentally imports it from the main thread the
50
- // connection below would still allocate its native pool there,
51
- // recreating the exact leak this file exists to fix.
52
- throw new Error("schema_worker.ts loaded outside a worker thread");
53
- }
54
-
55
- // One DuckDB connection per worker, capped tight. Schema introspection
56
- // reads parquet footers / csv headers — it does not need parallelism
57
- // or a large memory arena. The cap is what keeps the per-worker
58
- // native-thread cost bounded.
59
- const connection = new DuckDBConnection({
60
- name: "duckdb",
61
- databasePath: ":memory:",
62
- threads: 1,
63
- memoryLimit: "256MB",
64
- });
65
-
66
- async function handleRequest(req: SchemaRequest): Promise<SchemaResponse> {
67
- try {
68
- const fullPath = path.join(req.packagePath, req.databasePath);
69
- // DuckDB on Windows supports forward slashes, and this avoids
70
- // escaping issues in the inline SQL below.
71
- const normalizedPath = fullPath.replace(/\\/g, "/");
72
-
73
- const runtime = new ConnectionRuntime({
74
- urlReader: new EmptyURLReader(),
75
- connections: [connection],
76
- });
77
- const model = runtime.loadModel(
78
- `source: temp is duckdb.table('${normalizedPath}')`,
79
- );
80
- const modelDef = await model.getModel();
81
- const fields = (modelDef._modelDef.contents["temp"] as SourceDef).fields;
82
- const columns = fields.map((field) => ({
83
- type: String(field.type),
84
- name: field.name,
85
- }));
86
-
87
- const runner = model.loadQuery(
88
- "run: temp->{aggregate: row_count is count()}",
89
- );
90
- const result = await runner.run();
91
- const rowCount = result.data.value[0].row_count?.valueOf() as number;
92
-
93
- return {
94
- id: req.id,
95
- ok: true,
96
- result: { name: req.databasePath, rowCount, columns },
97
- };
98
- } catch (err) {
99
- const error = err instanceof Error ? err : new Error(String(err));
100
- return {
101
- id: req.id,
102
- ok: false,
103
- error: { message: error.message, stack: error.stack },
104
- };
105
- }
106
- }
107
-
108
- parentPort.on("message", async (msg: SchemaRequest) => {
109
- const response = await handleRequest(msg);
110
- parentPort!.postMessage(response);
111
- });
112
-
113
- // On any termination signal, close the connection so DuckDB releases
114
- // its native threads cleanly instead of leaking them past worker exit.
115
- const shutdown = async () => {
116
- try {
117
- await connection.close();
118
- } catch {
119
- // best effort
120
- }
121
- process.exit(0);
122
- };
123
- parentPort.on("close", () => void shutdown());