@malloy-publisher/server 0.0.198-dev3 → 0.0.198-dev4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,21 @@
1
1
  import * as fs from "fs/promises";
2
2
  import * as path from "path";
3
3
 
4
+ import { DuckDBConnection } from "@malloydata/db-duckdb";
5
+ import "@malloydata/db-duckdb/native";
4
6
  import {
5
7
  Connection,
8
+ ConnectionRuntime,
6
9
  contextOverlay,
10
+ EmptyURLReader,
7
11
  FixedConnectionMap,
8
12
  MalloyConfig,
13
+ SourceDef,
9
14
  } from "@malloydata/malloy";
10
15
  import { metrics } from "@opentelemetry/api";
11
16
  import recursive from "recursive-readdir";
12
17
  import { components } from "../api";
18
+ import { getCompilePool } from "../compile/compile_pool";
13
19
  import {
14
20
  API_PREFIX,
15
21
  MODEL_FILE_SUFFIX,
@@ -21,12 +27,13 @@ import { formatDuration, logger } from "../logger";
21
27
  import { BuildManifest } from "../storage/DatabaseInterface";
22
28
  import { ignoreDotfiles } from "../utils";
23
29
  import { Model } from "./model";
24
- import { getSchemaWorkerPool } from "./schema_worker_pool";
25
30
 
26
31
  type ApiDatabase = components["schemas"]["Database"];
27
32
  type ApiModel = components["schemas"]["Model"];
28
33
  type ApiNotebook = components["schemas"]["Notebook"];
29
34
  export type ApiPackage = components["schemas"]["Package"];
35
+ type ApiColumn = components["schemas"]["Column"];
36
+ type ApiTableDescription = components["schemas"]["TableDescription"];
30
37
  // A thunk lets callers pass a live reference to the *current* environment
31
38
  // MalloyConfig so the package wrapper resolves environment connections against the
32
39
  // generation that's active at lookup time, not the one that was current when
@@ -87,8 +94,6 @@ export class Package {
87
94
  duration: formatDuration(manifestValidationTime - startTime),
88
95
  });
89
96
 
90
- let packageMalloyConfig: MalloyConfig | undefined;
91
-
92
97
  try {
93
98
  const packageConfig = await Package.readPackageConfig(packagePath);
94
99
  const packageConfigTime = performance.now();
@@ -177,17 +182,6 @@ export class Package {
177
182
  malloy_package_name: packageName,
178
183
  status: "error",
179
184
  });
180
-
181
- if (packageMalloyConfig) {
182
- try {
183
- await packageMalloyConfig.shutdown("close");
184
- } catch (releaseError) {
185
- logger.warn(
186
- `Failed to release package-local DuckDB for ${packageName}`,
187
- { error: releaseError },
188
- );
189
- }
190
- }
191
185
  // Clean up package directory on failure
192
186
  try {
193
187
  await fs.rm(packagePath, {
@@ -437,43 +431,22 @@ export class Package {
437
431
  private static async readDatabases(
438
432
  packagePath: string,
439
433
  ): Promise<ApiDatabase[]> {
440
- const databasePaths = await Package.getDatabasePaths(packagePath);
441
- if (databasePaths.length === 0) return [];
442
-
443
- // Off-main-thread: schema introspection runs in the
444
- // SchemaWorkerPool so DuckDB's native thread pool lives inside
445
- // a worker we control. This is the leak class that OOM-killed
446
- // prod (466 leaked Bun Pool threads on worker-76b49bdb89-8bsv4)
447
- // — worker isolation puts a hard ceiling on per-package native
448
- // thread usage and the worker's connection is reused across all
449
- // schema queries for the life of the process.
450
- const pool = getSchemaWorkerPool();
451
- const settled = await Promise.allSettled(
452
- databasePaths.map((databasePath) =>
453
- pool.submit(packagePath, databasePath),
434
+ return await Promise.all(
435
+ (await Package.getDatabasePaths(packagePath)).map(
436
+ async (databasePath) => {
437
+ const databaseInfo = await Package.getDatabaseInfo(
438
+ packagePath,
439
+ databasePath,
440
+ );
441
+
442
+ return {
443
+ path: databasePath,
444
+ info: databaseInfo,
445
+ type: "embedded",
446
+ };
447
+ },
454
448
  ),
455
449
  );
456
-
457
- const results: ApiDatabase[] = [];
458
- for (let i = 0; i < settled.length; i++) {
459
- const outcome = settled[i];
460
- if (outcome.status === "fulfilled") {
461
- results.push({
462
- path: databasePaths[i],
463
- info: outcome.value,
464
- type: "embedded",
465
- });
466
- } else {
467
- // A single bad parquet (corrupt footer, unsupported type)
468
- // must not fail the whole package load. Log and skip.
469
- logger.warn("Schema introspection failed for database", {
470
- packagePath,
471
- databasePath: databasePaths[i],
472
- error: outcome.reason,
473
- });
474
- }
475
- }
476
- return results;
477
450
  }
478
451
 
479
452
  private static async getDatabasePaths(
@@ -490,6 +463,140 @@ export class Package {
490
463
  );
491
464
  }
492
465
 
466
/**
 * Probes a single embedded data file and returns its table description
 * (name, row count and column list).
 *
 * A dedicated `DuckDBConnection` is created for the call, shared by the
 * schema probe and the row-count query, and closed again before the method
 * returns (on success or failure) so its native resources are released.
 *
 * @param packagePath Root directory of the package on disk.
 * @param databasePath Path of the data file relative to `packagePath`.
 * @returns The table description for the file.
 * @throws Whatever the schema probe or the row-count query throws.
 */
private static async getDatabaseInfo(
   packagePath: string,
   databasePath: string,
): Promise<ApiTableDescription> {
   const fullPath = path.join(packagePath, databasePath);
   // Normalize to forward slashes for cross-platform compatibility.
   // DuckDB on Windows supports forward slashes, which avoids escaping issues.
   const normalizedPath = fullPath.replace(/\\/g, "/");

   // One DuckDB connection per file, reused for both the schema probe and
   // the row-count SQL so native init is only paid once per call.
   const conn = new DuckDBConnection("duckdb");

   try {
      // Schema probe. API consumers rely on Malloy type strings rather than
      // DuckDB native types, so this stays a Malloy compile. When the
      // compile pool is enabled the synthetic snippet is compiled in a
      // worker; otherwise it is compiled in-process.
      const pool = getCompilePool();
      const schema: ApiColumn[] = pool.enabled
         ? await Package.getSchemaViaPool(
              pool,
              packagePath,
              normalizedPath,
              conn,
           )
         : await Package.getSchemaInProcess(normalizedPath, conn);

      // Row count. Issued directly against DuckDB instead of compiling a
      // Malloy aggregate query. Single quotes in the path are doubled so a
      // file name cannot terminate the SQL string literal.
      const escapedPath = normalizedPath.replace(/'/g, "''");
      const sqlResult = await conn.runSQL(
         `SELECT count(*)::BIGINT AS row_count FROM '${escapedPath}'`,
      );
      // Guard against an empty result set instead of dereferencing
      // `undefined`; a missing row is reported as a count of 0.
      const firstRow = sqlResult.rows[0] as
         | { row_count?: bigint | number }
         | undefined;
      const rowCount = Number(firstRow?.row_count ?? 0);

      return { name: databasePath, rowCount, columns: schema };
   } finally {
      // Always release the per-file connection. Closing is best-effort: a
      // failure here is logged and must not mask the probe's own outcome.
      try {
         await conn.close();
      } catch (closeError) {
         logger.warn("Failed to close DuckDB connection for database probe", {
            normalizedPath,
            error:
               closeError instanceof Error
                  ? closeError.message
                  : String(closeError),
         });
      }
   }
}
517
+
518
/**
 * In-process schema probe (legacy / kill-switch path).
 *
 * Wraps `conn` in a minimal `ConnectionRuntime`, compiles the synthetic
 * `source: temp is duckdb.table(...)` snippet on the calling thread and
 * reads the column list off the resulting `temp` source.
 *
 * NOTE(review): `normalizedPath` is interpolated into the snippet unescaped;
 * a path containing a single quote would presumably break the snippet —
 * confirm against Malloy's string-literal rules.
 *
 * @param normalizedPath Forward-slash path of the data file.
 * @param conn DuckDB connection used to fetch the table schema.
 * @returns Name and Malloy type of every field of the table.
 */
private static async getSchemaInProcess(
   normalizedPath: string,
   conn: DuckDBConnection,
): Promise<ApiColumn[]> {
   const snippet = `source: temp is duckdb.table('${normalizedPath}')`;
   const runtime = new ConnectionRuntime({
      urlReader: new EmptyURLReader(),
      connections: [conn],
   });

   const compiled = await runtime.loadModel(snippet).getModel();
   const tempSource = compiled._modelDef.contents["temp"] as SourceDef;

   const columns: ApiColumn[] = [];
   for (const field of tempSource.fields) {
      columns.push({ type: field.type, name: field.name });
   }
   return columns;
}
540
+
541
/**
 * Worker-pool schema probe.
 *
 * The synthetic Malloy snippet is handed to `pool.compileInline` together
 * with a `MalloyConfig` whose connection lookup always resolves to `conn`,
 * so schema fetches made on behalf of the compile are served by the same
 * connection instance the caller uses for its row-count SQL. The column
 * list is read from the `temp` source of the returned modelDef.
 *
 * If the pool call fails with an `Error` whose message does not look like a
 * transient pool problem (timeout, worker exit, shutdown) the error is
 * rethrown. Any other failure is logged and the probe is retried in-process
 * via `Package.getSchemaInProcess`.
 *
 * @param pool Compile pool obtained from `getCompilePool()`.
 * @param packagePath Root directory of the package on disk.
 * @param normalizedPath Forward-slash path of the data file.
 * @param conn DuckDB connection that serves schema lookups.
 * @returns Name and Malloy type of every field of the table.
 */
private static async getSchemaViaPool(
   pool: ReturnType<typeof getCompilePool>,
   packagePath: string,
   normalizedPath: string,
   conn: DuckDBConnection,
): Promise<ApiColumn[]> {
   // Route every connection lookup to *this* conn by wrapping it in a
   // MalloyConfig. The config itself is not shipped across the worker
   // boundary; the pool holds it on the main side.
   const malloyConfig = new MalloyConfig(
      { connections: {} },
      { config: contextOverlay({ rootDirectory: packagePath }) },
   );
   malloyConfig.wrapConnections(() => ({
      lookupConnection: async (_name?: string) =>
         conn as unknown as Connection,
   }));

   try {
      const outcome = await pool.compileInline({
         packagePath,
         source: `source: temp is duckdb.table('${normalizedPath}')`,
         malloyConfig,
         defaultConnectionName: "duckdb",
      });
      const modelDef = outcome.modelDef as unknown as {
         contents: Record<string, SourceDef>;
      };
      return modelDef.contents["temp"].fields.map(
         (field): ApiColumn => ({ type: field.type, name: field.name }),
      );
   } catch (error) {
      // Derive a printable message even when the thrown value is not an
      // Error, so the warning below never logs `undefined`.
      const message = error instanceof Error ? error.message : String(error);
      const looksTransient =
         /timed out|exited unexpectedly|shutting down/i.test(message);

      // Compile errors mean the file is not readable, which the in-process
      // path would also reject — let those propagate unchanged.
      if (error instanceof Error && !looksTransient) {
         throw error;
      }

      // Transient pool issues (worker exit, RPC timeout) shouldn't break
      // package loading: fall back to compiling on this thread.
      logger.warn(
         "Compile worker failed for database probe; falling back to in-process",
         { normalizedPath, error: message },
      );
      return Package.getSchemaInProcess(normalizedPath, conn);
   }
}
599
+
493
600
  public setName(name: string) {
494
601
  this.packageName = name;
495
602
  }
@@ -1,61 +0,0 @@
1
- // src/service/schema_worker.ts
2
- import { DuckDBConnection } from "@malloydata/db-duckdb";
3
- import"@malloydata/db-duckdb/native";
4
- import {
5
- ConnectionRuntime,
6
- EmptyURLReader
7
- } from "@malloydata/malloy";
8
- import * as path from "path";
9
- import { parentPort } from "worker_threads";
10
// Bundled worker entry: refuses to run outside a worker thread, owns one
// in-memory DuckDB connection, and answers one schema request per message.
if (!parentPort) {
  throw new Error("schema_worker.ts loaded outside a worker thread");
}

const connection = new DuckDBConnection({
  name: "duckdb",
  databasePath: ":memory:",
  threads: 1,
  memoryLimit: "256MB"
});

/**
 * Probes the file named by `req` and returns either
 * `{ id, ok: true, result }` or `{ id, ok: false, error }`; failures are
 * reported in the response rather than thrown.
 */
async function handleRequest(req) {
  try {
    const normalizedPath = path
      .join(req.packagePath, req.databasePath)
      .replace(/\\/g, "/");
    const runtime = new ConnectionRuntime({
      urlReader: new EmptyURLReader(),
      connections: [connection]
    });
    const model = runtime.loadModel(`source: temp is duckdb.table('${normalizedPath}')`);
    const compiled = await model.getModel();
    const columns = [];
    for (const field of compiled._modelDef.contents["temp"].fields) {
      columns.push({ type: String(field.type), name: field.name });
    }
    const queryResult = await model
      .loadQuery("run: temp->{aggregate: row_count is count()}")
      .run();
    const rowCount = queryResult.data.value[0].row_count?.valueOf();
    return {
      id: req.id,
      ok: true,
      result: { name: req.databasePath, rowCount, columns }
    };
  } catch (err) {
    const failure = err instanceof Error ? err : new Error(String(err));
    return {
      id: req.id,
      ok: false,
      error: { message: failure.message, stack: failure.stack }
    };
  }
}

/** Closes the connection (ignoring close errors), then calls process.exit(0). */
async function shutdown() {
  try {
    await connection.close();
  } catch {}
  process.exit(0);
}

parentPort.on("message", async (msg) => {
  const reply = await handleRequest(msg);
  parentPort.postMessage(reply);
});
parentPort.on("close", () => {
  void shutdown();
});
@@ -1,169 +0,0 @@
1
- import * as fs from "fs";
2
-
3
- import { logger } from "../logger";
4
- import type { PackageMemoryGovernor } from "./package_memory_governor";
5
-
6
- const DEFAULT_INTERVAL_MS = 30_000;
7
-
8
- interface LinuxProcStatus {
9
- threads?: number;
10
- vmRssBytes?: number;
11
- vmSizeBytes?: number;
12
- vmPeakBytes?: number;
13
- vmDataBytes?: number;
14
- voluntaryCtxSwitches?: number;
15
- nonvoluntaryCtxSwitches?: number;
16
- }
17
-
18
/**
 * Reads `/proc/self/status` synchronously and extracts the thread count,
 * the VmRSS / VmSize / VmPeak / VmData sizes and the two context-switch
 * counters.
 *
 * Each line has the form `Key:\t<value> [unit]`. Size fields are converted
 * with `kBToBytes` so they are expressed in bytes; the other fields are
 * parsed with `Number`.
 *
 * @returns The parsed fields (keys that are absent stay unset), or `null`
 *          when reading or parsing throws.
 */
function readLinuxProcStatus(): LinuxProcStatus | null {
   try {
      const status: LinuxProcStatus = {};
      const text = fs.readFileSync("/proc/self/status", "utf8");

      text.split("\n").forEach((entry) => {
         const [rawKey, rawValue] = entry.split(":");
         if (!rawKey || !rawValue) return;

         const field = rawKey.trim();
         const text = rawValue.trim();

         if (field === "Threads") {
            status.threads = Number(text);
         } else if (field === "VmRSS") {
            status.vmRssBytes = kBToBytes(text);
         } else if (field === "VmSize") {
            status.vmSizeBytes = kBToBytes(text);
         } else if (field === "VmPeak") {
            status.vmPeakBytes = kBToBytes(text);
         } else if (field === "VmData") {
            status.vmDataBytes = kBToBytes(text);
         } else if (field === "voluntary_ctxt_switches") {
            status.voluntaryCtxSwitches = Number(text);
         } else if (field === "nonvoluntary_ctxt_switches") {
            status.nonvoluntaryCtxSwitches = Number(text);
         }
      });

      return status;
   } catch {
      return null;
   }
}
64
-
65
/**
 * Converts a `/proc`-style size value such as `"1024 kB"` to bytes.
 *
 * A trailing `kB` unit (with optional preceding whitespace) is stripped and
 * the remainder is parsed as a number of kibibytes.
 *
 * @param value Raw value text, with or without the `kB` suffix.
 * @returns The size in bytes, or `undefined` when the value is empty or is
 *          not a finite number. An empty value is rejected explicitly
 *          because `Number("")` would otherwise report it as 0 bytes.
 */
function kBToBytes(value: string): number | undefined {
   const numeric = value.replace(/\s*kB$/, "").trim();
   if (numeric === "") return undefined;
   const kilobytes = Number(numeric);
   return Number.isFinite(kilobytes) ? kilobytes * 1024 : undefined;
}
70
-
71
/**
 * Collects JavaScriptCore heap statistics when running under Bun.
 *
 * Returns `null` immediately when `globalThis.Bun` is undefined. Otherwise
 * `bun:jsc` is loaded with a dynamic import and the results of its optional
 * `heapStats()` and `memoryUsage()` functions are merged into one record
 * (memoryUsage keys win on collision).
 *
 * @returns The merged stats, or `null` when neither function yields a value
 *          or when anything throws — this function never rejects.
 */
async function readBunJscStats(): Promise<Record<string, number> | null> {
   const bunGlobal = (globalThis as { Bun?: unknown }).Bun;
   if (typeof bunGlobal === "undefined") {
      return null;
   }

   try {
      // Dynamic import so Node builds don't fail at parse time.
      const jsc = (await import("bun:jsc")) as unknown as {
         heapStats?: () => Record<string, number>;
         memoryUsage?: () => Record<string, number>;
      };
      const heapStats = jsc.heapStats?.();
      const memoryStats = jsc.memoryUsage?.();
      if (heapStats || memoryStats) {
         return { ...(heapStats ?? {}), ...(memoryStats ?? {}) };
      }
      return null;
   } catch {
      return null;
   }
}
94
-
95
/**
 * Periodically logs a "process stats" line at `info` level containing
 * `process.memoryUsage()`, selected `/proc/self/status` fields (Linux
 * only), Bun JSC heap stats (when available) and the memory governor's
 * status (when a governor was supplied).
 *
 * The timer is unref'd where the runtime supports it, so the reporter does
 * not keep the event loop alive on its own.
 */
export class ProcessStatsReporter {
   private timer: ReturnType<typeof setInterval> | null = null;
   private readonly intervalMs: number;
   private readonly memoryGovernor: PackageMemoryGovernor | null;

   /**
    * @param memoryGovernor Governor whose status is included in each log
    *        line, or `null` to omit it.
    * @param intervalMs Sampling period in milliseconds.
    */
   constructor(
      memoryGovernor: PackageMemoryGovernor | null,
      intervalMs: number = DEFAULT_INTERVAL_MS,
   ) {
      this.intervalMs = intervalMs;
      this.memoryGovernor = memoryGovernor;
   }

   /** Starts sampling; a no-op when the reporter is already running. */
   public start(): void {
      if (this.timer !== null) {
         return;
      }

      // Sample once right away so the baseline is logged before the first
      // interval has elapsed.
      void this.tick();

      const handle = setInterval(() => {
         void this.tick();
      }, this.intervalMs);
      this.timer = handle;

      // Let the process exit even while the reporter is still scheduled.
      const unrefable = handle as ReturnType<typeof setInterval> & {
         unref?: () => void;
      };
      unrefable.unref?.();

      logger.info(
         `ProcessStatsReporter started (intervalMs=${this.intervalMs})`,
      );
   }

   /** Stops sampling; a no-op when the reporter is not running. */
   public stop(): void {
      if (this.timer === null) {
         return;
      }
      clearInterval(this.timer);
      this.timer = null;
   }

   /** Takes one sample and logs it; failures are logged, never thrown. */
   private async tick(): Promise<void> {
      try {
         const {
            rss,
            heapTotal,
            heapUsed,
            external,
            arrayBuffers,
         } = process.memoryUsage();
         const linux =
            process.platform === "linux" ? readLinuxProcStatus() : null;
         const bunJsc = await readBunJscStats();
         const memoryGovernor = this.memoryGovernor?.getStatus() ?? null;

         logger.info("process stats", {
            uptimeSeconds: Math.round(process.uptime()),
            nodeMemory: {
               rssBytes: rss,
               heapTotalBytes: heapTotal,
               heapUsedBytes: heapUsed,
               externalBytes: external,
               arrayBuffersBytes: arrayBuffers,
            },
            linux,
            bunJsc,
            memoryGovernor,
         });
      } catch (err) {
         logger.warn("ProcessStatsReporter tick failed", { error: err });
      }
   }
}
@@ -1,123 +0,0 @@
1
- /**
2
- * Worker thread that owns one capped DuckDB connection and answers
3
- * schema-introspection requests for parquet/csv files. Running this
4
- * off the main thread isolates the native DuckDB thread pool — when
5
- * the worker exits, its threads die with it, which puts a hard
6
- * ceiling on the leak class that OOM-killed prod
7
- * (worker-76b49bdb89-8bsv4: 466 leaked Bun Pool threads).
8
- *
9
- * Protocol (parent ↔ worker):
10
- * parent → worker: { id, packagePath, databasePath }
11
- * worker → parent: { id, ok: true, result: SchemaResult }
12
- * | { id, ok: false, error: { message, stack? } }
13
- *
14
- * One request at a time per worker — the pool in the parent
15
- * (`schema_worker_pool.ts`) handles fan-out. Keeping the worker
16
- * single-threaded from the JS side matches DuckDB's behavior on a
17
- * single connection and avoids head-of-line blocking inside the
18
- * worker itself.
19
- */
20
- import { DuckDBConnection } from "@malloydata/db-duckdb";
21
- import "@malloydata/db-duckdb/native";
22
- import {
23
- ConnectionRuntime,
24
- EmptyURLReader,
25
- SourceDef,
26
- } from "@malloydata/malloy";
27
- import * as path from "path";
28
- import { parentPort } from "worker_threads";
29
-
30
/** Message sent from the parent to the worker: probe one data file. */
export interface SchemaRequest {
   /** Correlation id, echoed back in the matching response. */
   id: number;
   /** Package root directory. */
   packagePath: string;
   /** Data file path, joined onto `packagePath` by the worker. */
   databasePath: string;
}

/**
 * Message sent from the worker back to the parent. `result` accompanies
 * `ok: true`; `error` accompanies `ok: false`.
 */
export interface SchemaResponse {
   /** Correlation id copied from the request. */
   id: number;
   /** Whether the probe succeeded. */
   ok: boolean;
   /** Table description, present on success. */
   result?: {
      name: string;
      rowCount: number;
      columns: { type: string; name: string }[];
   };
   /** Failure details, present on error. */
   error?: { message: string; stack?: string };
}
46
-
47
// Defensive guard: this module must only be loaded as a worker. Without a
// parent port it refuses to continue, so the connection below is never
// created on the importing thread.
if (parentPort === null) {
   throw new Error("schema_worker.ts loaded outside a worker thread");
}

// Single in-memory DuckDB connection shared by every request this worker
// handles, configured with one thread and a 256MB memory limit.
const connection = new DuckDBConnection({
   name: "duckdb",
   databasePath: ":memory:",
   memoryLimit: "256MB",
   threads: 1,
});
65
-
66
/**
 * Handles one schema request against the worker's shared connection.
 *
 * Compiles `source: temp is duckdb.table('<file>')` to obtain the field
 * list, then runs a Malloy `count()` aggregate for the row count.
 *
 * @param req Request naming the package root and the data file.
 * @returns `{ id, ok: true, result }` on success; on any thrown value,
 *          `{ id, ok: false, error }` carrying its message and stack. The
 *          failure is reported in the response rather than thrown.
 */
async function handleRequest(req: SchemaRequest): Promise<SchemaResponse> {
   try {
      // Forward slashes work for DuckDB on Windows and avoid escaping
      // issues in the inline snippet below.
      const normalizedPath = path
         .join(req.packagePath, req.databasePath)
         .replace(/\\/g, "/");

      const runtime = new ConnectionRuntime({
         urlReader: new EmptyURLReader(),
         connections: [connection],
      });
      const model = runtime.loadModel(
         `source: temp is duckdb.table('${normalizedPath}')`,
      );

      const compiled = await model.getModel();
      const tempSource = compiled._modelDef.contents["temp"] as SourceDef;
      const columns: Array<{ type: string; name: string }> = [];
      for (const field of tempSource.fields) {
         columns.push({ type: String(field.type), name: field.name });
      }

      const queryResult = await model
         .loadQuery("run: temp->{aggregate: row_count is count()}")
         .run();
      const rowCount = queryResult.data.value[0].row_count?.valueOf() as number;

      return {
         id: req.id,
         ok: true,
         result: { name: req.databasePath, rowCount, columns },
      };
   } catch (err) {
      // Normalize non-Error throwables so the response always has a message.
      const failure = err instanceof Error ? err : new Error(String(err));
      return {
         id: req.id,
         ok: false,
         error: { message: failure.message, stack: failure.stack },
      };
   }
}
107
-
108
// Answer each incoming request with exactly one response message.
parentPort.on("message", async (msg: SchemaRequest) => {
   parentPort!.postMessage(await handleRequest(msg));
});

/**
 * Closes the DuckDB connection (best effort — close errors are ignored)
 * and then calls `process.exit(0)`.
 */
async function shutdown(): Promise<void> {
   try {
      await connection.close();
   } catch {
      // best effort
   }
   process.exit(0);
}

// When the parent port closes, release the connection before exiting.
parentPort.on("close", () => {
   void shutdown();
});