@malloy-publisher/server 0.0.198-dev3 → 0.0.198-dev6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,43 @@
1
1
  import * as fs from "fs/promises";
2
2
  import * as path from "path";
3
3
 
4
+ import "@malloydata/db-duckdb/native";
4
5
  import {
5
6
  Connection,
7
+ ConnectionRuntime,
6
8
  contextOverlay,
9
+ EmptyURLReader,
7
10
  FixedConnectionMap,
8
11
  MalloyConfig,
12
+ MalloyError,
13
+ SourceDef,
9
14
  } from "@malloydata/malloy";
10
15
  import { metrics } from "@opentelemetry/api";
11
16
  import recursive from "recursive-readdir";
12
17
  import { components } from "../api";
18
+ import { getPackageLoadPool } from "../package_load/package_load_pool";
13
19
  import {
14
20
  API_PREFIX,
15
21
  MODEL_FILE_SUFFIX,
16
22
  NOTEBOOK_FILE_SUFFIX,
17
23
  PACKAGE_MANIFEST_NAME,
18
24
  } from "../constants";
19
- import { PackageNotFoundError } from "../errors";
25
+ import {
26
+ ModelCompilationError,
27
+ PackageNotFoundError,
28
+ ServiceUnavailableError,
29
+ } from "../errors";
20
30
  import { formatDuration, logger } from "../logger";
21
31
  import { BuildManifest } from "../storage/DatabaseInterface";
22
32
  import { ignoreDotfiles } from "../utils";
23
33
  import { Model } from "./model";
24
- import { getSchemaWorkerPool } from "./schema_worker_pool";
25
34
 
26
35
  type ApiDatabase = components["schemas"]["Database"];
27
36
  type ApiModel = components["schemas"]["Model"];
28
37
  type ApiNotebook = components["schemas"]["Notebook"];
29
38
  export type ApiPackage = components["schemas"]["Package"];
39
+ type ApiColumn = components["schemas"]["Column"];
40
+ type ApiTableDescription = components["schemas"]["TableDescription"];
30
41
  // A thunk lets callers pass a live reference to the *current* environment
31
42
  // MalloyConfig so the package wrapper resolves environment connections against the
32
43
  // generation that's active at lookup time, not the one that was current when
@@ -87,26 +98,13 @@ export class Package {
87
98
  duration: formatDuration(manifestValidationTime - startTime),
88
99
  });
89
100
 
90
- let packageMalloyConfig: MalloyConfig | undefined;
91
-
92
101
  try {
93
- const packageConfig = await Package.readPackageConfig(packagePath);
94
- const packageConfigTime = performance.now();
95
- logger.info("Package config read completed", {
96
- packageName,
97
- duration: formatDuration(
98
- packageConfigTime - manifestValidationTime,
99
- ),
100
- });
101
- packageConfig.resource = `${API_PREFIX}/environments/${environmentName}/packages/${packageName}`;
102
-
103
- const databases = await Package.readDatabases(packagePath);
104
- const databasesTime = performance.now();
105
- logger.info("Databases read completed", {
106
- packageName,
107
- databaseCount: databases.length,
108
- duration: formatDuration(databasesTime - packageConfigTime),
109
- });
102
+ // The MalloyConfig is always built on the main thread — it
103
+ // owns the live native connection handles the package needs
104
+ // to *serve queries* after load (workers can't share native
105
+ // handles across the V8 isolate boundary). The worker proxies
106
+ // non-duckdb connection lookups back through this MalloyConfig
107
+ // during compile.
110
108
  const malloyConfig = Package.buildPackageMalloyConfig(
111
109
  packagePath,
112
110
  typeof environmentMalloyConfig === "function"
@@ -114,80 +112,30 @@ export class Package {
114
112
  : () => Package.toMalloyConfig(environmentMalloyConfig),
115
113
  );
116
114
 
117
- const models = await Package.loadModels(
118
- packageName,
119
- packagePath,
120
- malloyConfig,
121
- );
122
- const modelsTime = performance.now();
123
- logger.info("Models loaded", {
124
- packageName,
125
- modelCount: models.size,
126
- duration: formatDuration(modelsTime - databasesTime),
127
- });
128
- for (const [modelPath, model] of models.entries()) {
129
- const maybeModel = model as unknown as {
130
- compilationError?: unknown;
131
- };
132
- if (maybeModel.compilationError) {
133
- const err = maybeModel.compilationError;
134
- const message =
135
- err instanceof Error
136
- ? err.message
137
- : `Unknown compilation error in ${modelPath}`;
138
-
139
- logger.error("Model compilation failed", {
140
- packageName,
141
- modelPath,
142
- error: message,
143
- });
144
-
145
- this.packageLoadHistogram.record(performance.now() - startTime, {
146
- malloy_package_name: packageName,
147
- status: "compilation_error",
148
- });
149
- throw err;
150
- }
151
- }
152
- const endTime = performance.now();
153
- const executionTime = endTime - startTime;
154
- this.packageLoadHistogram.record(executionTime, {
155
- malloy_package_name: packageName,
156
- status: "success",
157
- });
158
- logger.info(`Successfully loaded package ${packageName}`, {
159
- packageName,
160
- duration: formatDuration(executionTime),
161
- });
162
- return new Package(
115
+ return await Package.loadViaWorker(
163
116
  environmentName,
164
117
  packageName,
165
118
  packagePath,
166
- packageConfig,
167
- databases,
168
- models,
169
119
  malloyConfig,
120
+ startTime,
121
+ manifestValidationTime,
170
122
  );
171
123
  } catch (error) {
172
124
  logger.error(`Error loading package ${packageName}`, { error });
173
125
  console.error(error);
174
126
  const endTime = performance.now();
175
127
  const executionTime = endTime - startTime;
128
+ const status =
129
+ error instanceof ModelCompilationError ||
130
+ error instanceof MalloyError
131
+ ? "compilation_error"
132
+ : error instanceof ServiceUnavailableError
133
+ ? "pool_unavailable"
134
+ : "error";
176
135
  this.packageLoadHistogram.record(executionTime, {
177
136
  malloy_package_name: packageName,
178
- status: "error",
137
+ status,
179
138
  });
180
-
181
- if (packageMalloyConfig) {
182
- try {
183
- await packageMalloyConfig.shutdown("close");
184
- } catch (releaseError) {
185
- logger.warn(
186
- `Failed to release package-local DuckDB for ${packageName}`,
187
- { error: releaseError },
188
- );
189
- }
190
- }
191
139
  // Clean up package directory on failure
192
140
  try {
193
141
  await fs.rm(packagePath, {
@@ -204,6 +152,141 @@ export class Package {
204
152
  }
205
153
  }
206
154
 
155
/**
 * Load the package via the package-load worker pool.
 *
 * The worker performs the CPU-bound bulk of the load off-thread
 * (manifest read, every `.malloy` / `.malloynb` compile) and ships back
 * a structured-clonable `LoadPackageOutcome`. Database probes run on the
 * main thread, concurrently with the worker job, through
 * `Package.readDatabases`.
 *
 * Both tasks are always awaited to completion before any failure is
 * propagated. The caller (`Package.create`) deletes `packagePath` when
 * this method throws, so rethrowing while the other task is still
 * reading from that directory would race the cleanup against in-flight
 * work.
 *
 * @param environmentName Environment the package belongs to; used to
 *   build the package's `resource` URI.
 * @param packageName Name of the package being loaded.
 * @param packagePath Directory of the package on disk.
 * @param malloyConfig Main-thread MalloyConfig for the package.
 * @param startTime `performance.now()` timestamp taken when the load
 *   began; used for the load-duration histogram.
 * @param manifestValidationTime `performance.now()` timestamp taken
 *   after the manifest check; used for timing logs only.
 * @returns The fully hydrated `Package`.
 * @throws ServiceUnavailableError when the pool itself rejects with
 *   anything other than a `MalloyError` / `ModelCompilationError`.
 * @throws The deserialized compilation error of the first model that
 *   failed to compile, or whatever `Package.readDatabases` rejected with.
 */
private static async loadViaWorker(
  environmentName: string,
  packageName: string,
  packagePath: string,
  malloyConfig: MalloyConfig,
  startTime: number,
  manifestValidationTime: number,
): Promise<Package> {
  const pool = getPackageLoadPool();
  const dispatchTime = performance.now();

  // The worker-job promise is wrapped on its own so that only pool
  // rejections are remapped; errors from `readDatabases` pass through
  // untouched.
  const workerOutcome = pool
    .loadPackage({
      packagePath,
      packageName,
      malloyConfig,
      defaultConnectionName: "duckdb",
    })
    .catch((err: unknown) => {
      // Per-model compile errors arrive in-band on
      // `outcome.models[i].compilationError`. A rejection that is
      // already a MalloyError / ModelCompilationError is rethrown
      // unchanged; anything else is treated as a pool-side failure
      // and surfaced as ServiceUnavailableError.
      const realError =
        err instanceof Error
          ? err
          : new Error(`Package-load worker pool failure: ${String(err)}`);
      if (
        realError instanceof MalloyError ||
        realError instanceof ModelCompilationError
      ) {
        throw realError;
      }
      throw new ServiceUnavailableError(
        `Package-load worker pool unavailable: ${realError.message}`,
      );
    });

  // allSettled (not all): wait for BOTH tasks before surfacing a
  // failure, so the caller's directory cleanup never overlaps with a
  // task that is still running. A worker failure takes precedence over
  // a database-probe failure when both reject.
  const [workerResult, databasesResult] = await Promise.allSettled([
    workerOutcome,
    Package.readDatabases(packagePath, malloyConfig),
  ]);
  if (workerResult.status === "rejected") {
    throw workerResult.reason;
  }
  if (databasesResult.status === "rejected") {
    throw databasesResult.reason;
  }
  const outcome = workerResult.value;
  const databases = databasesResult.value;

  const workerDoneTime = performance.now();
  logger.info("Package load via worker pool completed", {
    packageName,
    manifestValidationMs: dispatchTime - manifestValidationTime,
    workerDurationMs: outcome.loadDurationMs,
    // Wall time until both tasks settled minus the worker's own
    // duration, so this figure also absorbs any time the database
    // probes ran longer than the worker.
    dispatchOverheadMs:
      workerDoneTime - dispatchTime - outcome.loadDurationMs,
    modelCount: outcome.models.length,
    databaseCount: databases.length,
  });

  // The worker only returns name/description; the `resource` URI is
  // added here.
  const packageConfig: ApiPackage = {
    name: outcome.packageMetadata.name,
    description: outcome.packageMetadata.description,
    resource: `${API_PREFIX}/environments/${environmentName}/packages/${packageName}`,
  };

  // Hydrate live `Model`s from the worker output. The first per-model
  // compile failure aborts the whole load by rethrowing the
  // deserialized error.
  const models = new Map<string, Model>();
  for (const sm of outcome.models) {
    if (sm.compilationError) {
      const err = Model.deserializeCompilationError(sm.compilationError);
      logger.error("Model compilation failed", {
        packageName,
        modelPath: sm.modelPath,
        error: err.message,
      });
      // The catch in Package.create records the failure metric and
      // cleans up the package directory.
      throw err;
    }
    models.set(
      sm.modelPath,
      Model.fromSerialized(packageName, packagePath, malloyConfig, sm),
    );
  }

  const endTime = performance.now();
  const executionTime = endTime - startTime;
  this.packageLoadHistogram.record(executionTime, {
    malloy_package_name: packageName,
    status: "success",
  });
  logger.info(`Successfully loaded package ${packageName}`, {
    packageName,
    duration: formatDuration(executionTime),
  });

  return new Package(
    environmentName,
    packageName,
    packagePath,
    packageConfig,
    databases,
    models,
    malloyConfig,
  );
}
289
+
207
290
  public getPackageName(): string {
208
291
  return this.packageName;
209
292
  }
@@ -238,6 +321,21 @@ export class Package {
238
321
  return Array.from(this.models.keys());
239
322
  }
240
323
 
324
+ /**
325
+ * Re-compile every model in the package against a new build
326
+ * manifest (called after a materialization build commits new
327
+ * physicalised tables). Runs through the package-load worker pool
328
+ * — same off-main-thread compile path as initial `Package.create`
329
+ * — so a reload of a large package can't block the K8s liveness
330
+ * probe.
331
+ *
332
+ * Unlike `Package.create`, a per-model compile failure here does
333
+ * NOT abort the reload: we keep the failed model as a placeholder
334
+ * (`Model.fromCompilationError`) in `this.models`, matching the
335
+ * historical reload semantics. Whole-pool failures (worker crash,
336
+ * timeout, pool shutting down) propagate as `ServiceUnavailableError`
337
+ * — the caller (manifest service) decides how to retry.
338
+ */
241
339
  public async reloadAllModels(
242
340
  buildManifest: BuildManifest["entries"],
243
341
  ): Promise<void> {
@@ -247,20 +345,62 @@ export class Package {
247
345
  modelCount: modelPaths.length,
248
346
  manifestEntryCount: Object.keys(buildManifest).length,
249
347
  });
250
- const reloaded = await Promise.all(
251
- modelPaths.map((modelPath) =>
252
- Model.create(
253
- this.packageName,
254
- this.packagePath,
255
- modelPath,
256
- this.malloyConfig,
257
- { buildManifest },
258
- ),
259
- ),
260
- );
348
+
349
+ const pool = getPackageLoadPool();
350
+ let outcome;
351
+ try {
352
+ outcome = await pool.loadPackage({
353
+ packagePath: this.packagePath,
354
+ packageName: this.packageName,
355
+ malloyConfig: this.malloyConfig,
356
+ defaultConnectionName: "duckdb",
357
+ buildManifest,
358
+ });
359
+ } catch (err) {
360
+ const realError =
361
+ err instanceof Error
362
+ ? err
363
+ : new Error(`Package-load worker pool failure: ${String(err)}`);
364
+ if (
365
+ realError instanceof MalloyError ||
366
+ realError instanceof ModelCompilationError
367
+ ) {
368
+ throw realError;
369
+ }
370
+ throw new ServiceUnavailableError(
371
+ `Package-load worker pool unavailable: ${realError.message}`,
372
+ );
373
+ }
374
+
261
375
  const nextModels = new Map<string, Model>();
262
- for (const model of reloaded) {
263
- nextModels.set(model.getPath(), model);
376
+ for (const sm of outcome.models) {
377
+ if (sm.compilationError) {
378
+ const err = Model.deserializeCompilationError(sm.compilationError);
379
+ logger.warn("Model compilation failed during reload", {
380
+ packageName: this.packageName,
381
+ modelPath: sm.modelPath,
382
+ error: err.message,
383
+ });
384
+ nextModels.set(
385
+ sm.modelPath,
386
+ Model.fromCompilationError(
387
+ this.packageName,
388
+ sm.modelPath,
389
+ sm.modelType,
390
+ err,
391
+ ),
392
+ );
393
+ } else {
394
+ nextModels.set(
395
+ sm.modelPath,
396
+ Model.fromSerialized(
397
+ this.packageName,
398
+ this.packagePath,
399
+ this.malloyConfig,
400
+ sm,
401
+ ),
402
+ );
403
+ }
264
404
  }
265
405
  this.models = nextModels;
266
406
  }
@@ -323,20 +463,6 @@ export class Package {
323
463
  );
324
464
  }
325
465
 
326
- private static async loadModels(
327
- packageName: string,
328
- packagePath: string,
329
- malloyConfig: MalloyConfig,
330
- ): Promise<Map<string, Model>> {
331
- const modelPaths = await Package.getModelPaths(packagePath);
332
- const models = await Promise.all(
333
- modelPaths.map((modelPath) =>
334
- Model.create(packageName, packagePath, modelPath, malloyConfig),
335
- ),
336
- );
337
- return new Map(models.map((model) => [model.getPath(), model]));
338
- }
339
-
340
466
  private static buildPackageMalloyConfig(
341
467
  packagePath: string,
342
468
  getEnvironmentMalloyConfig: () => MalloyConfig,
@@ -386,27 +512,6 @@ export class Package {
386
512
  return malloyConfig;
387
513
  }
388
514
 
389
- private static async getModelPaths(packagePath: string): Promise<string[]> {
390
- let files = undefined;
391
- try {
392
- files = await recursive(packagePath, [ignoreDotfiles]);
393
- } catch (error) {
394
- logger.error(error);
395
- throw new PackageNotFoundError(
396
- `Package config for ${packagePath} does not exist.`,
397
- );
398
- }
399
- return files
400
- .map((fullPath: string) => {
401
- return path.relative(packagePath, fullPath).replace(/\\/g, "/");
402
- })
403
- .filter(
404
- (modelPath: string) =>
405
- modelPath.endsWith(MODEL_FILE_SUFFIX) ||
406
- modelPath.endsWith(NOTEBOOK_FILE_SUFFIX),
407
- );
408
- }
409
-
410
515
  private static async validatePackageManifestExistsOrThrowError(
411
516
  packagePath: string,
412
517
  ) {
@@ -421,59 +526,33 @@ export class Package {
421
526
  }
422
527
  }
423
528
 
424
- private static async readPackageConfig(
425
- packagePath: string,
426
- ): Promise<ApiPackage> {
427
- const packageConfigPath = path.join(packagePath, PACKAGE_MANIFEST_NAME);
428
- const packageConfigContents = await fs.readFile(packageConfigPath);
429
- // TODO: Validate package manifest. Define manifest type in public API.
430
- const packageManifest = JSON.parse(packageConfigContents.toString());
431
- return {
432
- name: packageManifest.name,
433
- description: packageManifest.description,
434
- };
435
- }
436
-
437
529
  private static async readDatabases(
438
530
  packagePath: string,
531
+ malloyConfig: MalloyConfig,
439
532
  ): Promise<ApiDatabase[]> {
440
533
  const databasePaths = await Package.getDatabasePaths(packagePath);
441
- if (databasePaths.length === 0) return [];
442
-
443
- // Off-main-thread: schema introspection runs in the
444
- // SchemaWorkerPool so DuckDB's native thread pool lives inside
445
- // a worker we control. This is the leak class that OOM-killed
446
- // prod (466 leaked Bun Pool threads on worker-76b49bdb89-8bsv4)
447
- // — worker isolation puts a hard ceiling on per-package native
448
- // thread usage and the worker's connection is reused across all
449
- // schema queries for the life of the process.
450
- const pool = getSchemaWorkerPool();
451
- const settled = await Promise.allSettled(
452
- databasePaths.map((databasePath) =>
453
- pool.submit(packagePath, databasePath),
454
- ),
455
- );
456
-
457
- const results: ApiDatabase[] = [];
458
- for (let i = 0; i < settled.length; i++) {
459
- const outcome = settled[i];
460
- if (outcome.status === "fulfilled") {
461
- results.push({
462
- path: databasePaths[i],
463
- info: outcome.value,
464
- type: "embedded",
465
- });
466
- } else {
467
- // A single bad parquet (corrupt footer, unsupported type)
468
- // must not fail the whole package load. Log and skip.
469
- logger.warn("Schema introspection failed for database", {
470
- packagePath,
471
- databasePath: databasePaths[i],
472
- error: outcome.reason,
473
- });
474
- }
534
+ if (databasePaths.length === 0) {
535
+ return [];
475
536
  }
476
- return results;
537
+ // Resolve the package's duckdb connection ONCE and reuse it for
538
+ // every schema/row-count probe in this package. Malloy caches the
539
+ // materialized connection on the MalloyConfig so the same instance
540
+ // will be returned to model compiles later in `Package.create`.
541
+ // This is the substantive optimization over the previous code:
542
+ // we go from `databasePaths.length` separate DuckDBConnections
543
+ // (each doing its own native init + extension load) to one.
544
+ const conn = await malloyConfig.connections.lookupConnection("duckdb");
545
+ return await Promise.all(
546
+ databasePaths.map(async (databasePath) => ({
547
+ path: databasePath,
548
+ info: await Package.getDatabaseInfo(
549
+ packagePath,
550
+ databasePath,
551
+ conn,
552
+ ),
553
+ type: "embedded" as const,
554
+ })),
555
+ );
477
556
  }
478
557
 
479
558
  private static async getDatabasePaths(
@@ -490,6 +569,43 @@ export class Package {
490
569
  );
491
570
  }
492
571
 
572
/**
 * Describe one embedded database file: its column names and types plus
 * its row count.
 *
 * A throwaway Malloy source is declared over the file with
 * `duckdb.table(...)`; the columns are read from the compiled source
 * definition and the row count comes from running a `count()` query.
 *
 * @param packagePath Directory of the package on disk.
 * @param databasePath Path of the database file relative to
 *   `packagePath`; also used as the returned `name`.
 * @param conn Connection shared by the caller across all probes.
 * @returns The table description (`name`, `rowCount`, `columns`).
 * @throws Error if the resolved file path contains a single quote, and
 *   whatever the model compile or query run rejects with.
 */
private static async getDatabaseInfo(
  packagePath: string,
  databasePath: string,
  conn: Connection,
): Promise<ApiTableDescription> {
  const fullPath = path.join(packagePath, databasePath);

  // One runtime per probe, all wrapping the caller's shared connection.
  const runtime = new ConnectionRuntime({
    urlReader: new EmptyURLReader(),
    connections: [conn],
  });

  // Use forward slashes only, so the path contains no backslashes when
  // it is embedded in the Malloy string literal below.
  const normalizedPath = fullPath.replace(/\\/g, "/");

  // The path is spliced into a single-quoted Malloy string. A quote in
  // a file name would end that literal early and let the rest of the
  // name be parsed as Malloy source, so refuse it explicitly instead of
  // compiling whatever results.
  if (normalizedPath.includes("'")) {
    throw new Error(
      `Cannot probe database "${databasePath}": file paths containing a single quote are not supported.`,
    );
  }

  const model = runtime.loadModel(
    `source: temp is duckdb.table('${normalizedPath}')`,
  );
  const modelDef = await model.getModel();
  const fields = (modelDef._modelDef.contents["temp"] as SourceDef).fields;
  const schema = fields.map((field): ApiColumn => {
    return { type: field.type, name: field.name };
  });

  const runner = model.loadQuery(
    "run: temp->{aggregate: row_count is count()}",
  );
  const result = await runner.run();
  const rowCount = result.data.value[0].row_count?.valueOf() as number;
  return { name: databasePath, rowCount, columns: schema };
}
608
+
493
609
  public setName(name: string) {
494
610
  this.packageName = name;
495
611
  }