@malloy-publisher/server 0.0.198-dev4 → 0.0.198-dev6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
1
+ /**
2
+ * Wire protocol between the main thread (`PackageLoadPool`) and a
3
+ * package-load worker thread.
4
+ *
5
+ * Boundary
6
+ * --------
7
+ * The worker performs the **CPU-bound bulk of `Package.create`** off
8
+ * the main event loop:
9
+ *
10
+ * 1. Read `publisher.config.json` (cheap, but already on the worker
11
+ * side of the boundary so the main thread isn't blocked).
12
+ * 2. Compile every `.malloy` / `.malloynb` (the Malloy parser,
13
+ * type-checker, and IR-builder — the dominant CPU cost).
14
+ * 3. Return a structured-clonable POJO carrying every `modelDef`,
15
+ * `sourceInfos`, dataStyles, etc. that the main thread needs to
16
+ * reconstitute a live `Package`.
17
+ *
18
+ * Embedded database probing (`.parquet` / `.csv` schema + row count)
19
+ * stays on the main thread — it reuses the package's existing DuckDB
20
+ * connection (PR #772) and the probe queries are async-IO-bound, not
21
+ * CPU-bound. Keeping all native-DB handles on the main thread also
22
+ * sidesteps Bun crash 0x20131 where duckdb-native cannot be loaded
23
+ * into more than one isolate of the same process.
24
+ *
25
+ * The main thread reconstitutes by:
26
+ * - Building a fresh `MalloyConfig` against its own connection pool
27
+ * (live native handles can't cross the worker boundary).
28
+ * - Lazy-hydrating each model's `ModelMaterializer` from `modelDef`
29
+ * via `Runtime._loadModelFromModelDef` on first query — NO
30
+ * recompile. This is what closes the loop on PR #767's original
31
+ * "first-query recompile on main thread" gap.
32
+ *
33
+ * Per-model compile failures are returned in-band on
34
+ * `SerializedModel.compilationError` so a single bad model doesn't
35
+ * abort the rest of the package load. The main thread decides
36
+ * whether/when to surface as a fatal `Package.create` error (today
37
+ * it throws on the first error; `Package.reloadAllModels` keeps the
38
+ * failed models as placeholders in the package's model map).
39
+ *
40
+ * Whole-package failures (manifest missing, FS errors, worker
41
+ * crashes) come back as `LoadPackageError`. The pool main-thread
42
+ * half (`PackageLoadPool.loadPackage`) rejects with a deserialised
43
+ * Error; `Package.loadViaWorker` then rewraps any non-compile
44
+ * failure as `ServiceUnavailableError` so the HTTP layer responds
45
+ * 503 (transient, retryable) — there is no in-process fallback.
46
+ *
47
+ * Direction summary
48
+ * -----------------
49
+ * main ──▶ worker: LoadPackageRequest (start)
50
+ * worker ──▶ main: LoadPackageResult (success)
51
+ * worker ──▶ main: LoadPackageError (whole-package failure)
52
+ *
53
+ * worker ──▶ main: ConnectionMetadataRequest (proxy non-duckdb lookups)
54
+ * worker ──▶ main: SchemaForTablesRequest (proxy schema fetch)
55
+ * worker ──▶ main: SchemaForSqlRequest (proxy SQL block schema)
56
+ * worker ──▶ main: ReadUrlRequest (proxy non-file URL reads)
57
+ * main ──▶ worker: *Response (correlated by requestId)
58
+ *
59
+ * main ──▶ worker: ShutdownRequest (graceful drain)
60
+ * worker ──▶ main: ReadyMessage (post-init handshake)
61
+ *
62
+ * The protocol uses plain structured-clonable POJOs so the
63
+ * `postMessage` transfer goes through V8's structured clone — much
64
+ * cheaper than `JSON.stringify` for the multi-MB modelDef payloads.
65
+ */
66
+
67
+ import type {
68
+ Annotation,
69
+ SQLSourceDef,
70
+ TableSourceDef,
71
+ } from "@malloydata/malloy";
72
+
73
+ // ──────────────────────────────────────────────────────────────────────
74
+ // Direction: main ──▶ worker (load-package job)
75
+ // ──────────────────────────────────────────────────────────────────────
76
+
77
+ /**
78
+ * Connection metadata the worker needs to construct a stub
79
+ * `InfoConnection`. Resolved lazily — the worker asks the main thread
80
+ * on the first `lookupConnection(name)` call (see
81
+ * {@link ConnectionMetadataRequest}). We don't ship the full list
82
+ * upfront because Malloy only references connections by name as it
83
+ * encounters `<connection>.table('...')` / `<connection>.sql('...')`
84
+ * inside the model.
85
+ */
86
+ export interface ConnectionMetadata { // payload carried by ConnectionMetadataResponse.metadata
87
+ name: string; // connection name, as referenced from the model
88
+ dialectName: string; // presumably the SQL dialect identifier of the connection — confirm against the worker's stub InfoConnection
89
+ digest: string; // NOTE(review): how this digest is derived is not visible in this file — confirm with the main-thread side
90
+ }
91
+
92
+ export interface LoadPackageRequest { // main -> worker: starts one package-load job
93
+ type: "load-package"; // discriminator used when routing MainToWorkerMessage
94
+ requestId: string; // also present on LoadPackageResult / LoadPackageError; presumably echoed back so the pool can settle the matching promise — confirm
95
+ /** Absolute path to the package directory on disk. */
96
+ packagePath: string;
97
+ /** Logical package name (used in metric labels + log fields). */
98
+ packageName: string;
99
+ /**
100
+ * Default connection name (passed verbatim to the worker; today
101
+ * always `"duckdb"` for embedded packages, but kept configurable
102
+ * to mirror Malloy's own surface).
103
+ */
104
+ defaultConnectionName: string | null; // NOTE(review): null presumably means "no default connection" — confirm with the worker
105
+ /** Optional row-build manifest passed through to Malloy Runtime. */
106
+ buildManifest?: unknown; // wire-typed as unknown; the concrete shape is not declared in this file
107
+ }
108
+
109
+ // ──────────────────────────────────────────────────────────────────────
110
+ // Direction: worker ──▶ main (load-package result)
111
+ // ──────────────────────────────────────────────────────────────────────
112
+
113
+ /**
114
+ * Wire shape for one compiled model in the package. Mirrors the
115
+ * data a main-thread `Model` constructor needs without holding a
116
+ * `ModelMaterializer` reference (that binds to live native
117
+ * connection handles and can't cross the worker boundary).
118
+ *
119
+ * `compilationError` is set when this single model failed to
120
+ * compile but the rest of the package is fine; the main thread
121
+ * decides whether to abort `Package.create`.
122
+ */
123
+ export interface SerializedModel { // one entry of LoadPackageResult.models
124
+ /** Path relative to the package root, forward-slash normalized. */
125
+ modelPath: string;
126
+ modelType: "model" | "notebook"; // "notebook" presumably marks a .malloynb file (see notebookCells below) — confirm
127
+ /** Set when the model compiled successfully. Wire-typed as
128
+ * `unknown` so the protocol module doesn't drag in the full
129
+ * Malloy type surface; cast to `ModelDef` on receipt. */
130
+ modelDef?: unknown;
131
+ /**
132
+ * Precomputed `modelDefToModelInfo(modelDef)`. Shipped from the
133
+ * worker so the main-thread `Model` constructor doesn't pay the
134
+ * derivation cost on every package load and every subsequent
135
+ * `getModel()` / `getNotebook()` API hit can stringify a cached
136
+ * object instead of recomputing.
137
+ */
138
+ modelInfo?: unknown;
139
+ sourceInfos?: unknown[]; // wire-typed as unknown; element shape is defined by the main-thread consumer, not in this file
140
+ sources?: unknown[]; // wire-typed as unknown; element shape is defined by the main-thread consumer, not in this file
141
+ queries?: unknown[]; // wire-typed as unknown; element shape is defined by the main-thread consumer, not in this file
142
+ filterMap?: Array<[string, unknown[]]>; // list of [key, values] pairs — presumably a Map flattened to entries; key meaning not visible here — confirm
143
+ givens?: unknown[]; // NOTE(review): presumably the API-shaped givens declared by the model — confirm
144
+ /** Notebook (.malloynb) only — per-cell pre-extracted info. */
145
+ notebookCells?: SerializedNotebookCell[];
146
+ /** Accumulated dataStyles from sibling `.styles.json` files. */
147
+ dataStyles?: unknown;
148
+ /** Wall-clock ms spent compiling this single model in the worker. */
149
+ compileDurationMs?: number;
150
+ /** Set when the model failed to compile. */
151
+ compilationError?: SerializedError;
152
+ }
153
+
154
+ export interface SerializedNotebookCell { // one entry of SerializedModel.notebookCells
155
+ type: "code" | "markdown"; // cell kind
156
+ /** Raw cell text. */
157
+ text: string;
158
+ /**
159
+ * Per-cell ModelDef captured at the cell's point in the
160
+ * `extendModel` chain. The main thread hydrates a per-cell
161
+ * `ModelMaterializer` from this via
162
+ * `Runtime._loadModelFromModelDef`, so cell-level filter
163
+ * refinement can compile new queries against the correct scope
164
+ * without ever recompiling the .malloynb itself.
165
+ */
166
+ cellModelDef?: unknown;
167
+ /**
168
+ * The final-query QueryDef for this cell, captured during the
169
+ * worker's compile. Main thread hydrates a `QueryMaterializer`
170
+ * via `ModelMaterializer._loadQueryFromQueryDef` — no recompile.
171
+ */
172
+ cellQueryDef?: unknown;
173
+ newSources?: unknown[]; // NOTE(review): presumably the sources first introduced by this cell — confirm with the worker
174
+ queryInfo?: unknown; // wire-typed as unknown; concrete shape is not declared in this file
175
+ }
176
+
177
+ export interface LoadPackageResult { // worker -> main: successful outcome of a load-package job
178
+ type: "load-package-result"; // discriminator used when routing WorkerToMainMessage
179
+ requestId: string; // presumably the requestId of the originating LoadPackageRequest — confirm
180
+ packageMetadata: { name?: string; description?: string }; // presumably taken from publisher.config.json, which the worker reads — confirm
181
+ models: SerializedModel[]; // a model that failed to compile still appears here, with compilationError set
182
+ /** Wall-clock ms inside the worker for the full package load. */
183
+ loadDurationMs: number;
184
+ }
185
+
186
+ export interface LoadPackageError { // worker -> main: whole-package failure of a load-package job
187
+ type: "load-package-error"; // discriminator used when routing WorkerToMainMessage
188
+ requestId: string; // presumably the requestId of the originating LoadPackageRequest — confirm
189
+ error: SerializedError; // structured error payload, reconstituted on the main thread
190
+ }
191
+
192
+ /**
193
+ * Error wire-shape. We cannot transfer `Error` instances directly
194
+ * across `postMessage` cleanly (Bun/Node behaviour diverges on stack
195
+ * propagation), so we ship a structured payload and reconstitute on
196
+ * the main thread.
197
+ */
198
+ export interface SerializedError { // plain-object stand-in for an Error instance on the wire
199
+ name: string; // presumably copied from the original Error's name — confirm with the serializer
200
+ message: string; // presumably copied from the original Error's message — confirm with the serializer
201
+ stack?: string; // optional: may be absent on the wire
202
+ /** Set when the error originated as a Malloy `MalloyError`. */
203
+ malloyProblems?: unknown[];
204
+ /** Set when the error originated as `ModelCompilationError`. */
205
+ isCompilationError?: boolean;
206
+ }
207
+
208
+ // ──────────────────────────────────────────────────────────────────────
209
+ // Direction: worker ──▶ main (proxy connection metadata)
210
+ // ──────────────────────────────────────────────────────────────────────
211
+
212
+ export interface ConnectionMetadataRequest { // worker -> main: look up metadata for one connection
213
+ type: "connection-metadata"; // discriminator used when routing WorkerToMainMessage
214
+ requestId: string; // the matching response carries the same requestId
215
+ jobId: string; // presumably the same meaning as SchemaForTablesRequest.jobId (the load job this RPC belongs to) — confirm
216
+ connectionName: string; // name of the connection being looked up
217
+ }
218
+
219
+ export interface ConnectionMetadataResponse { // main -> worker: success reply to ConnectionMetadataRequest
220
+ type: "connection-metadata-response"; // discriminator used when routing MainToWorkerMessage
221
+ requestId: string; // correlates this reply with the originating request
222
+ ok: true; // success marker; RpcErrorResponse carries ok: false
223
+ metadata: ConnectionMetadata;
224
+ }
225
+
226
+ // ──────────────────────────────────────────────────────────────────────
227
+ // Direction: worker ──▶ main (proxy schema fetches for non-duckdb)
228
+ // ──────────────────────────────────────────────────────────────────────
229
+
230
+ export interface SchemaForTablesRequest { // worker -> main: proxy a table-schema fetch to the main thread
231
+ type: "schema-for-tables"; // discriminator used when routing WorkerToMainMessage
232
+ requestId: string; // the matching response carries the same requestId
233
+ /** Job this RPC belongs to (so main routes to the right config). */
234
+ jobId: string;
235
+ connectionName: string; // connection the schema fetch is issued against
236
+ tables: Record<string, string>; // NOTE(review): presumably table key -> table path, as in Malloy's schema-fetch API — confirm
237
+ options: {
238
+ refreshTimestamp?: number; // NOTE(review): presumably forwarded unchanged to the connection's schema fetch — confirm
239
+ modelAnnotation?: Annotation;
240
+ };
241
+ }
242
+
243
+ export interface SchemaForTablesResponse { // main -> worker: success reply to SchemaForTablesRequest
244
+ type: "schema-for-tables-response"; // discriminator used when routing MainToWorkerMessage
245
+ requestId: string; // correlates this reply with the originating request
246
+ ok: true; // success marker; RpcErrorResponse carries ok: false
247
+ schemas: Record<string, TableSourceDef>; // presumably keyed like SchemaForTablesRequest.tables — confirm
248
+ errors: Record<string, string>; // per-table error messages; presumably keyed like SchemaForTablesRequest.tables — confirm
249
+ }
250
+
251
+ export interface SchemaForSqlRequest { // worker -> main: proxy a SQL-block schema fetch to the main thread
252
+ type: "schema-for-sql"; // discriminator used when routing WorkerToMainMessage
253
+ requestId: string; // the matching response carries the same requestId
254
+ jobId: string; // presumably the same meaning as SchemaForTablesRequest.jobId (the load job this RPC belongs to) — confirm
255
+ connectionName: string; // connection the schema fetch is issued against
256
+ sentence: unknown; // wire-typed as unknown; presumably the SQL block whose schema is being fetched — confirm
257
+ options: {
258
+ refreshTimestamp?: number; // NOTE(review): presumably forwarded unchanged to the connection's schema fetch — confirm
259
+ modelAnnotation?: Annotation;
260
+ };
261
+ }
262
+
263
+ export interface SchemaForSqlResponse { // main -> worker: reply to SchemaForSqlRequest
264
+ type: "schema-for-sql-response"; // discriminator used when routing MainToWorkerMessage
265
+ requestId: string; // correlates this reply with the originating request
266
+ ok: true; // RPC-level success marker; RpcErrorResponse carries ok: false
267
+ structDef?: SQLSourceDef; // NOTE(review): presumably set when the schema fetch succeeded — confirm
268
+ error?: string; // NOTE(review): presumably set instead of structDef when the fetch reported a failure — confirm
269
+ }
270
+
271
+ export interface RpcErrorResponse { // main -> worker: failure reply, the ok: false counterpart of the ok: true responses
272
+ type: "rpc-error"; // discriminator used when routing MainToWorkerMessage
273
+ requestId: string; // correlates this reply with the originating request
274
+ ok: false; // failure marker; the success responses carry ok: true
275
+ error: SerializedError; // structured error payload
276
+ }
277
+
278
+ // ──────────────────────────────────────────────────────────────────────
279
+ // Direction: worker ──▶ main (file read for non-file URLs)
280
+ // ──────────────────────────────────────────────────────────────────────
281
+
282
+ /**
283
+ * Workers read most files directly via `fs` (they share the host's
284
+ * filesystem namespace). This RPC exists for the rare case where the
285
+ * package URL reader has host-specific behaviour (e.g. virtual files,
286
+ * remote URLs) — we delegate back to the main thread's URL reader so
287
+ * compile semantics stay identical to the in-process path.
288
+ */
289
+ export interface ReadUrlRequest { // worker -> main: delegate a URL read to the main thread's URL reader
290
+ type: "read-url"; // discriminator used when routing WorkerToMainMessage
291
+ requestId: string; // the matching response carries the same requestId
292
+ jobId: string; // presumably the same meaning as SchemaForTablesRequest.jobId (the load job this RPC belongs to) — confirm
293
+ url: string; // URL to read
294
+ }
295
+
296
+ export interface ReadUrlResponse { // main -> worker: success reply to ReadUrlRequest
297
+ type: "read-url-response"; // discriminator used when routing MainToWorkerMessage
298
+ requestId: string; // correlates this reply with the originating request
299
+ ok: true; // success marker; RpcErrorResponse carries ok: false
300
+ contents: string; // text read from the requested URL
301
+ invalidationKey?: string | number | null; // NOTE(review): presumably passed through from the main thread's URL reader — confirm
302
+ }
303
+
304
+ // ──────────────────────────────────────────────────────────────────────
305
+ // Lifecycle
306
+ // ──────────────────────────────────────────────────────────────────────
307
+
308
+ export interface ShutdownRequest { // main -> worker: ask the worker to drain gracefully
309
+ type: "shutdown"; // discriminator used when routing MainToWorkerMessage; the message has no other fields
310
+ }
311
+
312
+ export interface ReadyMessage { // worker -> main: post-init handshake
313
+ type: "ready"; // discriminator used when routing WorkerToMainMessage; the message has no other fields
314
+ }
315
+
316
+ // ──────────────────────────────────────────────────────────────────────
317
+ // Union types for routing
318
+ // ──────────────────────────────────────────────────────────────────────
319
+
320
+ export type MainToWorkerMessage = // every message shape sent main -> worker; each member has a distinct `type` literal
321
+ | LoadPackageRequest
322
+ | ConnectionMetadataResponse
323
+ | SchemaForTablesResponse
324
+ | SchemaForSqlResponse
325
+ | ReadUrlResponse
326
+ | RpcErrorResponse
327
+ | ShutdownRequest;
328
+
329
+ export type WorkerToMainMessage = // every message shape sent worker -> main; each member has a distinct `type` literal
329
+ | LoadPackageResult
330
+ | LoadPackageError
331
+ | ConnectionMetadataRequest
332
+ | SchemaForTablesRequest
333
+ | SchemaForSqlRequest
334
+ | ReadUrlRequest
335
+ | ReadyMessage;
package/src/server.ts CHANGED
@@ -1447,6 +1447,18 @@ app.use(
1447
1447
  },
1448
1448
  );
1449
1449
 
1450
+ // Eagerly construct the package-load worker pool so we fail fast at
1451
+ // boot if PACKAGE_LOAD_WORKERS is misconfigured (e.g. set to 0, which
1452
+ // used to select the now-removed in-process fallback). Surfacing the bad config here is much
1453
+ // friendlier than surfacing it on the first package load, which could
1454
+ // be hours after start.
1455
+ {
1456
+ const { getPackageLoadPool } = await import(
1457
+ "./package_load/package_load_pool"
1458
+ );
1459
+ getPackageLoadPool();
1460
+ }
1461
+
1450
1462
  const mainServer = http.createServer({ maxHeaderSize: 262144 }, app);
1451
1463
 
1452
1464
  mainServer.timeout = 600000;
@@ -1,8 +1,8 @@
1
1
  import { GetObjectCommand, S3 } from "@aws-sdk/client-s3";
2
2
  import { Storage } from "@google-cloud/storage";
3
- import AdmZip from "adm-zip";
4
3
  import { Mutex } from "async-mutex";
5
4
  import crypto from "crypto";
5
+ import extract from "extract-zip";
6
6
  import * as fs from "fs";
7
7
  import * as path from "path";
8
8
  import simpleGit from "simple-git";
@@ -884,6 +884,7 @@ export class EnvironmentStore {
884
884
  }
885
885
 
886
886
  public async unzipEnvironment(absoluteEnvironmentPath: string) {
887
+ const startedAt = Date.now();
887
888
  logger.info(
888
889
  `Detected zip file at "${absoluteEnvironmentPath}". Unzipping...`,
889
890
  );
@@ -897,8 +898,28 @@ export class EnvironmentStore {
897
898
  });
898
899
  await fs.promises.mkdir(unzippedEnvironmentPath, { recursive: true });
899
900
 
900
- const zip = new AdmZip(absoluteEnvironmentPath);
901
- zip.extractAllTo(unzippedEnvironmentPath, true);
901
+ // Stream-extract via yauzl (wrapped by extract-zip). Each entry's
902
+ // inflate and write are dispatched to the libuv thread pool, so the
903
+ // main event loop stays responsive even for very large archives.
904
+ // The previous adm-zip path used fs.readFileSync + zlib.inflateRawSync
905
+ // on the main thread, which parked the loop long enough on multi-
906
+ // hundred-MB packages to fail Kubernetes liveness probes mid-extract.
907
+ let entryCount = 0;
908
+ let totalUncompressedBytes = 0;
909
+ await extract(absoluteEnvironmentPath, {
910
+ dir: path.resolve(unzippedEnvironmentPath),
911
+ onEntry: (entry) => {
912
+ entryCount += 1;
913
+ totalUncompressedBytes += entry.uncompressedSize ?? 0;
914
+ },
915
+ });
916
+
917
+ const mib = (totalUncompressedBytes / (1024 * 1024)).toFixed(1);
918
+ logger.info(
919
+ `Unzipped "${absoluteEnvironmentPath}" -> "${unzippedEnvironmentPath}" ` +
920
+ `(${entryCount} entries, ${mib} MiB uncompressed) in ` +
921
+ `${formatDuration(Date.now() - startedAt)}`,
922
+ );
902
923
 
903
924
  return unzippedEnvironmentPath;
904
925
  }
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Shared utilities for surfacing Malloy `Given` declarations on
3
+ * compiled models.
4
+ *
5
+ * The Malloy SDK's `Given` class is declared in
6
+ * `@malloydata/malloy/dist/api/foundation/core.d.ts` but is not
7
+ * re-exported from the package root, so we duck-type against the
8
+ * surface we actually use and don't pull in the private type.
9
+ *
10
+ * Lives here so both the main-thread `Model` constructor and the
11
+ * package-load worker can use the same conversion. The worker
12
+ * imports this file directly (it's pure TypeScript with no native
13
+ * deps, so it's safe to bundle into the worker entry).
14
+ */
15
+
16
/**
 * Duck-typed shape of a Malloy SDK `Given` instance (the value type
 * of `Model.givens`).
 */
export interface MalloyGiven {
   readonly name: string;
   /**
    * Type descriptor. `filterType` is only meaningful when
    * `type === "filter expression"`, and may be absent even then.
    */
   readonly type: { type: string; filterType?: string };
   /** Returns the given's tag lines, optionally restricted to `prefix`. */
   getTaglines(prefix?: RegExp): string[];
}

/**
 * Wire/API shape of a given. Structurally identical to the
 * `components["schemas"]["Given"]` shape from the OpenAPI spec —
 * callers can cast freely.
 */
export interface MalloyGivenApi {
   name: string;
   type: string;
   annotations?: string[];
}

/**
 * Convert a Malloy SDK `Given` to the wire/API shape.
 *
 * Two fields are deliberately not surfaced:
 *
 *  - `location` — Malloy's `DocumentLocation.url` is an absolute
 *    `file://` path on the publisher's filesystem. Surfacing it
 *    would leak the OS user, install directory, and internal
 *    layout. Existing `Filter` introspection does not expose
 *    location either; matching that floor. A future PR can add a
 *    sanitised package-relative path if a client needs it.
 *
 *  - `default` / `defaultText` — Malloy's API only exposes the
 *    parsed `ConstantExpr` AST, not a rendered source string.
 *    Rendering it here would duplicate the Malloy printer. Add
 *    when Malloy surfaces a stringified accessor.
 *
 * `annotations` is restricted to `#(...)` declaration annotations
 * (the caller-facing kind, e.g. `#(doc)`). `getTaglines()` with no
 * prefix would also return `##` doc-comment lines and the
 * model-level `##!` pragma, which aren't part of the given's
 * surface contract.
 *
 * Type rendering: `GivenTypeDef` is typed as `AtomicTypeDef |
 * FilterExpressionParamTypeDef`, but Malloy's grammar only emits
 * the scalar parameter types (`string` | `number` | `boolean` |
 * `date` | `timestamp` | `timestamptz` | `filter expression` |
 * `error`) for given declarations today. If the grammar expands
 * to allow array or record givens, the bare `type.type`
 * discriminator (`'array'`, `'record'`) will land in the wire
 * response with no element info — revisit when that happens.
 *
 * A `filter expression` given is rendered as `filter<T>` where `T`
 * is its `filterType`. When `filterType` is missing or empty, the
 * bare `type.type` discriminator is emitted instead, so the wire
 * response never carries the literal text `filter<undefined>`.
 *
 * @param given - The SDK given (or any object matching its duck type).
 * @returns The API-shaped given: name, rendered type, `#(...)` annotations.
 */
export function malloyGivenToApi(given: MalloyGiven): MalloyGivenApi {
   const { type: kind, filterType } = given.type;
   // Only decorate with the element type when one is actually known.
   const renderedType =
      kind === "filter expression" && filterType
         ? `filter<${filterType}>`
         : kind;
   return {
      name: given.name,
      type: renderedType,
      annotations: given.getTaglines(/^#\(/),
   };
}