@query-farm/vgi-rpc 0.3.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/README.md +47 -0
  2. package/dist/auth.d.ts +13 -0
  3. package/dist/auth.d.ts.map +1 -0
  4. package/dist/client/connect.d.ts.map +1 -1
  5. package/dist/client/index.d.ts +2 -0
  6. package/dist/client/index.d.ts.map +1 -1
  7. package/dist/client/introspect.d.ts +1 -0
  8. package/dist/client/introspect.d.ts.map +1 -1
  9. package/dist/client/oauth.d.ts +62 -0
  10. package/dist/client/oauth.d.ts.map +1 -0
  11. package/dist/client/pipe.d.ts +3 -0
  12. package/dist/client/pipe.d.ts.map +1 -1
  13. package/dist/client/stream.d.ts +5 -0
  14. package/dist/client/stream.d.ts.map +1 -1
  15. package/dist/client/types.d.ts +6 -0
  16. package/dist/client/types.d.ts.map +1 -1
  17. package/dist/constants.d.ts +3 -1
  18. package/dist/constants.d.ts.map +1 -1
  19. package/dist/dispatch/describe.d.ts.map +1 -1
  20. package/dist/dispatch/stream.d.ts +2 -1
  21. package/dist/dispatch/stream.d.ts.map +1 -1
  22. package/dist/dispatch/unary.d.ts +2 -1
  23. package/dist/dispatch/unary.d.ts.map +1 -1
  24. package/dist/external.d.ts +45 -0
  25. package/dist/external.d.ts.map +1 -0
  26. package/dist/gcs.d.ts +38 -0
  27. package/dist/gcs.d.ts.map +1 -0
  28. package/dist/http/auth.d.ts +32 -0
  29. package/dist/http/auth.d.ts.map +1 -0
  30. package/dist/http/bearer.d.ts +34 -0
  31. package/dist/http/bearer.d.ts.map +1 -0
  32. package/dist/http/dispatch.d.ts +4 -0
  33. package/dist/http/dispatch.d.ts.map +1 -1
  34. package/dist/http/handler.d.ts.map +1 -1
  35. package/dist/http/index.d.ts +8 -0
  36. package/dist/http/index.d.ts.map +1 -1
  37. package/dist/http/jwt.d.ts +21 -0
  38. package/dist/http/jwt.d.ts.map +1 -0
  39. package/dist/http/mtls.d.ts +78 -0
  40. package/dist/http/mtls.d.ts.map +1 -0
  41. package/dist/http/pages.d.ts +9 -0
  42. package/dist/http/pages.d.ts.map +1 -0
  43. package/dist/http/types.d.ts +22 -1
  44. package/dist/http/types.d.ts.map +1 -1
  45. package/dist/index.d.ts +4 -2
  46. package/dist/index.d.ts.map +1 -1
  47. package/dist/index.js +2576 -317
  48. package/dist/index.js.map +27 -18
  49. package/dist/otel.d.ts +47 -0
  50. package/dist/otel.d.ts.map +1 -0
  51. package/dist/s3.d.ts +43 -0
  52. package/dist/s3.d.ts.map +1 -0
  53. package/dist/server.d.ts +6 -0
  54. package/dist/server.d.ts.map +1 -1
  55. package/dist/types.d.ts +38 -2
  56. package/dist/types.d.ts.map +1 -1
  57. package/dist/wire/response.d.ts.map +1 -1
  58. package/package.json +46 -2
  59. package/src/auth.ts +31 -0
  60. package/src/client/connect.ts +28 -6
  61. package/src/client/index.ts +11 -0
  62. package/src/client/introspect.ts +15 -3
  63. package/src/client/oauth.ts +167 -0
  64. package/src/client/pipe.ts +19 -4
  65. package/src/client/stream.ts +32 -7
  66. package/src/client/types.ts +6 -0
  67. package/src/constants.ts +4 -1
  68. package/src/dispatch/describe.ts +20 -0
  69. package/src/dispatch/stream.ts +18 -4
  70. package/src/dispatch/unary.ts +6 -1
  71. package/src/external.ts +209 -0
  72. package/src/gcs.ts +86 -0
  73. package/src/http/auth.ts +110 -0
  74. package/src/http/bearer.ts +107 -0
  75. package/src/http/dispatch.ts +32 -10
  76. package/src/http/handler.ts +120 -3
  77. package/src/http/index.ts +14 -0
  78. package/src/http/jwt.ts +80 -0
  79. package/src/http/mtls.ts +298 -0
  80. package/src/http/pages.ts +298 -0
  81. package/src/http/types.ts +23 -1
  82. package/src/index.ts +32 -0
  83. package/src/otel.ts +161 -0
  84. package/src/s3.ts +94 -0
  85. package/src/server.ts +42 -8
  86. package/src/types.ts +51 -3
  87. package/src/wire/response.ts +28 -14
@@ -12,6 +12,7 @@ import {
12
12
  } from "@query-farm/apache-arrow";
13
13
  import { DESCRIBE_METHOD_NAME } from "../constants.js";
14
14
  import { RpcError } from "../errors.js";
15
+ import { type ExternalLocationConfig, isExternalLocationBatch, resolveExternalLocation } from "../external.js";
15
16
  import { serializeIpcStream } from "../http/common.js";
16
17
  import { IpcStreamReader } from "../wire/reader.js";
17
18
  import type { RpcClient } from "./connect.js";
@@ -86,6 +87,7 @@ export class PipeStreamSession implements StreamSession {
86
87
  private _outputSchema: Schema;
87
88
  private _releaseBusy: () => void;
88
89
  private _setDrainPromise: (p: Promise<void>) => void;
90
+ private _externalConfig?: ExternalLocationConfig;
89
91
 
90
92
  constructor(opts: {
91
93
  reader: IpcStreamReader;
@@ -95,6 +97,7 @@ export class PipeStreamSession implements StreamSession {
95
97
  outputSchema: Schema;
96
98
  releaseBusy: () => void;
97
99
  setDrainPromise: (p: Promise<void>) => void;
100
+ externalConfig?: ExternalLocationConfig;
98
101
  }) {
99
102
  this._reader = opts.reader;
100
103
  this._writeFn = opts.writeFn;
@@ -103,6 +106,7 @@ export class PipeStreamSession implements StreamSession {
103
106
  this._outputSchema = opts.outputSchema;
104
107
  this._releaseBusy = opts.releaseBusy;
105
108
  this._setDrainPromise = opts.setDrainPromise;
109
+ this._externalConfig = opts.externalConfig;
106
110
  }
107
111
 
108
112
  get header(): Record<string, any> | null {
@@ -120,6 +124,10 @@ export class PipeStreamSession implements StreamSession {
120
124
  if (batch === null) return null; // Server closed output stream
121
125
 
122
126
  if (batch.numRows === 0) {
127
+ // Check for external location pointer batch
128
+ if (isExternalLocationBatch(batch)) {
129
+ return await resolveExternalLocation(batch, this._externalConfig);
130
+ }
123
131
  // Check if it's a log/error batch. If so, dispatch and continue.
124
132
  // Otherwise it's a zero-row data batch — return it.
125
133
  if (dispatchLogOrError(batch, this._onLog)) {
@@ -375,6 +383,7 @@ export function pipeConnect(
375
383
  options?: PipeConnectOptions,
376
384
  ): RpcClient {
377
385
  const onLog = options?.onLog;
386
+ const externalConfig = options?.externalLocation;
378
387
 
379
388
  let reader: IpcStreamReader | null = null;
380
389
  let readerPromise: Promise<IpcStreamReader> | null = null;
@@ -483,12 +492,16 @@ export function pipeConnect(
483
492
  throw new Error("EOF reading response");
484
493
  }
485
494
 
486
- // Process batches: dispatch logs, find result
495
+ // Process batches: dispatch logs, resolve external pointers, find result
487
496
  let resultBatch: RecordBatch | null = null;
488
- for (const batch of response.batches) {
497
+ for (let batch of response.batches) {
489
498
  if (batch.numRows === 0) {
490
- dispatchLogOrError(batch, onLog);
491
- continue;
499
+ if (isExternalLocationBatch(batch)) {
500
+ batch = await resolveExternalLocation(batch, externalConfig);
501
+ } else {
502
+ dispatchLogOrError(batch, onLog);
503
+ continue;
504
+ }
492
505
  }
493
506
  resultBatch = batch;
494
507
  }
@@ -557,6 +570,7 @@ export function pipeConnect(
557
570
  outputSchema,
558
571
  releaseBusy,
559
572
  setDrainPromise,
573
+ externalConfig,
560
574
  });
561
575
  } catch (e) {
562
576
  // Init error (e.g., server raised exception during init).
@@ -624,6 +638,7 @@ export function subprocessConnect(cmd: string[], options?: SubprocessConnectOpti
624
638
 
625
639
  const client = pipeConnect(stdout, writable, {
626
640
  onLog: options?.onLog,
641
+ externalLocation: options?.externalLocation,
627
642
  });
628
643
 
629
644
  // Wrap close to also kill the subprocess
@@ -4,6 +4,7 @@
4
4
  import { Field, makeData, RecordBatch, Schema, Struct, vectorFromArray } from "@query-farm/apache-arrow";
5
5
  import { STATE_KEY } from "../constants.js";
6
6
  import { RpcError } from "../errors.js";
7
+ import { type ExternalLocationConfig, isExternalLocationBatch, resolveExternalLocation } from "../external.js";
7
8
  import { ARROW_CONTENT_TYPE, serializeIpcStream } from "../http/common.js";
8
9
  import { dispatchLogOrError, extractBatchRows, inferArrowType, readResponseBatches } from "./ipc.js";
9
10
  import type { LogMessage, StreamSession } from "./types.js";
@@ -25,6 +26,8 @@ export class HttpStreamSession implements StreamSession {
25
26
  private _compressionLevel?: number;
26
27
  private _compressFn?: CompressFn;
27
28
  private _decompressFn?: DecompressFn;
29
+ private _authorization?: string;
30
+ private _externalConfig?: ExternalLocationConfig;
28
31
 
29
32
  constructor(opts: {
30
33
  baseUrl: string;
@@ -40,6 +43,8 @@ export class HttpStreamSession implements StreamSession {
40
43
  compressionLevel?: number;
41
44
  compressFn?: CompressFn;
42
45
  decompressFn?: DecompressFn;
46
+ authorization?: string;
47
+ externalConfig?: ExternalLocationConfig;
43
48
  }) {
44
49
  this._baseUrl = opts.baseUrl;
45
50
  this._prefix = opts.prefix;
@@ -54,6 +59,8 @@ export class HttpStreamSession implements StreamSession {
54
59
  this._compressionLevel = opts.compressionLevel;
55
60
  this._compressFn = opts.compressFn;
56
61
  this._decompressFn = opts.decompressFn;
62
+ this._authorization = opts.authorization;
63
+ this._externalConfig = opts.externalConfig;
57
64
  }
58
65
 
59
66
  get header(): Record<string, any> | null {
@@ -68,6 +75,9 @@ export class HttpStreamSession implements StreamSession {
68
75
  headers["Content-Encoding"] = "zstd";
69
76
  headers["Accept-Encoding"] = "zstd";
70
77
  }
78
+ if (this._authorization) {
79
+ headers.Authorization = this._authorization;
80
+ }
71
81
  return headers;
72
82
  }
73
83
 
@@ -154,6 +164,9 @@ export class HttpStreamSession implements StreamSession {
154
164
  headers: this._buildHeaders(),
155
165
  body: this._prepareBody(body) as unknown as BodyInit,
156
166
  });
167
+ if (resp.status === 401) {
168
+ throw new RpcError("AuthenticationError", "Authentication required", "");
169
+ }
157
170
 
158
171
  const responseBody = await this._readResponse(resp);
159
172
  const { batches: responseBatches } = await readResponseBatches(responseBody);
@@ -202,10 +215,14 @@ export class HttpStreamSession implements StreamSession {
202
215
  */
203
216
  async *[Symbol.asyncIterator](): AsyncIterableIterator<Record<string, any>[]> {
204
217
  // Yield pre-loaded batches from init
205
- for (const batch of this._pendingBatches) {
218
+ for (let batch of this._pendingBatches) {
206
219
  if (batch.numRows === 0) {
207
- dispatchLogOrError(batch, this._onLog);
208
- continue;
220
+ if (isExternalLocationBatch(batch)) {
221
+ batch = await resolveExternalLocation(batch, this._externalConfig);
222
+ } else {
223
+ dispatchLogOrError(batch, this._onLog);
224
+ continue;
225
+ }
209
226
  }
210
227
  yield extractBatchRows(batch);
211
228
  }
@@ -220,7 +237,7 @@ export class HttpStreamSession implements StreamSession {
220
237
  const { batches } = await readResponseBatches(responseBody);
221
238
 
222
239
  let gotContinuation = false;
223
- for (const batch of batches) {
240
+ for (let batch of batches) {
224
241
  if (batch.numRows === 0) {
225
242
  // Check for continuation token
226
243
  const token = batch.metadata?.get(STATE_KEY);
@@ -229,9 +246,14 @@ export class HttpStreamSession implements StreamSession {
229
246
  gotContinuation = true;
230
247
  continue;
231
248
  }
232
- // Log/error batch
233
- dispatchLogOrError(batch, this._onLog);
234
- continue;
249
+ // Check for external location pointer
250
+ if (isExternalLocationBatch(batch)) {
251
+ batch = await resolveExternalLocation(batch, this._externalConfig);
252
+ } else {
253
+ // Log/error batch
254
+ dispatchLogOrError(batch, this._onLog);
255
+ continue;
256
+ }
235
257
  }
236
258
 
237
259
  yield extractBatchRows(batch);
@@ -261,6 +283,9 @@ export class HttpStreamSession implements StreamSession {
261
283
  headers: this._buildHeaders(),
262
284
  body: this._prepareBody(body) as unknown as BodyInit,
263
285
  });
286
+ if (resp.status === 401) {
287
+ throw new RpcError("AuthenticationError", "Authentication required", "");
288
+ }
264
289
 
265
290
  return this._readResponse(resp);
266
291
  }
@@ -5,6 +5,10 @@ export interface HttpConnectOptions {
5
5
  prefix?: string;
6
6
  onLog?: (msg: LogMessage) => void;
7
7
  compressionLevel?: number;
8
+ /** Authorization header value (e.g. "Bearer <token>"). Sent with every request. */
9
+ authorization?: string;
10
+ /** External storage config for resolving externalized batches. */
11
+ externalLocation?: import("../external.js").ExternalLocationConfig;
8
12
  }
9
13
 
10
14
  export interface LogMessage {
@@ -22,6 +26,8 @@ export interface StreamSession {
22
26
 
23
27
  export interface PipeConnectOptions {
24
28
  onLog?: (msg: LogMessage) => void;
29
+ /** External storage config for resolving externalized batches. */
30
+ externalLocation?: import("../external.js").ExternalLocationConfig;
25
31
  }
26
32
 
27
33
  export interface SubprocessConnectOptions extends PipeConnectOptions {
package/src/constants.ts CHANGED
@@ -15,8 +15,11 @@ export const REQUEST_ID_KEY = "vgi_rpc.request_id";
15
15
 
16
16
  export const PROTOCOL_NAME_KEY = "vgi_rpc.protocol_name";
17
17
  export const DESCRIBE_VERSION_KEY = "vgi_rpc.describe_version";
18
- export const DESCRIBE_VERSION = "2";
18
+ export const DESCRIBE_VERSION = "3";
19
19
 
20
20
  export const DESCRIBE_METHOD_NAME = "__describe__";
21
21
 
22
22
  export const STATE_KEY = "vgi_rpc.stream_state#b64";
23
+
24
+ export const LOCATION_KEY = "vgi_rpc.location";
25
+ export const LOCATION_SHA256_KEY = "vgi_rpc.location.sha256";
@@ -37,6 +37,8 @@ export const DESCRIBE_SCHEMA = new Schema([
37
37
  new Field("param_defaults_json", new Utf8(), true),
38
38
  new Field("has_header", new Bool(), false),
39
39
  new Field("header_schema_ipc", new Binary(), true),
40
+ new Field("is_exchange", new Bool(), true),
41
+ new Field("param_docs_json", new Utf8(), true),
40
42
  ]);
41
43
 
42
44
  /**
@@ -60,6 +62,8 @@ export function buildDescribeBatch(
60
62
  const paramDefaultsJsons: (string | null)[] = [];
61
63
  const hasHeaders: boolean[] = [];
62
64
  const headerSchemas: (Uint8Array | null)[] = [];
65
+ const isExchanges: (boolean | null)[] = [];
66
+ const paramDocsJsons: (string | null)[] = [];
63
67
 
64
68
  for (const [name, method] of sortedEntries) {
65
69
  names.push(name);
@@ -95,6 +99,18 @@ export function buildDescribeBatch(
95
99
 
96
100
  hasHeaders.push(!!method.headerSchema);
97
101
  headerSchemas.push(method.headerSchema ? serializeSchema(method.headerSchema) : null);
102
+
103
+ // is_exchange: true for exchange, false for producer, null for unary
104
+ if (method.exchangeFn) {
105
+ isExchanges.push(true);
106
+ } else if (method.producerFn) {
107
+ isExchanges.push(false);
108
+ } else {
109
+ isExchanges.push(null);
110
+ }
111
+
112
+ // param_docs_json: no docstring source in TypeScript, always null
113
+ paramDocsJsons.push(null);
98
114
  }
99
115
 
100
116
  // Build the batch using vectorFromArray for each column
@@ -108,6 +124,8 @@ export function buildDescribeBatch(
108
124
  const paramDefaultsArr = vectorFromArray(paramDefaultsJsons, new Utf8());
109
125
  const hasHeaderArr = vectorFromArray(hasHeaders, new Bool());
110
126
  const headerSchemaArr = vectorFromArray(headerSchemas, new Binary());
127
+ const isExchangeArr = vectorFromArray(isExchanges, new Bool());
128
+ const paramDocsArr = vectorFromArray(paramDocsJsons, new Utf8());
111
129
 
112
130
  const children = [
113
131
  nameArr.data[0],
@@ -120,6 +138,8 @@ export function buildDescribeBatch(
120
138
  paramDefaultsArr.data[0],
121
139
  hasHeaderArr.data[0],
122
140
  headerSchemaArr.data[0],
141
+ isExchangeArr.data[0],
142
+ paramDocsArr.data[0],
123
143
  ];
124
144
 
125
145
  const structType = new Struct(DESCRIBE_SCHEMA.fields);
@@ -2,6 +2,7 @@
2
2
  // SPDX-License-Identifier: Apache-2.0
3
3
 
4
4
  import { Schema } from "@query-farm/apache-arrow";
5
+ import { type ExternalLocationConfig, maybeExternalizeBatch } from "../external.js";
5
6
  import type { MethodDefinition } from "../types.js";
6
7
  import { OutputCollector } from "../types.js";
7
8
  import { conformBatchToSchema } from "../util/conform.js";
@@ -33,6 +34,7 @@ export async function dispatchStream(
33
34
  reader: IpcStreamReader,
34
35
  serverId: string,
35
36
  requestId: string | null,
37
+ externalConfig?: ExternalLocationConfig,
36
38
  ): Promise<void> {
37
39
  const isProducer = !!method.producerFn;
38
40
 
@@ -107,12 +109,20 @@ export async function dispatchStream(
107
109
  let inputBatch = await reader.readNextBatch();
108
110
  if (!inputBatch) break;
109
111
 
110
- // Cast compatible input types when schema doesn't match exactly
112
+ // Cast compatible input types when schema doesn't match exactly.
113
+ // If conformance fails (e.g., completely different schemas like a dummy
114
+ // registration schema vs actual data), pass the original batch through —
115
+ // the exchange handler may handle dynamic schemas internally.
111
116
  if (expectedInputSchema && !isProducer && inputBatch.schema !== expectedInputSchema) {
112
117
  try {
113
118
  inputBatch = conformBatchToSchema(inputBatch, expectedInputSchema);
114
- } catch {
115
- throw new TypeError(`Input schema mismatch: expected ${expectedInputSchema}, got ${inputBatch.schema}`);
119
+ } catch (e) {
120
+ if (e instanceof TypeError) {
121
+ // Field name/count mismatch — propagate as error (matches Python behavior).
122
+ throw e;
123
+ }
124
+ // Other conformance failures: pass through for dynamic schema handlers.
125
+ console.debug?.(`Schema conformance skipped: ${e instanceof Error ? e.message : e}`);
116
126
  }
117
127
  }
118
128
 
@@ -125,7 +135,11 @@ export async function dispatchStream(
125
135
  }
126
136
 
127
137
  for (const emitted of out.batches) {
128
- stream.write(emitted.batch);
138
+ let batch = emitted.batch;
139
+ if (externalConfig) {
140
+ batch = await maybeExternalizeBatch(batch, externalConfig);
141
+ }
142
+ stream.write(batch);
129
143
  }
130
144
 
131
145
  if (out.finished) {
@@ -1,6 +1,7 @@
1
1
  // © Copyright 2025-2026, Query.Farm LLC - https://query.farm
2
2
  // SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ import { type ExternalLocationConfig, maybeExternalizeBatch } from "../external.js";
4
5
  import type { MethodDefinition } from "../types.js";
5
6
  import { OutputCollector } from "../types.js";
6
7
  import { buildErrorBatch, buildResultBatch } from "../wire/response.js";
@@ -17,13 +18,17 @@ export async function dispatchUnary(
17
18
  writer: IpcStreamWriter,
18
19
  serverId: string,
19
20
  requestId: string | null,
21
+ externalConfig?: ExternalLocationConfig,
20
22
  ): Promise<void> {
21
23
  const schema = method.resultSchema;
22
24
  const out = new OutputCollector(schema, true, serverId, requestId);
23
25
 
24
26
  try {
25
27
  const result = await method.handler!(params, out);
26
- const resultBatch = buildResultBatch(schema, result, serverId, requestId);
28
+ let resultBatch = buildResultBatch(schema, result, serverId, requestId);
29
+ if (externalConfig) {
30
+ resultBatch = await maybeExternalizeBatch(resultBatch, externalConfig);
31
+ }
27
32
  // Collect log batches (from clientLog) + result batch
28
33
  const batches = [...out.batches.map((b) => b.batch), resultBatch];
29
34
  writer.writeStream(schema, batches);
@@ -0,0 +1,209 @@
1
+ // © Copyright 2025-2026, Query.Farm LLC - https://query.farm
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ /**
5
+ * External storage support for large Arrow IPC batches.
6
+ *
7
+ * When a batch exceeds a configurable threshold, it is serialized to IPC,
8
+ * optionally compressed with zstd, and uploaded to pluggable storage.
9
+ * The batch is replaced with a zero-row "pointer batch" containing the
10
+ * download URL and SHA-256 checksum in metadata.
11
+ */
12
+
13
+ import { type RecordBatch, RecordBatchReader, RecordBatchStreamWriter, type Schema } from "@query-farm/apache-arrow";
14
+ import { LOCATION_KEY, LOCATION_SHA256_KEY, LOG_LEVEL_KEY } from "./constants.js";
15
+ import { zstdCompress, zstdDecompress } from "./util/zstd.js";
16
+ import { buildEmptyBatch } from "./wire/response.js";
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Interfaces and configuration
20
+ // ---------------------------------------------------------------------------
21
+
22
+ /** Pluggable storage backend for uploading large batches. */
23
+ export interface ExternalStorage {
24
+ /** Upload IPC data and return a URL for retrieval. */
25
+ upload(data: Uint8Array, contentEncoding: string): Promise<string>;
26
+ }
27
+
28
+ /** Configuration for external storage of large batches. */
29
+ export interface ExternalLocationConfig {
30
+ /** Storage backend for uploading. */
31
+ storage: ExternalStorage;
32
+ /** Minimum batch byte size to trigger externalization. Default: 1MB. */
33
+ externalizeThresholdBytes?: number;
34
+ /** Optional zstd compression for uploaded data. */
35
+ compression?: { algorithm: "zstd"; level?: number };
36
+ /** URL validator called before fetching. Throw to reject. Default: HTTPS-only. */
37
+ urlValidator?: ((url: string) => void) | null;
38
+ }
39
+
40
+ const DEFAULT_THRESHOLD = 1_048_576; // 1 MB
41
+
42
+ // ---------------------------------------------------------------------------
43
+ // URL validation
44
+ // ---------------------------------------------------------------------------
45
+
46
+ /** Default validator that rejects non-HTTPS URLs. */
47
+ export function httpsOnlyValidator(url: string): void {
48
+ const parsed = new URL(url);
49
+ if (parsed.protocol !== "https:") {
50
+ throw new Error(`External location URL must use HTTPS, got "${parsed.protocol}"`);
51
+ }
52
+ }
53
+
54
+ // ---------------------------------------------------------------------------
55
+ // SHA-256 helpers
56
+ // ---------------------------------------------------------------------------
57
+
58
+ async function sha256Hex(data: Uint8Array): Promise<string> {
59
+ // Copy to a plain ArrayBuffer to satisfy Web Crypto API type requirements
60
+ const buf = new ArrayBuffer(data.byteLength);
61
+ new Uint8Array(buf).set(data);
62
+ const hash = await crypto.subtle.digest("SHA-256", buf);
63
+ return Array.from(new Uint8Array(hash))
64
+ .map((b) => b.toString(16).padStart(2, "0"))
65
+ .join("");
66
+ }
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // Detection
70
+ // ---------------------------------------------------------------------------
71
+
72
+ /** Returns true if the batch is a zero-row pointer to external data. */
73
+ export function isExternalLocationBatch(batch: RecordBatch): boolean {
74
+ if (batch.numRows !== 0) return false;
75
+ const meta = batch.metadata;
76
+ if (!meta) return false;
77
+ return meta.has(LOCATION_KEY) && !meta.has(LOG_LEVEL_KEY);
78
+ }
79
+
80
+ // ---------------------------------------------------------------------------
81
+ // Pointer batch creation
82
+ // ---------------------------------------------------------------------------
83
+
84
+ /** Create a zero-row pointer batch with location URL and optional SHA-256. */
85
+ export function makeExternalLocationBatch(schema: Schema, url: string, sha256?: string): RecordBatch {
86
+ const metadata = new Map<string, string>();
87
+ metadata.set(LOCATION_KEY, url);
88
+ if (sha256) {
89
+ metadata.set(LOCATION_SHA256_KEY, sha256);
90
+ }
91
+ return buildEmptyBatch(schema, metadata);
92
+ }
93
+
94
+ // ---------------------------------------------------------------------------
95
+ // IPC serialization helpers
96
+ // ---------------------------------------------------------------------------
97
+
98
+ function serializeBatchToIpc(batch: RecordBatch): Uint8Array {
99
+ const writer = new RecordBatchStreamWriter();
100
+ writer.reset(undefined, batch.schema);
101
+ writer.write(batch);
102
+ writer.close();
103
+ return writer.toUint8Array(true);
104
+ }
105
+
106
+ function batchByteSize(batch: RecordBatch): number {
107
+ // Arrow TS data.byteLength doesn't reflect actual data size.
108
+ // Estimate from IPC serialization size for threshold check.
109
+ const writer = new RecordBatchStreamWriter();
110
+ writer.reset(undefined, batch.schema);
111
+ writer.write(batch);
112
+ writer.close();
113
+ return writer.toUint8Array(true).byteLength;
114
+ }
115
+
116
+ // ---------------------------------------------------------------------------
117
+ // Write path: externalization
118
+ // ---------------------------------------------------------------------------
119
+
120
+ /**
121
+ * Maybe externalize a batch if it exceeds the threshold.
122
+ * Returns the original batch unchanged if below threshold or no config.
123
+ */
124
+ export async function maybeExternalizeBatch(
125
+ batch: RecordBatch,
126
+ config?: ExternalLocationConfig | null,
127
+ ): Promise<RecordBatch> {
128
+ if (!config?.storage) return batch;
129
+ if (batch.numRows === 0) return batch;
130
+
131
+ const threshold = config.externalizeThresholdBytes ?? DEFAULT_THRESHOLD;
132
+ if (batchByteSize(batch) < threshold) return batch;
133
+
134
+ // Serialize to IPC
135
+ let ipcData = serializeBatchToIpc(batch);
136
+
137
+ // Compute SHA-256 of raw IPC bytes (pre-compression)
138
+ const checksum = await sha256Hex(ipcData);
139
+
140
+ // Optionally compress
141
+ let contentEncoding = "";
142
+ if (config.compression?.algorithm === "zstd") {
143
+ ipcData = zstdCompress(ipcData, config.compression.level ?? 3) as Uint8Array;
144
+ contentEncoding = "zstd";
145
+ }
146
+
147
+ // Upload
148
+ const url = await config.storage.upload(ipcData, contentEncoding);
149
+
150
+ // Return pointer batch
151
+ return makeExternalLocationBatch(batch.schema, url, checksum);
152
+ }
153
+
154
+ // ---------------------------------------------------------------------------
155
+ // Read path: resolution
156
+ // ---------------------------------------------------------------------------
157
+
158
+ /**
159
+ * Resolve an external pointer batch by fetching the data from the URL.
160
+ * Returns the original batch unchanged if not a pointer or no config.
161
+ */
162
+ export async function resolveExternalLocation(
163
+ batch: RecordBatch,
164
+ config?: ExternalLocationConfig | null,
165
+ ): Promise<RecordBatch> {
166
+ if (!config) return batch;
167
+ if (!isExternalLocationBatch(batch)) return batch;
168
+
169
+ const url = batch.metadata?.get(LOCATION_KEY);
170
+ if (!url) return batch;
171
+
172
+ // Validate URL
173
+ const validator = config.urlValidator === null ? undefined : (config.urlValidator ?? httpsOnlyValidator);
174
+ if (validator) {
175
+ validator(url);
176
+ }
177
+
178
+ // Fetch
179
+ const response = await fetch(url);
180
+ if (!response.ok) {
181
+ throw new Error(`External location fetch failed: ${response.status} ${response.statusText} [url: ${url}]`);
182
+ }
183
+ let data = new Uint8Array(await response.arrayBuffer());
184
+
185
+ // Decompress if needed
186
+ const contentEncoding = response.headers.get("Content-Encoding");
187
+ if (contentEncoding === "zstd") {
188
+ data = new Uint8Array(zstdDecompress(data));
189
+ }
190
+
191
+ // Verify SHA-256 if present
192
+ const expectedSha256 = batch.metadata?.get(LOCATION_SHA256_KEY);
193
+ if (expectedSha256) {
194
+ const actualSha256 = await sha256Hex(data);
195
+ if (actualSha256 !== expectedSha256) {
196
+ throw new Error(`SHA-256 checksum mismatch for ${url}: expected ${expectedSha256}, got ${actualSha256}`);
197
+ }
198
+ }
199
+
200
+ // Parse IPC stream
201
+ const reader = await RecordBatchReader.from(data);
202
+ await reader.open();
203
+ const resolved = reader.next();
204
+ if (!resolved || resolved.done || !resolved.value) {
205
+ throw new Error(`No data batch found in external IPC stream from ${url}`);
206
+ }
207
+
208
+ return resolved.value;
209
+ }
package/src/gcs.ts ADDED
@@ -0,0 +1,86 @@
1
+ // © Copyright 2025-2026, Query.Farm LLC - https://query.farm
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ /**
5
+ * Google Cloud Storage backend for external storage of large Arrow IPC batches.
6
+ *
7
+ * Requires `@google-cloud/storage` as a peer dependency.
8
+ *
9
+ * @example
10
+ * ```typescript
11
+ * import { createGCSStorage } from "@query-farm/vgi-rpc/gcs";
12
+ *
13
+ * const storage = createGCSStorage({
14
+ * bucket: "my-bucket",
15
+ * prefix: "vgi-rpc/",
16
+ * });
17
+ * const handler = createHttpHandler(protocol, {
18
+ * externalLocation: { storage, externalizeThresholdBytes: 1_048_576 },
19
+ * });
20
+ * ```
21
+ */
22
+
23
+ import type { ExternalStorage } from "./external.js";
24
+
25
+ /** Configuration for the GCS storage backend. */
26
+ export interface GCSStorageConfig {
27
+ /** GCS bucket name. */
28
+ bucket: string;
29
+ /** Key prefix for uploaded objects. Default: "vgi-rpc/". */
30
+ prefix?: string;
31
+ /** Lifetime of signed GET URLs in seconds. Default: 3600 (1 hour). */
32
+ presignExpirySeconds?: number;
33
+ /** GCS project ID. If omitted, uses Application Default Credentials. */
34
+ projectId?: string;
35
+ }
36
+
37
+ /**
38
+ * Create a GCS-backed ExternalStorage.
39
+ *
40
+ * Lazily imports `@google-cloud/storage` on first upload to avoid
41
+ * loading the SDK unless needed.
42
+ */
43
+ export function createGCSStorage(config: GCSStorageConfig): ExternalStorage {
44
+ const bucket = config.bucket;
45
+ const prefix = config.prefix ?? "vgi-rpc/";
46
+ const presignExpiry = config.presignExpirySeconds ?? 3600;
47
+
48
+ let storageClient: any = null;
49
+
50
+ async function ensureClient(): Promise<any> {
51
+ if (storageClient) return storageClient;
52
+ const { Storage } = await import("@google-cloud/storage");
53
+ const clientConfig: Record<string, any> = {};
54
+ if (config.projectId) clientConfig.projectId = config.projectId;
55
+ storageClient = new Storage(clientConfig);
56
+ return storageClient;
57
+ }
58
+
59
+ return {
60
+ async upload(data: Uint8Array, contentEncoding: string): Promise<string> {
61
+ const client = await ensureClient();
62
+ const bucketRef = client.bucket(bucket);
63
+ const blobName = `${prefix}${crypto.randomUUID()}${contentEncoding === "zstd" ? ".arrow.zst" : ".arrow"}`;
64
+ const blob = bucketRef.file(blobName);
65
+
66
+ const options: Record<string, any> = {
67
+ contentType: "application/vnd.apache.arrow.stream",
68
+ resumable: false,
69
+ };
70
+ if (contentEncoding) {
71
+ options.metadata = { contentEncoding };
72
+ }
73
+
74
+ await blob.save(Buffer.from(data), options);
75
+
76
+ // Generate signed GET URL
77
+ const [url] = await blob.getSignedUrl({
78
+ version: "v4" as const,
79
+ action: "read" as const,
80
+ expires: Date.now() + presignExpiry * 1000,
81
+ });
82
+
83
+ return url;
84
+ },
85
+ };
86
+ }