@gscdump/cloudflare 0.19.7 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -13,18 +13,27 @@ interface AnalyticsEnv {
13
13
  * Optional: DuckDB service binding (Workers RPC) for server-side execution.
14
14
  * Structural shape — any binding with `runSQL` + `ping` satisfies it, so
15
15
  * hosts can declare their own binding interface without coupling to this one.
16
+ * Request tables use Arrow IPC chunks. Small queries pass tables inline to
17
+ * `runSQL`; larger queries stage chunks via `stageArrowTable`, then call
18
+ * `runSQL` with only SQL, and finally `dropTables`.
16
19
  */
17
20
  DUCKDB_SVC?: {
18
21
  runSQL: (args: {
19
22
  sql: string;
20
23
  tables?: Record<string, {
21
- rows: unknown[];
22
- ddl?: string;
24
+ ipc: Uint8Array;
23
25
  }>;
24
26
  }) => Promise<{
25
27
  rows: unknown[];
26
28
  sql: string;
27
29
  }>;
30
+ stageArrowTable?: (args: {
31
+ table: string;
32
+ ipc: Uint8Array;
33
+ }) => Promise<void>;
34
+ dropTables?: (args: {
35
+ tables: string[];
36
+ }) => Promise<void>;
28
37
  ping: () => Promise<string>;
29
38
  };
30
39
  /** Route override: force D1 as the manifest source even if R2 is bound. */
@@ -83,5 +92,10 @@ declare function createR2Presigner(env: AnalyticsEnv): ({
83
92
  declare function signSizeHint(env: AnalyticsEnv, key: string, bytes: number): Promise<string>;
84
93
  declare function verifySizeHint(env: AnalyticsEnv, key: string, bytes: number, providedHex: string): Promise<boolean>;
85
94
  declare function createDucklingsCodec(_env: AnalyticsEnv): ParquetCodec;
86
- declare function createDucklingsExecutor(env: AnalyticsEnv): QueryExecutor;
95
+ interface DucklingsExecutorOptions {
96
+ ipcChunkBytes?: number;
97
+ ipcDirectCallBytes?: number;
98
+ ipcTotalBytes?: number;
99
+ }
100
+ declare function createDucklingsExecutor(env: AnalyticsEnv, opts?: DucklingsExecutorOptions): QueryExecutor;
87
101
  export { type AnalyticsEngineHooks, type AnalyticsEnv, type HostedR2QueryKeyInput, type InflightDedupe, type PresignOptions, type Row, createDucklingsCodec, createDucklingsExecutor, createInflightDedupe, createR2Presigner, getAnalyticsEngine, getHostedR2QueryKey, getWasmDuckDBFactory, resetWasmDuckDB, signSizeHint, useAnalyticsEnv, verifySizeHint };
package/dist/index.mjs CHANGED
@@ -1,7 +1,8 @@
1
- import { bindLiterals, canonicalEmptyParquetSchema, coerceRow, createStorageEngine } from "@gscdump/engine";
1
+ import { SCHEMAS, bindLiterals, coerceRow, createStorageEngine } from "@gscdump/engine";
2
2
  import { createD1ManifestStore } from "@gscdump/engine-sqlite";
3
3
  import { createR2DataSource } from "@gscdump/engine/r2";
4
4
  import { createHyparquetCodec, decodeParquetToRows } from "@gscdump/engine/hyparquet";
5
+ import { float64, int32, int64, tableFromArrays, tableToIPC, utf8 } from "@uwdata/flechette";
5
6
  import { createError } from "h3";
6
7
  import { AwsClient } from "aws4fetch";
7
8
  let handle = null;
@@ -17,12 +18,67 @@ function getWasmDuckDBFactory() {
17
18
  function resetWasmDuckDB() {
18
19
  handle = null;
19
20
  }
21
+ function arrowTypeForColumn(type) {
22
+ switch (type) {
23
+ case "VARCHAR":
24
+ case "DATE": return utf8();
25
+ case "BIGINT": return int64();
26
+ case "INTEGER": return int32();
27
+ case "DOUBLE": return float64();
28
+ }
29
+ }
30
+ function rowsToArrowIPC(rows, schemaColumns) {
31
+ const colNames = [];
32
+ const seen = /* @__PURE__ */ new Set();
33
+ const add = (name) => {
34
+ if (!seen.has(name)) {
35
+ seen.add(name);
36
+ colNames.push(name);
37
+ }
38
+ };
39
+ if (schemaColumns) for (const c of schemaColumns) add(c.name);
40
+ for (const row of rows) for (const key in row) add(key);
41
+ const schemaType = /* @__PURE__ */ new Map();
42
+ if (schemaColumns) for (const c of schemaColumns) schemaType.set(c.name, c.type);
43
+ const data = {};
44
+ const types = {};
45
+ for (const name of colNames) {
46
+ const values = rows.map((r) => r[name] ?? null);
47
+ data[name] = values;
48
+ const known = schemaType.get(name);
49
+ if (known) {
50
+ types[name] = arrowTypeForColumn(known);
51
+ continue;
52
+ }
53
+ let hasString = false;
54
+ let hasValue = false;
55
+ for (const v of values) {
56
+ if (v === null || v === void 0) continue;
57
+ hasValue = true;
58
+ if (typeof v === "string") {
59
+ hasString = true;
60
+ break;
61
+ }
62
+ }
63
+ if (hasString || !hasValue) types[name] = utf8();
64
+ }
65
+ const ipc = tableToIPC(tableFromArrays(data, { types }), { format: "stream" });
66
+ if (!ipc) throw new Error("rowsToArrowIPC: tableToIPC returned null");
67
+ return ipc;
68
+ }
20
69
  function resolveSvc(env) {
21
70
  const svc = env.DUCKDB_SVC;
22
71
  if (!svc) throw new Error("DUCKDB_SVC service binding is not configured");
23
72
  return svc;
24
73
  }
25
74
  const DUCKDB_RPC_TIMEOUT_MS = 22e3;
75
+ const WORKER_R2_MAX_FILES = 96;
76
+ const WORKER_R2_MAX_BYTES = 64 * 1024 * 1024;
77
+ const WORKER_R2_DECODE_CONCURRENCY = 2;
78
+ const WORKER_R2_HEAD_CONCURRENCY = 4;
79
+ const IPC_CHUNK_BUDGET = 8 * 1024 * 1024;
80
+ const IPC_DIRECT_CALL_BUDGET = 30 * 1024 * 1024;
81
+ const IPC_STAGED_TOTAL_BUDGET = 64 * 1024 * 1024;
26
82
  var DuckDBServiceTimeoutError = class extends Error {
27
83
  name = "DuckDBServiceTimeoutError";
28
84
  constructor(timeoutMs) {
@@ -49,6 +105,33 @@ function tmpTableName(placeholder) {
49
105
  const uuid = crypto.randomUUID().replace(/-/g, "_");
50
106
  return `tmp_${placeholder.toLowerCase()}_${uuid}`;
51
107
  }
108
+ async function mapLimit(items, concurrency, fn) {
109
+ const limit = Math.max(1, Math.floor(concurrency));
110
+ const out = Array.from({ length: items.length });
111
+ let next = 0;
112
+ async function worker() {
113
+ while (next < items.length) {
114
+ const index = next++;
115
+ out[index] = await fn(items[index], index);
116
+ }
117
+ }
118
+ await Promise.all(Array.from({ length: Math.min(limit, items.length) }, worker));
119
+ return out;
120
+ }
121
+ function assertWorkerReadBudget(opts) {
122
+ const maxFiles = opts.maxFiles ?? WORKER_R2_MAX_FILES;
123
+ const maxBytes = opts.maxBytes ?? WORKER_R2_MAX_BYTES;
124
+ const entries = Object.entries(opts.fileKeys);
125
+ const totalFiles = entries.reduce((acc, [, keys]) => acc + keys.length, 0);
126
+ if (totalFiles > maxFiles) throw new Error(`createDucklingsExecutor: planned read spans ${totalFiles} files, exceeding the ${maxFiles} file Worker budget. Narrow the date range or route through a background/windowed query.`);
127
+ if (!opts.sizes) return;
128
+ let totalBytes = 0;
129
+ for (const [, keys] of entries) for (const key of keys) {
130
+ const bytes = opts.sizes[key];
131
+ if (bytes !== void 0) totalBytes += Math.max(0, bytes);
132
+ }
133
+ if (totalBytes > maxBytes) throw new Error(`createDucklingsExecutor: planned read spans ${totalBytes} bytes, exceeding the ${maxBytes} byte Worker budget. Narrow the date range or route through a background/windowed query.`);
134
+ }
52
135
  const ROW_CACHE_MAX_BYTES = 16 * 1024 * 1024;
53
136
  let rowCacheBytes = 0;
54
137
  const rowCache = /* @__PURE__ */ new Map();
@@ -57,6 +140,88 @@ function estimateRowsBytes(rows) {
57
140
  const cols = Object.keys(rows[0]).length;
58
141
  return rows.length * cols * 64;
59
142
  }
143
+ function inferExtraColumnType(values) {
144
+ let hasValue = false;
145
+ let hasString = false;
146
+ let hasFloat = false;
147
+ let hasBigInt = false;
148
+ for (const value of values) {
149
+ if (value === null || value === void 0) continue;
150
+ hasValue = true;
151
+ if (typeof value === "string") {
152
+ hasString = true;
153
+ break;
154
+ }
155
+ if (typeof value === "bigint") {
156
+ hasBigInt = true;
157
+ continue;
158
+ }
159
+ if (typeof value === "number") {
160
+ if (!Number.isInteger(value)) hasFloat = true;
161
+ if (value > 2147483647 || value < -2147483648) hasBigInt = true;
162
+ }
163
+ }
164
+ if (!hasValue || hasString) return "VARCHAR";
165
+ if (hasFloat) return "DOUBLE";
166
+ return hasBigInt ? "BIGINT" : "INTEGER";
167
+ }
168
+ function chunkSchemaColumns(rows, schemaColumns) {
169
+ const columns = schemaColumns ? [...schemaColumns] : [];
170
+ const seen = new Set(columns.map((c) => c.name));
171
+ const extraValues = /* @__PURE__ */ new Map();
172
+ for (const row of rows) for (const key in row) {
173
+ if (seen.has(key)) continue;
174
+ let values = extraValues.get(key);
175
+ if (!values) {
176
+ values = [];
177
+ extraValues.set(key, values);
178
+ }
179
+ values.push(row[key]);
180
+ }
181
+ if (extraValues.size === 0) return schemaColumns ? columns : void 0;
182
+ for (const [name, values] of extraValues) columns.push({
183
+ name,
184
+ type: inferExtraColumnType(values),
185
+ nullable: true
186
+ });
187
+ return columns;
188
+ }
189
+ function rowsToArrowIPCChunks(rows, schemaColumns, opts = {}) {
190
+ const maxChunkBytes = Math.max(1, Math.floor(opts.maxChunkBytes ?? IPC_CHUNK_BUDGET));
191
+ const placeholder = opts.placeholder ? `{{${opts.placeholder}}}` : "placeholder";
192
+ const chunkColumns = chunkSchemaColumns(rows, schemaColumns);
193
+ if (rows.length === 0) {
194
+ const ipc = rowsToArrowIPC([], chunkColumns);
195
+ if (ipc.byteLength > maxChunkBytes) throw new Error(`createDucklingsExecutor: empty ${placeholder} Arrow IPC schema encoded to ${ipc.byteLength} bytes, exceeding the ${maxChunkBytes}-byte service-binding chunk budget.`);
196
+ return [{
197
+ ipc,
198
+ rows: 0
199
+ }];
200
+ }
201
+ const estimatedPerRow = Math.max(1, Math.ceil(estimateRowsBytes(rows) / rows.length));
202
+ let targetRows = Math.max(1, Math.min(rows.length, Math.floor(maxChunkBytes * .75 / estimatedPerRow)));
203
+ const chunks = [];
204
+ let index = 0;
205
+ while (index < rows.length) {
206
+ let take = Math.min(targetRows, rows.length - index);
207
+ while (true) {
208
+ const slice = rows.slice(index, index + take);
209
+ const ipc = rowsToArrowIPC(slice, chunkColumns);
210
+ if (ipc.byteLength <= maxChunkBytes) {
211
+ chunks.push({
212
+ ipc,
213
+ rows: slice.length
214
+ });
215
+ index += take;
216
+ if (ipc.byteLength < maxChunkBytes * .4 && take === targetRows) targetRows = Math.min(rows.length - index || targetRows, targetRows * 2);
217
+ break;
218
+ }
219
+ if (take === 1) throw new Error(`createDucklingsExecutor: one ${placeholder} row encoded to ${ipc.byteLength} bytes of Arrow IPC, exceeding the ${maxChunkBytes}-byte service-binding chunk budget. Narrow the query or route through a background/windowed query.`);
220
+ take = Math.max(1, Math.floor(take / 2));
221
+ }
222
+ }
223
+ return chunks;
224
+ }
60
225
  function rowCacheGet(key) {
61
226
  const hit = rowCache.get(key);
62
227
  if (!hit) return void 0;
@@ -80,41 +245,97 @@ function rowCachePut(key, rows) {
80
245
  });
81
246
  rowCacheBytes += bytes;
82
247
  }
83
- function createDucklingsExecutor(env) {
84
- return { async execute({ sql, params, fileKeys, dataSource, signal, table }) {
248
+ function createDucklingsExecutor(env, opts = {}) {
249
+ return { async execute({ sql, params, fileKeys, placeholderTables, dataSource, signal, table }) {
85
250
  signal?.throwIfAborted();
86
251
  const svc = resolveSvc(env);
252
+ assertWorkerReadBudget({ fileKeys });
253
+ if (dataSource.head) {
254
+ const uniqueKeys = [...new Set(Object.values(fileKeys).flat())];
255
+ const sizes = {};
256
+ await mapLimit(uniqueKeys, WORKER_R2_HEAD_CONCURRENCY, async (key) => {
257
+ signal?.throwIfAborted();
258
+ sizes[key] = (await dataSource.head(key))?.bytes;
259
+ });
260
+ assertWorkerReadBudget({
261
+ fileKeys,
262
+ sizes
263
+ });
264
+ }
87
265
  const tempNames = {};
88
- const tables = {};
89
- await Promise.all(Object.entries(fileKeys).map(async ([placeholder, keys]) => {
90
- const perFile = await Promise.all(keys.map(async (key) => {
266
+ const tableChunks = {};
267
+ let totalIpcBytes = 0;
268
+ const maxChunkBytes = opts.ipcChunkBytes ?? IPC_CHUNK_BUDGET;
269
+ const maxDirectCallBytes = opts.ipcDirectCallBytes ?? IPC_DIRECT_CALL_BUDGET;
270
+ const maxTotalBytes = opts.ipcTotalBytes ?? IPC_STAGED_TOTAL_BUDGET;
271
+ for (const [placeholder, keys] of Object.entries(fileKeys)) {
272
+ signal?.throwIfAborted();
273
+ const perFile = await mapLimit(keys, WORKER_R2_DECODE_CONCURRENCY, async (key) => {
91
274
  const cached = rowCacheGet(key);
92
275
  if (cached) return cached;
93
- const rows = await decodeParquetToRows(await dataSource.read(key));
276
+ signal?.throwIfAborted();
277
+ const bytes = await dataSource.read(key, void 0, signal);
278
+ signal?.throwIfAborted();
279
+ const rows = await decodeParquetToRows(bytes);
94
280
  rowCachePut(key, rows);
95
281
  return rows;
96
- }));
282
+ });
97
283
  const merged = [];
98
284
  for (const rows of perFile) merged.push(...rows);
99
- const mergedBytes = estimateRowsBytes(merged);
100
- if (mergedBytes > 28 * 1024 * 1024) throw new Error(`createDucklingsExecutor: placeholder {{${placeholder}}} decoded to ~${mergedBytes} bytes (${merged.length} rows), exceeding the 28MiB service-binding RPC budget. The rollup builder must window this query (chunk partitions) instead of scanning all files at once.`);
285
+ const chunks = rowsToArrowIPCChunks(merged, SCHEMAS[placeholderTables?.[placeholder] ?? table]?.columns, {
286
+ maxChunkBytes,
287
+ placeholder
288
+ });
289
+ signal?.throwIfAborted();
290
+ totalIpcBytes += chunks.reduce((acc, chunk) => acc + chunk.ipc.byteLength, 0);
291
+ if (totalIpcBytes > maxTotalBytes) throw new Error(`createDucklingsExecutor: query encoded to ${totalIpcBytes} bytes of Arrow IPC across ${Object.keys(tableChunks).length + 1} placeholders, exceeding the ${maxTotalBytes}-byte service-binding transport budget. Window the query (chunk partitions / narrow the range).`);
101
292
  const tmp = tmpTableName(placeholder);
102
293
  tempNames[placeholder] = tmp;
103
- tables[tmp] = {
104
- rows: merged,
105
- ddl: `AS SELECT * FROM ${canonicalEmptyParquetSchema(table)} WHERE FALSE`
106
- };
107
- }));
294
+ tableChunks[tmp] = chunks;
295
+ }
108
296
  signal?.throwIfAborted();
109
297
  const finalSql = bindLiterals(sql.replace(READ_PARQUET_PLACEHOLDER, (_, placeholder) => {
110
298
  const tmp = tempNames[placeholder];
111
299
  if (!tmp) throw new Error(`createDucklingsExecutor: SQL references {{${placeholder}}} but no fileKeys entry provided`);
112
300
  return tmp;
113
301
  }), params);
114
- const result = await withDuckDBDeadline(svc.runSQL({
115
- sql: finalSql,
116
- tables
117
- }), DUCKDB_RPC_TIMEOUT_MS, signal);
302
+ const canInlineTables = totalIpcBytes <= maxDirectCallBytes && Object.values(tableChunks).every((chunks) => chunks.length === 1);
303
+ let result;
304
+ if (canInlineTables) {
305
+ const tables = {};
306
+ for (const [name, chunks] of Object.entries(tableChunks)) tables[name] = { ipc: chunks[0].ipc };
307
+ result = await withDuckDBDeadline(svc.runSQL({
308
+ sql: finalSql,
309
+ tables
310
+ }), DUCKDB_RPC_TIMEOUT_MS, signal);
311
+ } else {
312
+ if (!svc.stageArrowTable || !svc.dropTables) throw new Error("createDucklingsExecutor: DUCKDB_SVC does not support chunked Arrow IPC staging. Deploy the gscdump-duckdb worker with stageArrowTable/dropTables support.");
313
+ const staged = /* @__PURE__ */ new Set();
314
+ let primaryError;
315
+ let cleanupError;
316
+ try {
317
+ for (const [name, chunks] of Object.entries(tableChunks)) for (const chunk of chunks) {
318
+ signal?.throwIfAborted();
319
+ staged.add(name);
320
+ await withDuckDBDeadline(svc.stageArrowTable({
321
+ table: name,
322
+ ipc: chunk.ipc
323
+ }), DUCKDB_RPC_TIMEOUT_MS, signal);
324
+ }
325
+ result = await withDuckDBDeadline(svc.runSQL({ sql: finalSql }), DUCKDB_RPC_TIMEOUT_MS, signal);
326
+ } catch (error) {
327
+ primaryError = error;
328
+ } finally {
329
+ if (staged.size > 0) try {
330
+ await withDuckDBDeadline(svc.dropTables({ tables: [...staged] }), DUCKDB_RPC_TIMEOUT_MS);
331
+ } catch (error) {
332
+ cleanupError = error;
333
+ }
334
+ }
335
+ if (primaryError) throw primaryError;
336
+ if (cleanupError) throw cleanupError;
337
+ result = result;
338
+ }
118
339
  return {
119
340
  rows: result.rows.map(coerceRow),
120
341
  sql: result.sql
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/cloudflare",
3
3
  "type": "module",
4
- "version": "0.19.7",
4
+ "version": "0.20.1",
5
5
  "description": "Cloudflare-Workers-flavored helpers for the gscdump analytics stack: AnalyticsEnv binding contract, R2 SigV4 presigner, size-hint HMAC, DuckDB Workers shims, engine factory.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -39,12 +39,13 @@
39
39
  "h3": "^1.15.0"
40
40
  },
41
41
  "dependencies": {
42
+ "@uwdata/flechette": "^2.5.0",
42
43
  "aws4fetch": "^1.0.20",
43
- "@gscdump/engine-sqlite": "0.19.7",
44
- "@gscdump/engine": "0.19.7"
44
+ "@gscdump/engine-sqlite": "0.20.1",
45
+ "@gscdump/engine": "0.20.1"
45
46
  },
46
47
  "devDependencies": {
47
- "@cloudflare/workers-types": "^4.20260519.1",
48
+ "@cloudflare/workers-types": "^4.20260520.1",
48
49
  "h3": "^1.15.11",
49
50
  "typescript": "^6.0.3"
50
51
  },