npm - @gscdump/cloudflare - Versions diffs - 0.19.7 → 0.20.1 - Mend

@gscdump/cloudflare 0.19.7 → 0.20.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.d.mts CHANGED Viewed

@@ -13,18 +13,27 @@ interface AnalyticsEnv {
    * Optional: DuckDB service binding (Workers RPC) for server-side execution.
    * Structural shape — any binding with `runSQL` + `ping` satisfies it, so
    * hosts can declare their own binding interface without coupling to this one.
+   * Request tables use Arrow IPC chunks. Small queries pass tables inline to
+   * `runSQL`; larger queries stage chunks via `stageArrowTable`, then call
+   * `runSQL` with only SQL, and finally `dropTables`.
    */
   DUCKDB_SVC?: {
     runSQL: (args: {
       sql: string;
       tables?: Record<string, {
-        rows: unknown[];
-        ddl?: string;
+        ipc: Uint8Array;
       }>;
     }) => Promise<{
       rows: unknown[];
       sql: string;
     }>;
+    stageArrowTable?: (args: {
+      table: string;
+      ipc: Uint8Array;
+    }) => Promise<void>;
+    dropTables?: (args: {
+      tables: string[];
+    }) => Promise<void>;
     ping: () => Promise<string>;
   };
   /** Route override: force D1 as the manifest source even if R2 is bound. */
@@ -83,5 +92,10 @@ declare function createR2Presigner(env: AnalyticsEnv): ({
 declare function signSizeHint(env: AnalyticsEnv, key: string, bytes: number): Promise<string>;
 declare function verifySizeHint(env: AnalyticsEnv, key: string, bytes: number, providedHex: string): Promise<boolean>;
 declare function createDucklingsCodec(_env: AnalyticsEnv): ParquetCodec;
-declare function createDucklingsExecutor(env: AnalyticsEnv): QueryExecutor;
+interface DucklingsExecutorOptions { // Optional byte budgets for the Arrow IPC transport used by createDucklingsExecutor.
+  ipcChunkBytes?: number; // Max bytes per encoded Arrow IPC chunk; the bundled implementation defaults to 8 MiB.
+  ipcDirectCallBytes?: number; // Max total IPC bytes passed inline to runSQL before staging is used; defaults to 30 MiB.
+  ipcTotalBytes?: number; // Max total IPC bytes per query (inline or staged); defaults to 64 MiB, above which execute throws.
+}
+declare function createDucklingsExecutor(env: AnalyticsEnv, opts?: DucklingsExecutorOptions): QueryExecutor;
 export { type AnalyticsEngineHooks, type AnalyticsEnv, type HostedR2QueryKeyInput, type InflightDedupe, type PresignOptions, type Row, createDucklingsCodec, createDucklingsExecutor, createInflightDedupe, createR2Presigner, getAnalyticsEngine, getHostedR2QueryKey, getWasmDuckDBFactory, resetWasmDuckDB, signSizeHint, useAnalyticsEnv, verifySizeHint };

package/dist/index.mjs CHANGED Viewed

@@ -1,7 +1,8 @@
-import { bindLiterals, canonicalEmptyParquetSchema, coerceRow, createStorageEngine } from "@gscdump/engine";
+import { SCHEMAS, bindLiterals, coerceRow, createStorageEngine } from "@gscdump/engine";
 import { createD1ManifestStore } from "@gscdump/engine-sqlite";
 import { createR2DataSource } from "@gscdump/engine/r2";
 import { createHyparquetCodec, decodeParquetToRows } from "@gscdump/engine/hyparquet";
+import { float64, int32, int64, tableFromArrays, tableToIPC, utf8 } from "@uwdata/flechette";
 import { createError } from "h3";
 import { AwsClient } from "aws4fetch";
 let handle = null;
@@ -17,12 +18,67 @@ function getWasmDuckDBFactory() {
 function resetWasmDuckDB() {
 	handle = null;
 }
+function arrowTypeForColumn(type) { // Maps a schema column type name to the matching flechette Arrow data type.
+	switch (type) {
+		case "VARCHAR":
+		case "DATE": return utf8(); // DATE is encoded with the same utf8 type as VARCHAR.
+		case "BIGINT": return int64();
+		case "INTEGER": return int32();
+		case "DOUBLE": return float64();
+	} // No default branch: any other type name yields undefined.
+}
+function rowsToArrowIPC(rows, schemaColumns) { // Encodes row objects as one Arrow IPC stream; optional schemaColumns fixes column order and types.
+	const colNames = [];
+	const seen = /* @__PURE__ */ new Set();
+	const add = (name) => { // Appends a column name once, preserving first-seen order.
+		if (!seen.has(name)) {
+			seen.add(name);
+			colNames.push(name);
+		}
+	};
+	if (schemaColumns) for (const c of schemaColumns) add(c.name); // Schema columns lead the column order.
+	for (const row of rows) for (const key in row) add(key); // Then any extra keys found on the rows.
+	const schemaType = /* @__PURE__ */ new Map();
+	if (schemaColumns) for (const c of schemaColumns) schemaType.set(c.name, c.type);
+	const data = {};
+	const types = {};
+	for (const name of colNames) {
+		const values = rows.map((r) => r[name] ?? null); // Missing, undefined and null cells all become null.
+		data[name] = values;
+		const known = schemaType.get(name);
+		if (known) { // A schema-declared type wins; skip inference for this column.
+			types[name] = arrowTypeForColumn(known);
+			continue;
+		}
+		let hasString = false;
+		let hasValue = false;
+		for (const v of values) {
+			if (v === null || v === void 0) continue;
+			hasValue = true;
+			if (typeof v === "string") {
+				hasString = true;
+				break;
+			}
+		}
+		if (hasString || !hasValue) types[name] = utf8(); // Other extras get no explicit type here (presumably inferred by tableFromArrays; confirm).
+	}
+	const ipc = tableToIPC(tableFromArrays(data, { types }), { format: "stream" }); // Requests the "stream" IPC format.
+	if (!ipc) throw new Error("rowsToArrowIPC: tableToIPC returned null"); // Defensive guard against a falsy encoder result.
+	return ipc;
+}
 function resolveSvc(env) {
 	const svc = env.DUCKDB_SVC;
 	if (!svc) throw new Error("DUCKDB_SVC service binding is not configured");
 	return svc;
 }
 const DUCKDB_RPC_TIMEOUT_MS = 22e3;
+const WORKER_R2_MAX_FILES = 96; // Default cap on the number of files in one planned read.
+const WORKER_R2_MAX_BYTES = 64 * 1024 * 1024; // 64 MiB default cap on the summed file sizes of one planned read.
+const WORKER_R2_DECODE_CONCURRENCY = 2; // Concurrent read+decode operations per placeholder.
+const WORKER_R2_HEAD_CONCURRENCY = 4; // Concurrent dataSource.head() size lookups.
+const IPC_CHUNK_BUDGET = 8 * 1024 * 1024; // 8 MiB default ceiling for a single Arrow IPC chunk.
+const IPC_DIRECT_CALL_BUDGET = 30 * 1024 * 1024; // 30 MiB default ceiling for passing tables inline to runSQL.
+const IPC_STAGED_TOTAL_BUDGET = 64 * 1024 * 1024; // 64 MiB default ceiling on total Arrow IPC bytes per query.
 var DuckDBServiceTimeoutError = class extends Error {
 	name = "DuckDBServiceTimeoutError";
 	constructor(timeoutMs) {
@@ -49,6 +105,33 @@ function tmpTableName(placeholder) {
 	const uuid = crypto.randomUUID().replace(/-/g, "_");
 	return `tmp_${placeholder.toLowerCase()}_${uuid}`;
 }
+async function mapLimit(items, concurrency, fn) { // Maps fn over items with a bounded number of calls in flight; results keep input order.
+	const limit = Math.max(1, Math.floor(concurrency)); // Clamp to a whole number of at least 1.
+	const out = Array.from({ length: items.length });
+	let next = 0; // Shared cursor: index of the next unclaimed item.
+	async function worker() {
+		while (next < items.length) {
+			const index = next++; // Claimed before the await, so no two workers take the same item.
+			out[index] = await fn(items[index], index);
+		}
+	}
+	await Promise.all(Array.from({ length: Math.min(limit, items.length) }, worker)); // Never start more workers than items.
+	return out;
+}
+function assertWorkerReadBudget(opts) { // Throws if the planned read exceeds the file-count budget or, when opts.sizes is given, the byte budget.
+	const maxFiles = opts.maxFiles ?? WORKER_R2_MAX_FILES;
+	const maxBytes = opts.maxBytes ?? WORKER_R2_MAX_BYTES;
+	const entries = Object.entries(opts.fileKeys);
+	const totalFiles = entries.reduce((acc, [, keys]) => acc + keys.length, 0); // Counts every key listed under every placeholder.
+	if (totalFiles > maxFiles) throw new Error(`createDucklingsExecutor: planned read spans ${totalFiles} files, exceeding the ${maxFiles} file Worker budget. Narrow the date range or route through a background/windowed query.`);
+	if (!opts.sizes) return; // Without a sizes map only the file-count check applies.
+	let totalBytes = 0;
+	for (const [, keys] of entries) for (const key of keys) {
+		const bytes = opts.sizes[key];
+		if (bytes !== void 0) totalBytes += Math.max(0, bytes); // Unknown sizes add nothing; negative sizes are clamped to 0.
+	}
+	if (totalBytes > maxBytes) throw new Error(`createDucklingsExecutor: planned read spans ${totalBytes} bytes, exceeding the ${maxBytes} byte Worker budget. Narrow the date range or route through a background/windowed query.`);
+}
 const ROW_CACHE_MAX_BYTES = 16 * 1024 * 1024;
 let rowCacheBytes = 0;
 const rowCache = /* @__PURE__ */ new Map();
@@ -57,6 +140,88 @@ function estimateRowsBytes(rows) {
 	const cols = Object.keys(rows[0]).length;
 	return rows.length * cols * 64;
 }
+function inferExtraColumnType(values) { // Picks "VARCHAR", "DOUBLE", "BIGINT" or "INTEGER" for a list of cell values.
+	let hasValue = false;
+	let hasString = false;
+	let hasFloat = false;
+	let hasBigInt = false;
+	for (const value of values) {
+		if (value === null || value === void 0) continue; // Nulls carry no type information.
+		hasValue = true;
+		if (typeof value === "string") {
+			hasString = true;
+			break; // One string settles the answer as VARCHAR; stop scanning.
+		}
+		if (typeof value === "bigint") {
+			hasBigInt = true;
+			continue;
+		}
+		if (typeof value === "number") {
+			if (!Number.isInteger(value)) hasFloat = true; // Also true for NaN and Infinity.
+			if (value > 2147483647 || value < -2147483648) hasBigInt = true; // Outside the signed 32-bit range.
+		}
+	}
+	if (!hasValue || hasString) return "VARCHAR"; // All-null or empty input falls back to VARCHAR.
+	if (hasFloat) return "DOUBLE"; // A fractional value takes precedence over the integer widths.
+	return hasBigInt ? "BIGINT" : "INTEGER";
+}
+function chunkSchemaColumns(rows, schemaColumns) { // Returns the schema columns plus inferred entries for row keys the schema does not list.
+	const columns = schemaColumns ? [...schemaColumns] : []; // Copy so the caller's array is not mutated.
+	const seen = new Set(columns.map((c) => c.name));
+	const extraValues = /* @__PURE__ */ new Map();
+	for (const row of rows) for (const key in row) {
+		if (seen.has(key)) continue; // Schema-declared columns need no inference.
+		let values = extraValues.get(key);
+		if (!values) {
+			values = [];
+			extraValues.set(key, values);
+		}
+		values.push(row[key]); // Only rows that actually carry the key contribute a value.
+	}
+	if (extraValues.size === 0) return schemaColumns ? columns : void 0; // No extras: the schema copy, or undefined when no schema was given.
+	for (const [name, values] of extraValues) columns.push({
+		name,
+		type: inferExtraColumnType(values),
+		nullable: true
+	});
+	return columns;
+}
+function rowsToArrowIPCChunks(rows, schemaColumns, opts = {}) { // Splits rows into Arrow IPC chunks of at most opts.maxChunkBytes each; returns [{ ipc, rows }].
+	const maxChunkBytes = Math.max(1, Math.floor(opts.maxChunkBytes ?? IPC_CHUNK_BUDGET)); // Whole number of bytes, at least 1.
+	const placeholder = opts.placeholder ? `{{${opts.placeholder}}}` : "placeholder"; // Label used only in error messages.
+	const chunkColumns = chunkSchemaColumns(rows, schemaColumns); // Computed once over all rows so every chunk uses the same column list.
+	if (rows.length === 0) { // Empty input still produces one zero-row chunk.
+		const ipc = rowsToArrowIPC([], chunkColumns);
+		if (ipc.byteLength > maxChunkBytes) throw new Error(`createDucklingsExecutor: empty ${placeholder} Arrow IPC schema encoded to ${ipc.byteLength} bytes, exceeding the ${maxChunkBytes}-byte service-binding chunk budget.`);
+		return [{
+			ipc,
+			rows: 0
+		}];
+	}
+	const estimatedPerRow = Math.max(1, Math.ceil(estimateRowsBytes(rows) / rows.length)); // Heuristic per-row size, not a measured one.
+	let targetRows = Math.max(1, Math.min(rows.length, Math.floor(maxChunkBytes * .75 / estimatedPerRow))); // First guess aims at 75% of the budget.
+	const chunks = [];
+	let index = 0;
+	while (index < rows.length) {
+		let take = Math.min(targetRows, rows.length - index);
+		while (true) { // Encode the slice; halve its row count until it fits the budget.
+			const slice = rows.slice(index, index + take);
+			const ipc = rowsToArrowIPC(slice, chunkColumns);
+			if (ipc.byteLength <= maxChunkBytes) {
+				chunks.push({
+					ipc,
+					rows: slice.length
+				});
+				index += take;
+				if (ipc.byteLength < maxChunkBytes * .4 && take === targetRows) targetRows = Math.min(rows.length - index || targetRows, targetRows * 2); // Under 40% of budget at full target: double the target, capped by the rows remaining.
+				break;
+			}
+			if (take === 1) throw new Error(`createDucklingsExecutor: one ${placeholder} row encoded to ${ipc.byteLength} bytes of Arrow IPC, exceeding the ${maxChunkBytes}-byte service-binding chunk budget. Narrow the query or route through a background/windowed query.`);
+			take = Math.max(1, Math.floor(take / 2)); // Too large: retry with half as many rows.
+		}
+	}
+	return chunks;
+}
 function rowCacheGet(key) {
 	const hit = rowCache.get(key);
 	if (!hit) return void 0;
@@ -80,41 +245,97 @@ function rowCachePut(key, rows) {
 	});
 	rowCacheBytes += bytes;
 }
-function createDucklingsExecutor(env) {
-	return { async execute({ sql, params, fileKeys, dataSource, signal, table }) {
+function createDucklingsExecutor(env, opts = {}) {
+	return { async execute({ sql, params, fileKeys, placeholderTables, dataSource, signal, table }) {
 		signal?.throwIfAborted();
 		const svc = resolveSvc(env);
+		assertWorkerReadBudget({ fileKeys });
+		if (dataSource.head) {
+			const uniqueKeys = [...new Set(Object.values(fileKeys).flat())];
+			const sizes = {};
+			await mapLimit(uniqueKeys, WORKER_R2_HEAD_CONCURRENCY, async (key) => {
+				signal?.throwIfAborted();
+				sizes[key] = (await dataSource.head(key))?.bytes;
+			});
+			assertWorkerReadBudget({
+				fileKeys,
+				sizes
+			});
+		}
 		const tempNames = {};
-		const tables = {};
-		await Promise.all(Object.entries(fileKeys).map(async ([placeholder, keys]) => {
-			const perFile = await Promise.all(keys.map(async (key) => {
+		const tableChunks = {};
+		let totalIpcBytes = 0;
+		const maxChunkBytes = opts.ipcChunkBytes ?? IPC_CHUNK_BUDGET;
+		const maxDirectCallBytes = opts.ipcDirectCallBytes ?? IPC_DIRECT_CALL_BUDGET;
+		const maxTotalBytes = opts.ipcTotalBytes ?? IPC_STAGED_TOTAL_BUDGET;
+		for (const [placeholder, keys] of Object.entries(fileKeys)) {
+			signal?.throwIfAborted();
+			const perFile = await mapLimit(keys, WORKER_R2_DECODE_CONCURRENCY, async (key) => {
 				const cached = rowCacheGet(key);
 				if (cached) return cached;
-				const rows = await decodeParquetToRows(await dataSource.read(key));
+				signal?.throwIfAborted();
+				const bytes = await dataSource.read(key, void 0, signal);
+				signal?.throwIfAborted();
+				const rows = await decodeParquetToRows(bytes);
 				rowCachePut(key, rows);
 				return rows;
-			}));
+			});
 			const merged = [];
 			for (const rows of perFile) merged.push(...rows);
-			const mergedBytes = estimateRowsBytes(merged);
-			if (mergedBytes > 28 * 1024 * 1024) throw new Error(`createDucklingsExecutor: placeholder {{${placeholder}}} decoded to ~${mergedBytes} bytes (${merged.length} rows), exceeding the 28MiB service-binding RPC budget. The rollup builder must window this query (chunk partitions) instead of scanning all files at once.`);
+			const chunks = rowsToArrowIPCChunks(merged, SCHEMAS[placeholderTables?.[placeholder] ?? table]?.columns, {
+				maxChunkBytes,
+				placeholder
+			});
+			signal?.throwIfAborted();
+			totalIpcBytes += chunks.reduce((acc, chunk) => acc + chunk.ipc.byteLength, 0);
+			if (totalIpcBytes > maxTotalBytes) throw new Error(`createDucklingsExecutor: query encoded to ${totalIpcBytes} bytes of Arrow IPC across ${Object.keys(tableChunks).length + 1} placeholders, exceeding the ${maxTotalBytes}-byte service-binding transport budget. Window the query (chunk partitions / narrow the range).`);
 			const tmp = tmpTableName(placeholder);
 			tempNames[placeholder] = tmp;
-			tables[tmp] = {
-				rows: merged,
-				ddl: `AS SELECT * FROM ${canonicalEmptyParquetSchema(table)} WHERE FALSE`
-			};
-		}));
+			tableChunks[tmp] = chunks;
+		}
 		signal?.throwIfAborted();
 		const finalSql = bindLiterals(sql.replace(READ_PARQUET_PLACEHOLDER, (_, placeholder) => {
 			const tmp = tempNames[placeholder];
 			if (!tmp) throw new Error(`createDucklingsExecutor: SQL references {{${placeholder}}} but no fileKeys entry provided`);
 			return tmp;
 		}), params);
-		const result = await withDuckDBDeadline(svc.runSQL({
-			sql: finalSql,
-			tables
-		}), DUCKDB_RPC_TIMEOUT_MS, signal);
+		const canInlineTables = totalIpcBytes <= maxDirectCallBytes && Object.values(tableChunks).every((chunks) => chunks.length === 1);
+		let result;
+		if (canInlineTables) {
+			const tables = {};
+			for (const [name, chunks] of Object.entries(tableChunks)) tables[name] = { ipc: chunks[0].ipc };
+			result = await withDuckDBDeadline(svc.runSQL({
+				sql: finalSql,
+				tables
+			}), DUCKDB_RPC_TIMEOUT_MS, signal);
+		} else {
+			if (!svc.stageArrowTable || !svc.dropTables) throw new Error("createDucklingsExecutor: DUCKDB_SVC does not support chunked Arrow IPC staging. Deploy the gscdump-duckdb worker with stageArrowTable/dropTables support.");
+			const staged = /* @__PURE__ */ new Set();
+			let primaryError;
+			let cleanupError;
+			try {
+				for (const [name, chunks] of Object.entries(tableChunks)) for (const chunk of chunks) {
+					signal?.throwIfAborted();
+					staged.add(name);
+					await withDuckDBDeadline(svc.stageArrowTable({
+						table: name,
+						ipc: chunk.ipc
+					}), DUCKDB_RPC_TIMEOUT_MS, signal);
+				}
+				result = await withDuckDBDeadline(svc.runSQL({ sql: finalSql }), DUCKDB_RPC_TIMEOUT_MS, signal);
+			} catch (error) {
+				primaryError = error;
+			} finally {
+				if (staged.size > 0) try {
+					await withDuckDBDeadline(svc.dropTables({ tables: [...staged] }), DUCKDB_RPC_TIMEOUT_MS);
+				} catch (error) {
+					cleanupError = error;
+				}
+			}
+			if (primaryError) throw primaryError;
+			if (cleanupError) throw cleanupError;
+			result = result;
+		}
 		return {
 			rows: result.rows.map(coerceRow),
 			sql: result.sql

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@gscdump/cloudflare",
   "type": "module",
-  "version": "0.19.7",
+  "version": "0.20.1",
   "description": "Cloudflare-Workers-flavored helpers for the gscdump analytics stack: AnalyticsEnv binding contract, R2 SigV4 presigner, size-hint HMAC, DuckDB Workers shims, engine factory.",
   "author": {
     "name": "Harlan Wilton",
@@ -39,12 +39,13 @@
     "h3": "^1.15.0"
   },
   "dependencies": {
+    "@uwdata/flechette": "^2.5.0",
     "aws4fetch": "^1.0.20",
-    "@gscdump/engine-sqlite": "0.19.7",
-    "@gscdump/engine": "0.19.7"
+    "@gscdump/engine-sqlite": "0.20.1",
+    "@gscdump/engine": "0.20.1"
   },
   "devDependencies": {
-    "@cloudflare/workers-types": "^4.20260519.1",
+    "@cloudflare/workers-types": "^4.20260520.1",
     "h3": "^1.15.11",
     "typescript": "^6.0.3"
   },