npm - @gscdump/engine - Versions diffs - 0.28.2 → 0.29.0 - Mend

@gscdump/engine 0.28.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/dist/_chunks/duckdb.d.mts +1 -1
package/dist/_chunks/engine.mjs +55 -7
package/dist/_chunks/index.d.mts +6 -2
package/dist/_chunks/libs/hyparquet-compressors.mjs +9 -9
package/dist/_chunks/libs/icebird.mjs +6 -6
package/dist/_chunks/schema.d.mts +16 -9
package/dist/_chunks/schema.mjs +6 -3
package/dist/_chunks/schema2.mjs +30 -3
package/dist/_chunks/sink.d.mts +64 -10
package/dist/_chunks/storage.d.mts +16 -0
package/dist/_chunks/types.d.mts +7 -2
package/dist/adapters/duckdb-node.mjs +18 -6
package/dist/adapters/hyparquet.d.mts +10 -2
package/dist/adapters/hyparquet.mjs +132 -42
package/dist/adapters/node.mjs +1 -1
package/dist/errors.mjs +1 -1
package/dist/iceberg/index.d.mts +2 -2
package/dist/iceberg/index.mjs +47 -29
package/dist/ingest.mjs +5 -3
package/dist/rollups.mjs +1 -1
package/dist/schema.d.mts +2 -2
package/dist/schema.mjs +2 -2
package/dist/sql-fragments.d.mts +24 -1
package/dist/sql-fragments.mjs +6 -1
package/package.json +7 -7

package/dist/_chunks/duckdb.d.mts CHANGED Viewed

@@ -19,7 +19,7 @@ declare function createDuckDBExecutor(factory: DuckDBFactory): QueryExecutor;
 /**
  * Canonical "empty-file" SELECT clause for a table. Codecs that need to
  * emit a schema-correct empty Parquet can wrap this in:
- *   `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET)`
+ *   `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET, COMPRESSION ZSTD)`
  * to satisfy the ParquetCodec empty-rows invariant.
  */
 declare function canonicalEmptyParquetSchema(table: TableName): string;

package/dist/_chunks/engine.mjs CHANGED Viewed

@@ -1,6 +1,7 @@
 import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
-import { SCHEMAS, currentSchemaVersion, dedupeByNaturalKey } from "./schema.mjs";
+import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
 import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
+import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
 import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
 import { sqlEscape } from "../sql-bind.mjs";
 import { buildLogicalPlan } from "gscdump/query/plan";
@@ -13,7 +14,7 @@ async function encodeBytes(db, table, rows) {
 	await db.registerFileBuffer(inName, jsonBytes);
 	registered.push(inName);
 	try {
-		const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`;
+		const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`;
 		await db.query(sql);
 		registered.push(outName);
 		return await db.copyFileToBuffer(outName);
@@ -58,7 +59,7 @@ function createDuckDBCodec(factory) {
 				const outName = db.makeTempPath("parquet");
 				const fileList = inputUris.map((u) => `'${sqlEscape(u)}'`).join(", ");
 				try {
-					await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
+					await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`);
 					const bytes = await db.copyFileToBuffer(outName);
 					const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
 					const rowCount = Number(countRows[0]?.n ?? 0);
@@ -83,7 +84,7 @@ function createDuckDBCodec(factory) {
 			}
 			try {
 				const fileList = inNames.map((n) => `'${sqlEscape(n)}'`).join(", ");
-				await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
+				await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`);
 				registered.push(outName);
 				const bytes = await db.copyFileToBuffer(outName);
 				const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
@@ -162,9 +163,7 @@ function canonicalEmptyParquetSchema(table) {
 }
 function dateReplaceClause(table) {
 	if (!table) return "";
-	const dateCols = SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
-	if (dateCols.length === 0) return "";
-	return `REPLACE (${dateCols.map((n) => `strftime(CAST(${n} AS DATE), '%Y-%m-%d') AS ${n}`).join(", ")})`;
+	return dateReplaceClause$1(dateColumnsFor(table), "string");
 }
 function columnList(table) {
 	return SCHEMAS[table].columns.map((c) => c.name).join(", ");
@@ -272,6 +271,52 @@ async function gcOrphansImpl(deps, now, graceMs, opts = {}) {
 	}
 	return { deleted: retired.length + sweptOrphans + hourlyDeleted };
 }
+const PUSHABLE_COLUMN = { query: "query" };
+function txLeaf(leaf, columns) {
+	if (leaf.operator !== "equals") return null;
+	const column = PUSHABLE_COLUMN[leaf.dimension];
+	if (!column || !columns.has(column)) return null;
+	return { [column]: { $eq: leaf.expression } };
+}
+function txExact(node, columns) {
+	const groupType = node._groupType ?? "and";
+	const leafParts = [];
+	for (const leaf of node._filters) {
+		const t = txLeaf(leaf, columns);
+		if (!t) return null;
+		leafParts.push(t);
+	}
+	if (groupType === "or") {
+		if (node._nestedGroups?.length || leafParts.length === 0) return null;
+		return leafParts.length === 1 ? leafParts[0] : { $or: leafParts };
+	}
+	const parts = leafParts;
+	for (const group of node._nestedGroups ?? []) {
+		const t = txExact(group, columns);
+		if (!t) return null;
+		parts.push(t);
+	}
+	if (parts.length === 0) return null;
+	return parts.length === 1 ? parts[0] : { $and: parts };
+}
+function extractParquetPushdown(state, table) {
+	const filter = state?.filter;
+	const schema = SCHEMAS[table];
+	if (!filter || !schema) return void 0;
+	const columns = new Set(schema.columns.map((c) => c.name));
+	if ((filter._groupType ?? "and") === "or") return txExact(filter, columns) ?? void 0;
+	const parts = [];
+	for (const leaf of filter._filters) {
+		const t = txLeaf(leaf, columns);
+		if (t) parts.push(t);
+	}
+	for (const group of filter._nestedGroups ?? []) {
+		const t = txExact(group, columns);
+		if (t) parts.push(t);
+	}
+	if (parts.length === 0) return void 0;
+	return parts.length === 1 ? parts[0] : { $and: parts };
+}
 const URL_PURGE_TABLES = ["pages", "page_queries"];
 const MAX_DAY_BYTES = 100 * 1024 * 1024;
 const URL_COLUMNS = /* @__PURE__ */ new Set();
@@ -463,6 +508,7 @@ function createStorageEngine(opts) {
 			dataSource,
 			table,
 			signal: opts.signal,
+			...opts.pushdownFilters ? { pushdownFilters: opts.pushdownFilters } : {},
 			...profiler ? { profiler } : {}
 		});
 		endExec?.({ rows: result.rows.length });
@@ -476,6 +522,7 @@ function createStorageEngine(opts) {
 		const plan = buildLogicalPlan(state, { regex: true });
 		const table = ctx.table ?? plan.dataset;
 		const resolved = compileLogicalQueryPlan(plan, table);
+		const pushdown = extractParquetPushdown(state, table);
 		return runSQL({
 			ctx: {
 				userId: ctx.userId,
@@ -489,6 +536,7 @@ function createStorageEngine(opts) {
 			sql: resolved.sql,
 			params: resolved.params,
 			signal: ctx.signal,
+			...pushdown ? { pushdownFilters: { FILES: pushdown } } : {},
 			...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {},
 			...ctx.profiler ? { profiler: ctx.profiler } : {}
 		});

package/dist/_chunks/index.d.mts CHANGED Viewed

@@ -60,8 +60,12 @@ interface CreateSqlQuerySourceOptions<TKey extends string> {
   execute: (sql: string, params: unknown[]) => Promise<QueryRow[]>;
   /** Tenant id for multi-tenant dialects; forwarded to `resolveToSQL`. */
   siteId?: string | number;
-  /** Search-type scope for multi-tenant dialects; forwarded to `resolveToSQL`. */
-  searchType?: string;
+  /**
+   * Search-type scope for multi-tenant dialects; forwarded to `resolveToSQL`.
+   * `number` = int-encoded code (`SEARCH_TYPE_INT`) for INT `search_type`
+   * catalogs (bound bare so the int partition prunes); `string` otherwise.
+   */
+  searchType?: string | number;
   /** Additional capability flags merged on top of `adapter.capabilities`. */
   extraCapabilities?: Partial<SourceCapabilities>;
 }

package/dist/_chunks/libs/hyparquet-compressors.mjs CHANGED Viewed

@@ -175,8 +175,8 @@ function nextTableBitSize(count, len, root_bits) {
 }
 function buildHuffmanTable(root_table, table, root_bits, code_lengths, code_lengths_size) {
 	const start_table = table;
-	const count = new Int32Array(16);
-	const offset = new Int32Array(16);
+	const count = /* @__PURE__ */ new Int32Array(16);
+	const offset = /* @__PURE__ */ new Int32Array(16);
 	const sorted = new Int32Array(code_lengths_size);
 	for (let i = 0; i < code_lengths_size; i++) count[code_lengths[i]]++;
 	offset[1] = 0;
@@ -220,7 +220,7 @@ function readHuffmanCode(alphabet_size, tables, table, br) {
 	if (simple_code_or_skip === 1) {
 		let max_bits_counter = alphabet_size - 1;
 		let max_bits = 0;
-		const symbols = new Int32Array(4);
+		const symbols = /* @__PURE__ */ new Int32Array(4);
 		const num_symbols = br.readBits(2) + 1;
 		while (max_bits_counter) {
 			max_bits_counter >>= 1;
@@ -505,7 +505,7 @@ const fixedDistanceExtraBits = new Uint8Array([
 	0
 ]);
 function freb(eb, start) {
-	const base = new Uint16Array(31);
+	const base = /* @__PURE__ */ new Uint16Array(31);
 	for (let i = 0; i < 31; i++) base[i] = start += 1 << eb[i - 1];
 	const rev = new Int32Array(base[30]);
 	for (let i = 1; i < 30; i++) for (let j = base[i]; j < base[i + 1]; ++j) rev[j] = j - base[i] << 5 | i;
@@ -518,7 +518,7 @@ const { base: fixedLength, rev: revfl } = freb(fixedLengthExtraBits, 2);
 fixedLength[28] = 258;
 revfl[258] = 28;
 const { base: fixedDistance } = freb(fixedDistanceExtraBits, 0);
-const rev = new Uint16Array(32768);
+const rev = /* @__PURE__ */ new Uint16Array(32768);
 for (let i = 0; i < 32768; i++) {
 	let x = (i & 43690) >> 1 | (i & 21845) << 1;
 	x = (x & 52428) >> 2 | (x & 13107) << 2;
@@ -546,12 +546,12 @@ function huffMap(cd, maxBits, r) {
 	}
 	return co;
 }
-const fixedLengthTree = new Uint8Array(288);
+const fixedLengthTree = /* @__PURE__ */ new Uint8Array(288);
 for (let i = 0; i < 144; i++) fixedLengthTree[i] = 8;
 for (let i = 144; i < 256; i++) fixedLengthTree[i] = 9;
 for (let i = 256; i < 280; i++) fixedLengthTree[i] = 7;
 for (let i = 280; i < 288; i++) fixedLengthTree[i] = 8;
-const fixedDistanceTree = new Uint8Array(32);
+const fixedDistanceTree = /* @__PURE__ */ new Uint8Array(32);
 for (let i = 0; i < 32; i++) fixedDistanceTree[i] = 5;
 const fixedLengthMap = /*#__PURE__*/ huffMap(fixedLengthTree, 9, 1);
 const fixedDistanceMap = /*#__PURE__*/ huffMap(fixedDistanceTree, 5, 1);
@@ -2420,7 +2420,7 @@ function gzipStart(input, i) {
 	return i + (flag & 2);
 }
 function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
-	let out = output ?? new Uint8Array(1024);
+	let out = output ?? /* @__PURE__ */ new Uint8Array(1024);
 	if (!(input.length - inputIndex)) return out;
 	const payloadStart = gzipStart(input, inputIndex);
 	if (payloadStart === input.length - 8) return out;
@@ -2465,7 +2465,7 @@ function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
 				const tl = hLiteral + bits(input, pos + 5, 31) + 1;
 				pos += 14;
 				const lengthDistanceTree = new Uint8Array(tl);
-				const codeLengthTree = new Uint8Array(19);
+				const codeLengthTree = /* @__PURE__ */ new Uint8Array(19);
 				for (let i = 0; i < hcLengths; ++i) codeLengthTree[codeLengthIndexMap[i]] = bits(input, pos + i * 3, 7);
 				pos += hcLengths * 3;
 				const codeLengthBits = Math.max(...codeLengthTree);

package/dist/_chunks/libs/icebird.mjs CHANGED Viewed

@@ -962,7 +962,7 @@ function uuidToBytes(value, label) {
 	if (typeof value !== "string") throw new Error(`expected ${label}`);
 	const hex = value.toLowerCase().replace(/-/g, "");
 	if (!/^[0-9a-f]{32}$/.test(hex)) throw new Error(`expected ${label}`);
-	const bytes = new Uint8Array(16);
+	const bytes = /* @__PURE__ */ new Uint8Array(16);
 	for (let i = 0; i < bytes.length; i++) bytes[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
 	return bytes;
 }
@@ -1074,7 +1074,7 @@ function bucketBytes(value, sourceType) {
 			else if (t === "timestamp" || t === "timestamptz") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value);
 			else if (t === "timestamp_ns" || t === "timestamptz_ns") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value) / 1000n;
 			else v = typeof value === "bigint" ? value : BigInt(value);
-			const out = new Uint8Array(8);
+			const out = /* @__PURE__ */ new Uint8Array(8);
 			new DataView(out.buffer).setBigInt64(0, v, true);
 			return out;
 		}
@@ -1835,7 +1835,7 @@ function avroWrite({ writer, schema, records, blockSize = 512, metadata }) {
 		writer.appendBytes(vb);
 	}
 	writer.appendVarInt(0);
-	const sync = new Uint8Array(16);
+	const sync = /* @__PURE__ */ new Uint8Array(16);
 	for (let i = 0; i < 16; i++) sync[i] = Math.random() * 256 | 0;
 	writer.appendBytes(sync);
 	for (let i = 0; i < records.length; i += blockSize) {
@@ -1940,7 +1940,7 @@ function appendZigZag64(writer, v) {
 function uuidStringToBytes$1(value) {
 	const hex = value.toLowerCase().replace(/-/g, "");
 	if (!/^[0-9a-f]{32}$/.test(hex)) throw new Error("expected uuid string");
-	const bytes = new Uint8Array(16);
+	const bytes = /* @__PURE__ */ new Uint8Array(16);
 	for (let i = 0; i < 16; i++) bytes[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
 	return bytes;
 }
@@ -2531,7 +2531,7 @@ function twosComplementMinBigEndian(value) {
 function uuidStringToBytes(s) {
 	const hex = s.replace(/-/g, "");
 	if (hex.length !== 32) return void 0;
-	const out = new Uint8Array(16);
+	const out = /* @__PURE__ */ new Uint8Array(16);
 	for (let i = 0; i < 16; i++) {
 		const byte = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
 		if (Number.isNaN(byte)) return void 0;
@@ -3286,7 +3286,7 @@ function resolveParquetCodec(value) {
 }
 function newSnapshotId(metadata) {
 	const used = new Set((metadata?.snapshots ?? []).map((s) => BigInt(s["snapshot-id"])));
-	const arr = new BigInt64Array(1);
+	const arr = /* @__PURE__ */ new BigInt64Array(1);
 	for (let attempt = 0; attempt < 32; attempt++) {
 		globalThis.crypto.getRandomValues(arr);
 		const masked = arr[0] & 9007199254740991n;

package/dist/_chunks/schema.d.mts CHANGED Viewed

@@ -1087,12 +1087,12 @@ declare const hourly_pages: import("drizzle-orm/pg-core").PgTableWithColumns<{
       identity: undefined;
       generated: undefined;
     }>;
-    hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgVarcharBuilder<[string, ...string[]]>>, {
+    hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgIntegerBuilder>, {
       name: string;
       tableName: "hourly_pages";
-      dataType: "string";
-      data: string;
-      driverParam: string;
+      dataType: "number int32";
+      data: number;
+      driverParam: string | number;
       notNull: true;
       hasDefault: false;
       isPrimaryKey: false;
@@ -2193,12 +2193,12 @@ declare const drizzleSchema: {
         identity: undefined;
         generated: undefined;
       }>;
-      hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgVarcharBuilder<[string, ...string[]]>>, {
+      hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgIntegerBuilder>, {
         name: string;
         tableName: "hourly_pages";
-        dataType: "string";
-        data: string;
-        driverParam: string;
+        dataType: "number int32";
+        data: number;
+        driverParam: string | number;
         notNull: true;
         hasDefault: false;
         isPrimaryKey: false;
@@ -2236,6 +2236,13 @@ declare const TABLE_METADATA: Record<TableName, {
 declare const SCHEMAS: Record<TableName, TableSchema>;
 declare function currentSchemaVersion(table: TableName): number;
 declare function schemaFor(table: TableName): TableSchema;
+/**
+ * DATE column names for a table. The single schema-derived source every read
+ * path uses to build the legacy-VARCHAR date canonicalization (see
+ * `dateReplaceClause` in `./sql-fragments`), so the engine codec and the CLI
+ * `dump`/`export` commands agree on which columns to cast.
+ */
+declare function dateColumnsFor(table: TableName): string[];
 declare function allTables(): readonly TableName[];
 declare function inferTable(dimensions: readonly string[]): TableName;
 /**
@@ -2259,4 +2266,4 @@ declare function naturalKeyColumns(table: TableName): readonly string[];
  */
 declare function dedupeByNaturalKey(table: TableName, rows: readonly Row[]): Row[];
 declare function dimensionToColumn(dim: string, _table: TableName): string;
-export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
+export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };

package/dist/_chunks/schema.mjs CHANGED Viewed

@@ -74,7 +74,7 @@ const search_appearance_page_queries = pgTable("search_appearance_page_queries",
 });
 const hourly_pages = pgTable("hourly_pages", {
 	url: varchar("url").notNull(),
-	hour: varchar("hour").notNull(),
+	hour: integer("hour").notNull(),
 	date: dateCol(),
 	...metricCols()
 });
@@ -181,7 +181,7 @@ const TABLE_METADATA = {
 			"date",
 			"hour"
 		],
-		version: 1
+		version: 2
 	}
 };
 function pgSqlTypeToColumnType(sqlType) {
@@ -226,6 +226,9 @@ function currentSchemaVersion(table) {
 function schemaFor(table) {
 	return SCHEMAS[table];
 }
+function dateColumnsFor(table) {
+	return SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
+}
 function allTables() {
 	return METRIC_TABLES;
 }
@@ -260,4 +263,4 @@ function dimensionToColumn(dim, _table) {
 	if (dim === "queryCanonical") return "query_canonical";
 	return dim;
 }
-export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
+export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };

package/dist/_chunks/schema2.mjs CHANGED Viewed

@@ -10,6 +10,15 @@ const ICEBERG_TABLES = [
 	"search_appearance_queries",
 	"search_appearance_page_queries"
 ];
+const SEARCH_TYPE_INT = {
+	web: 1,
+	image: 2,
+	video: 3,
+	news: 4,
+	discover: 5,
+	googleNews: 6
+};
+const INT_SEARCH_TYPE = Object.fromEntries(Object.entries(SEARCH_TYPE_INT).map(([k, v]) => [v, k]));
 const ICEBERG_PARTITION_COLUMNS = [{
 	name: "site_id",
 	type: "STRING",
@@ -21,6 +30,20 @@ const ICEBERG_PARTITION_COLUMNS = [{
 	required: true,
 	fieldId: 2
 }];
+function icebergPartitionColumns(encoding = "string") {
+	if (encoding === "string") return ICEBERG_PARTITION_COLUMNS;
+	return [{
+		name: "site_id",
+		type: "INT",
+		required: true,
+		fieldId: 1
+	}, {
+		name: "search_type",
+		type: "INT",
+		required: true,
+		fieldId: 2
+	}];
+}
 const ICEBERG_FIELD_ID_BASE = 3;
 const ICEBERG_PARTITION_SPEC = [
 	{
@@ -48,7 +71,7 @@ function mapColumnType(t) {
 		case "DATE": return "DATE";
 	}
 }
-function icebergTableSpec(table) {
+function icebergTableSpec(table, encoding = "string") {
 	const base = SCHEMAS[table];
 	const dataColumns = base.columns.map((col, i) => ({
 		name: col.name,
@@ -58,7 +81,7 @@ function icebergTableSpec(table) {
 	}));
 	return {
 		table,
-		columns: [...ICEBERG_PARTITION_COLUMNS, ...dataColumns],
+		columns: [...icebergPartitionColumns(encoding), ...dataColumns],
 		partitionSpec: ICEBERG_PARTITION_SPEC,
 		identityColumns: [
 			"site_id",
@@ -68,6 +91,10 @@ function icebergTableSpec(table) {
 	};
 }
 const ICEBERG_SCHEMAS = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t)]));
+const ICEBERG_SCHEMAS_INT = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t, "int")]));
+function icebergSchemasFor(encoding = "string") {
+	return encoding === "int" ? ICEBERG_SCHEMAS_INT : ICEBERG_SCHEMAS;
+}
 const ICEBERG_TABLE_SET = new Set(ICEBERG_TABLES);
 function isIcebergTable(table) {
 	return ICEBERG_TABLE_SET.has(table);
@@ -76,4 +103,4 @@ function assertIcebergTable(table) {
 	if (!isIcebergTable(table)) throw new Error(`Unknown Iceberg table '${table}'. Expected one of: ${ICEBERG_TABLES.join(", ")}`);
 	return table;
 }
-export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable };
+export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable };

package/dist/_chunks/sink.d.mts CHANGED Viewed

@@ -50,6 +50,33 @@ interface IcebergColumn {
    */
   fieldId: number;
 }
+/**
+ * Partition-key encoding for the two identity columns (`site_id`, `search_type`).
+ *
+ * - `'string'` (default, legacy): both columns are STRING. Correct, but R2 SQL's
+ *   string min/max statistics are truncated in predicate pushdown, so a bare
+ *   `WHERE site_id='<uuid>'` UNDERCOUNTS — callers must CONCAT(col,'') to stay
+ *   correct, which defeats partition pruning.
+ * - `'int'`: BOTH `site_id` and `search_type` are INT. Integer statistics are
+ *   fixed-width and never truncated, so `WHERE site_id=<n>` is both correct AND
+ *   prunes (empirically confirmed 2026-06-19, gscdump.com probe-int64-partition;
+ *   INT equality proven via the search_type column in the engine e2e canary). A
+ *   small INT site_id is ample (≪ 2.1B sites) — no LONG/BigInt needed. The caller
+ *   maps the UUID `site_id` ↔ int (app-owned, per-tenant serial) and uses
+ *   {@link SEARCH_TYPE_INT} for `search_type` (engine-owned, fixed enum).
+ *
+ * New per-team catalogs are provisioned `'int'`; existing catalogs stay
+ * `'string'`. Purely additive: `'string'` is the default everywhere so existing
+ * tables, writers, and readers are unchanged.
+ */
+type PartitionKeyEncoding = 'string' | 'int';
+/**
+ * Stable `search_type` enum → int map for `'int'`-encoded catalogs. Engine-owned
+ * and FROZEN: never renumber or reuse an id (it's the on-disk partition value).
+ */
+declare const SEARCH_TYPE_INT: Record<SearchType, number>;
+/** Reverse of {@link SEARCH_TYPE_INT} — int → `search_type`, for read-result mapping. */
+declare const INT_SEARCH_TYPE: Record<number, SearchType>;
 /** Iceberg partition transform applied to a source column. */
 type IcebergPartitionTransform = 'identity' | 'month';
 interface IcebergPartitionField {
@@ -82,6 +109,15 @@ interface IcebergTableSpec {
  * contiguously from id 3 (see `ICEBERG_FIELD_ID_BASE`).
  */
 declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
+/**
+ * The two partition-identity columns for a given {@link PartitionKeyEncoding}.
+ * `'string'` returns {@link ICEBERG_PARTITION_COLUMNS} verbatim; `'int'` swaps
+ * BOTH to INT — `site_id` (the app's small `user_sites.int_id`; ≪ 2.1B sites, so
+ * INT is ample) and `search_type` (its fixed enum code). Integer identity columns
+ * avoid R2 SQL's truncated-string-stats equality undercount and restore pruning.
+ * Field ids are unchanged (1, 2) — only the column types differ.
+ */
+declare function icebergPartitionColumns(encoding?: PartitionKeyEncoding): readonly IcebergColumn[];
 /**
  * First field id used for per-table (non-partition) columns — immediately
  * after the two partition-identity columns (`site_id`=1, `search_type`=2).
@@ -105,9 +141,13 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
  * CONTRACT NOTE: implementation agents must treat the RETURNED VALUE as the
  * source of truth — do not hand-list columns elsewhere.
  */
-declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
-/** All Iceberg table specs, keyed by table name. */
+declare function icebergTableSpec(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergTableSpec;
+/** All Iceberg table specs (legacy `'string'` encoding), keyed by table name. */
 declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
+/** All Iceberg table specs in `'int'` encoding (INT site_id + INT search_type). */
+declare const ICEBERG_SCHEMAS_INT: Record<IcebergTableName, IcebergTableSpec>;
+/** Table specs for the given encoding (`'string'` default). */
+declare function icebergSchemasFor(encoding?: PartitionKeyEncoding): Record<IcebergTableName, IcebergTableSpec>;
 /** True when `table` is one of the canonical {@link ICEBERG_TABLES}. */
 declare function isIcebergTable(table: string): table is IcebergTableName;
 /**
@@ -172,14 +212,14 @@ interface IcebergConnection {
  * `ICEBERG_SCHEMAS` contract. Field ids are advisory — R2 Data Catalog
  * re-assigns them on `createTable` (see `ICEBERG_FIELD_ID_BASE`).
  */
-declare function icebergSchemaFor(table: IcebergTableName): IcebergSchema;
+declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSchema;
 /**
  * Build the icebird `PartitionSpec` for one of the 5 fact tables: the locked
  * spec `identity(site_id) + identity(search_type) + month(date)`. Each
  * partition field's `source-id` is resolved to the real column field id from
  * {@link icebergSchemaFor}.
  */
-declare function icebergPartitionSpecFor(table: IcebergTableName): IcebergPartitionSpec;
+declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
 /** Options for {@link connectIcebergCatalog}. */
 interface ConnectIcebergOptions {
   /**
@@ -264,7 +304,7 @@ declare function ensureIcebergNamespace(conn: IcebergConnection): Promise<void>;
  * than thrown so a partial run is observable; "table already exists" surfaces
  * as a failed result. Used by the app's one-off provisioning script.
  */
-declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[]): Promise<IcebergTableOpResult[]>;
+declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[], encoding?: PartitionKeyEncoding): Promise<IcebergTableOpResult[]>;
 /**
  * List the table names currently in the catalog namespace.
  *
@@ -284,10 +324,16 @@ interface IcebergListedDataFile {
 }
 interface ListIcebergDataFilesOptions {
   table: IcebergTableName;
-  /** Partition identity column. */
-  siteId: string;
-  /** Partition identity column. */
-  searchType: string;
+  /** Partition identity column. `number` for `'int'`-encoded catalogs. */
+  siteId: string | number;
+  /** Partition identity column. `number` (int code) for `'int'`-encoded catalogs. */
+  searchType: string | number;
+  /**
+   * Partition-key encoding of the catalog. `'int'` changes how manifest-summary
+   * bounds are decoded (int bytes vs UTF-8) and how the per-file partition value
+   * is compared. Defaults to `'string'`.
+   */
+  encoding?: PartitionKeyEncoding;
   /**
    * Inclusive date range. Every month touched by `[start, end]` is scanned;
    * `month(date)` is the third partition transform.
@@ -437,6 +483,14 @@ interface IcebergAppendSinkOptions extends SinkOptions {
    * uses the defaults; tests inject a synchronous `sleep`.
    */
   commitRetry?: CommitRetryOptions;
+  /**
+   * Partition-key encoding (default `'string'`). `'int'` writes BOTH `site_id`
+   * and `search_type` as INT — the caller MUST pass the numeric `site_id` (a
+   * numeric string is fine; it's `Number()`-coerced) in `slice.ctx.siteId`. A
+   * small INT is ample (≪ 2.1B sites), so no LONG/BigInt is involved. See
+   * {@link import('./iceberg/schema').PartitionKeyEncoding}.
+   */
+  encoding?: PartitionKeyEncoding;
 }
 /** `LocalIcebergSink` options — points at the local Iceberg REST catalog. */
 interface LocalIcebergSinkOptions extends SinkOptions {
@@ -447,4 +501,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
   /** S3-compatible warehouse location (POC: MinIO). */
   warehouse: string;
 }
-export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
+export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };

package/dist/_chunks/storage.d.mts CHANGED Viewed

@@ -1,3 +1,4 @@
+import { ParquetQueryFilter } from "hyparquet";
 import { BuilderState, SearchType, SearchType as SearchType$1 } from "gscdump/query";
 import { Grain, Grain as Grain$1, Row, Row as Row$1, TableName, TableName as TableName$1, TenantCtx, TenantCtx as TenantCtx$1 } from "@gscdump/contracts";
 /**
@@ -397,6 +398,15 @@ interface QueryExecuteOptions {
    * the page_queries schema, not the analyzer's primary `table`.
    */
   placeholderTables?: Record<string, TableName>;
+  /**
+   * Per-placeholder row-group pushdown filter, derived from the query's
+   * structured filter (see `extractParquetPushdown`). A pure-JS decode executor
+   * MAY pass it to the parquet reader to prune row groups and shrink the rows
+   * it materialises before the SQL WHERE re-applies. Pure optimization: the
+   * filter is a superset of the final predicate, so an executor that ignores it
+   * (e.g. native DuckDB, which pushes from the SQL itself) stays correct.
+   */
+  pushdownFilters?: Record<string, ParquetQueryFilter>;
   dataSource: DataSource;
   table: TableName;
   signal?: AbortSignal;
@@ -475,6 +485,12 @@ interface RunSQLOptions {
    * Undefined keeps the legacy cross-type union.
    */
   searchType?: SearchType;
+  /**
+   * Per-placeholder parquet pushdown filter, forwarded verbatim to the
+   * executor. Keyed by fileSet name (matching `fileSets`). See
+   * `QueryExecuteOptions.pushdownFilters` and `extractParquetPushdown`.
+   */
+  pushdownFilters?: Record<string, ParquetQueryFilter>;
   /**
    * Optional read-path profiler. `runSQL` emits `manifest.list` +
    * `executor.execute` spans and forwards it into the executor for the

package/dist/_chunks/types.d.mts CHANGED Viewed

@@ -30,8 +30,13 @@ interface ResolverOptions<TableKey extends string = string> {
   adapter: ResolverAdapter<TableKey>;
   /** Optional site scope. Required for multi-tenant D1; omitted for parquet. */
   siteId?: string | number;
-  /** Optional searchType scope. Required for multi-tenant Iceberg; omitted for parquet. */
-  searchType?: string;
+  /**
+   * Optional searchType scope. Required for multi-tenant Iceberg; omitted for
+   * parquet. `number` is the int-encoded code (`SEARCH_TYPE_INT`) for catalogs
+   * whose `search_type` partition column is INT — bound bare (unquoted) so the
+   * int partition prunes; `string` for the default string-encoded catalogs.
+   */
+  searchType?: string | number;
 }
 interface ResolvedSQL {
   sql: string;

package/dist/adapters/duckdb-node.mjs CHANGED Viewed

@@ -8,6 +8,7 @@ import { tmpdir } from "node:os";
 import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
 const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
 let singleton = null;
+let singletonOpts = null;
 function bundles() {
 	return {
 		mvp: {
@@ -29,11 +30,19 @@ async function initialize(opts) {
 		conn: db.connect()
 	};
 }
+function getSingleton(opts) {
+	if (!singleton) {
+		singleton = initialize(opts);
+		singletonOpts = opts;
+	}
+	return singleton;
+}
 function createNodeDuckDBHandle(opts = {}) {
-	if (!singleton) singleton = initialize(opts);
+	if (singleton && opts.verbose !== void 0 && opts.verbose !== (singletonOpts?.verbose ?? false)) console.warn(`[gscdump] createNodeDuckDBHandle: ignoring verbose=${opts.verbose} — a shared DuckDB instance was already initialized with verbose=${singletonOpts?.verbose ?? false}. Call resetNodeDuckDB() before re-initializing to change it.`);
+	getSingleton(opts);
 	return {
 		async query(sql, params) {
-			const { conn } = await singleton;
+			const { conn } = await getSingleton(opts);
 			if (!params || params.length === 0) return arrowToRows(conn.query(sql));
 			const stmt = conn.prepare(sql);
 			try {
@@ -43,15 +52,15 @@ function createNodeDuckDBHandle(opts = {}) {
 			}
 		},
 		async registerFileBuffer(name, bytes) {
-			const { db } = await singleton;
+			const { db } = await getSingleton(opts);
 			db.registerFileBuffer(name, bytes);
 		},
 		async copyFileToBuffer(name) {
-			const { db } = await singleton;
+			const { db } = await getSingleton(opts);
 			return db.copyFileToBuffer(name);
 		},
 		async dropFiles(names) {
-			const { db } = await singleton;
+			const { db } = await getSingleton(opts);
 			for (const name of names) {
 				try {
 					db.dropFile(name);
@@ -69,9 +78,12 @@ function createNodeDuckDBHandle(opts = {}) {
 function resetNodeDuckDB() {
 	const pending = singleton;
 	singleton = null;
+	singletonOpts = null;
 	pending?.then(({ db, conn }) => {
 		conn.close();
 		db.reset();
-	}).catch(() => {});
+	}).catch((err) => {
+		console.warn("[gscdump] resetNodeDuckDB: failed to release DuckDB instance", err);
+	});
 }
 export { createNodeDuckDBHandle, resetNodeDuckDB };

package/dist/adapters/hyparquet.d.mts CHANGED Viewed

@@ -24,10 +24,18 @@ interface DecodeParquetOptions {
    * per row group — pruning groups whose column statistics can't match and
    * materialising only matching rows — so a filtered decode of a large file
    * holds at most one row group plus the matches in memory, never the whole
-   * file. Use this whenever the caller needs a sub-slice of a big parquet
-   * (e.g. one feedpath out of a site-wide sitemap-urls index).
+   * file. Use when a caller needs a sub-slice of a big parquet keyed on a
+   * clustered column (a row group's min/max stats only prune if the predicate
+   * column is the physical sort key — see `sortKey`/`clusterKey`).
    */
   filter?: ParquetQueryFilter;
+  /**
+   * Project a subset of columns. hyparquet only fetches + decodes the named
+   * column chunks, so a read that needs 2 of 14 columns skips the other 12's
+   * pages entirely. Omit to read every column. Names not present in the file
+   * are ignored by the reader.
+   */
+  columns?: readonly string[];
 }
 declare function decodeParquetToRows(bytes: Uint8Array, opts?: DecodeParquetOptions): Promise<Row[]>;
 interface HyparquetCodecOptions {

package/dist/adapters/hyparquet.mjs CHANGED Viewed

@@ -1,14 +1,83 @@
 import { SCHEMAS, TABLE_METADATA, dedupeByNaturalKey } from "../_chunks/schema.mjs";
 import { parquetReadObjects } from "hyparquet";
-import { parquetWriteBuffer } from "hyparquet-writer";
+import { ByteWriter, parquetWriteRows } from "hyparquet-writer";
 const ROW_GROUP_SIZE = 25e3;
 function basicTypeFor(colType) {
-	if (colType === "VARCHAR" || colType === "DATE") return "STRING";
+	if (colType === "VARCHAR") return "STRING";
 	if (colType === "BIGINT") return "INT64";
 	if (colType === "INTEGER") return "INT32";
 	if (colType === "DOUBLE") return "DOUBLE";
+	if (colType === "DATE") return "INT32";
 	throw new Error(`unsupported column type for parquet encoding: ${colType}`);
 }
+const EPOCH_DAY_MS = 864e5;
+function toEpochDays(value) {
+	if (value === null || value === void 0) return null;
+	if (typeof value === "number") return value;
+	if (value instanceof Date) {
+		const ms = value.getTime();
+		if (Number.isNaN(ms)) throw new TypeError("encodeRowsToParquet: invalid Date for DATE column");
+		return Math.floor(ms / EPOCH_DAY_MS);
+	}
+	if (typeof value === "string") {
+		const ms = Date.parse(`${value}T00:00:00Z`);
+		if (Number.isNaN(ms)) throw new TypeError(`encodeRowsToParquet: invalid date string '${value}'`);
+		return Math.floor(ms / EPOCH_DAY_MS);
+	}
+	throw new TypeError(`encodeRowsToParquet: unsupported DATE value '${String(value)}'`);
+}
+function isoFromDate(d) {
+	return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
+}
+function buildWriteSchema(columns) {
+	const schema = [{
+		name: "root",
+		num_children: columns.length
+	}];
+	for (const col of columns) {
+		const repetition_type = col.nullable ? "OPTIONAL" : "REQUIRED";
+		switch (col.type) {
+			case "DATE":
+				schema.push({
+					name: col.name,
+					type: "INT32",
+					converted_type: "DATE",
+					repetition_type
+				});
+				break;
+			case "VARCHAR":
+				schema.push({
+					name: col.name,
+					type: "BYTE_ARRAY",
+					converted_type: "UTF8",
+					repetition_type
+				});
+				break;
+			case "INTEGER":
+				schema.push({
+					name: col.name,
+					type: "INT32",
+					repetition_type
+				});
+				break;
+			case "BIGINT":
+				schema.push({
+					name: col.name,
+					type: "INT64",
+					repetition_type
+				});
+				break;
+			case "DOUBLE":
+				schema.push({
+					name: col.name,
+					type: "DOUBLE",
+					repetition_type
+				});
+				break;
+		}
+	}
+	return schema;
+}
 function coerceValue(value, type) {
 	if (value === null || value === void 0) return null;
 	if (type === "STRING") return typeof value === "string" ? value : String(value);
@@ -50,65 +119,86 @@ function sortRowsByClusterKey(table, rows) {
 	});
 	return copy;
 }
+function encodeOrderedRows(rows, columns, rowGroupSize) {
+	const schema = buildWriteSchema(columns);
+	const isDate = columns.map((col) => col.type === "DATE");
+	const types = columns.map((col) => basicTypeFor(col.type));
+	const columnSpecs = columns.map((col) => ({
+		name: col.name,
+		nullable: col.nullable,
+		columnIndex: true
+	}));
+	function* coercedRows() {
+		for (const r of rows) {
+			const out = {};
+			for (let c = 0; c < columns.length; c++) {
+				const name = columns[c].name;
+				out[name] = isDate[c] ? toEpochDays(r[name]) : coerceValue(r[name], types[c]);
+			}
+			yield out;
+		}
+	}
+	const writer = new ByteWriter();
+	parquetWriteRows({
+		writer,
+		rows: coercedRows(),
+		columns: columnSpecs,
+		schema,
+		rowGroupSize
+	});
+	return new Uint8Array(writer.getBuffer());
+}
 function encodeRowsToParquet(table, rows) {
 	const schema = SCHEMAS[table];
-	const sorted = sortRowsByClusterKey(table, rows);
-	const buffer = parquetWriteBuffer({
-		columnData: schema.columns.map((col) => {
-			const type = basicTypeFor(col.type);
-			const data = sorted.map((r) => coerceValue(r[col.name], type));
-			return {
-				name: col.name,
-				data,
-				type,
-				nullable: col.nullable,
-				columnIndex: true
-			};
-		}),
-		rowGroupSize: ROW_GROUP_SIZE
-	});
-	return new Uint8Array(buffer);
+	return encodeOrderedRows(sortRowsByClusterKey(table, rows), schema.columns, ROW_GROUP_SIZE);
 }
 function encodeRowsToParquetFlex(rows, opts) {
 	const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;
-	const sorted = sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
+	return encodeOrderedRows(sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
 		for (const col of sortKey) {
 			const cmp = compareValues(a[col], b[col]);
 			if (cmp !== 0) return cmp;
 		}
 		return 0;
-	});
-	const buffer = parquetWriteBuffer({
-		columnData: columns.map((col) => {
-			const type = basicTypeFor(col.type);
-			const data = sorted.map((r) => coerceValue(r[col.name], type));
-			return {
-				name: col.name,
-				data,
-				type,
-				nullable: col.nullable,
-				columnIndex: true
-			};
-		}),
-		rowGroupSize
-	});
-	return new Uint8Array(buffer);
+	}), columns, rowGroupSize);
 }
 function asyncBufferFromBytes(bytes) {
-	const ab = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
+	const base = bytes.byteOffset;
+	const buf = bytes.buffer;
 	return {
-		byteLength: ab.byteLength,
+		byteLength: bytes.byteLength,
 		slice(start, end) {
-			return ab.slice(start, end);
+			const from = base + start;
+			const to = end === void 0 ? base + bytes.byteLength : base + end;
+			return buf.slice(from, to);
 		}
 	};
 }
 async function decodeParquetToRows(bytes, opts = {}) {
 	if (bytes.byteLength === 0) return [];
-	return await parquetReadObjects({
+	return normalizeDecodedDates(await parquetReadObjects({
 		file: asyncBufferFromBytes(bytes),
-		...opts.filter ? { filter: opts.filter } : {}
-	});
+		...opts.columns ? { columns: [...opts.columns] } : {},
+		...opts.filter ? {
+			filter: opts.filter,
+			useBloomFilters: true
+		} : {}
+	}));
+}
+function normalizeDecodedDates(rows) {
+	if (rows.length === 0) return rows;
+	const dateCols = [];
+	const first = rows[0];
+	for (const k in first) if (first[k] instanceof Date) dateCols.push(k);
+	if (dateCols.length === 0) return rows;
+	for (const row of rows) {
+		const r = row;
+		for (const k of dateCols) {
+			const v = r[k];
+			if (v instanceof Date) r[k] = isoFromDate(v);
+		}
+	}
+	return rows;
 }
 function createHyparquetCodec(options = {}) {
 	return {
@@ -135,7 +225,7 @@ function createHyparquetCodec(options = {}) {
 			const allRows = [];
 			for (const key of inputKeys) {
 				const rows = await decodeParquetToRows(await dataSource.read(key));
-				allRows.push(...rows);
+				for (let i = 0; i < rows.length; i++) allRows.push(rows[i]);
 			}
 			const rows = dedupeByNaturalKey(ctx.table, allRows);
 			const bytes = encodeRowsToParquet(ctx.table, rows);

package/dist/adapters/node.mjs CHANGED Viewed

@@ -74,7 +74,7 @@ function snapshotAlias(fileName) {
 	if (!m?.[1]) throw new TypeError(`snapshotAlias: unrecognised filename ${JSON.stringify(fileName)}`);
 	return `cold_${m[1].replace("-", "_")}`;
 }
-const SNAPSHOT_TYPE_ERROR_KINDS = new Set([
+const SNAPSHOT_TYPE_ERROR_KINDS = /* @__PURE__ */ new Set([
 	"invalid-snapshot-filename",
 	"unsupported-snapshot-index-version",
 	"invalid-schema-identifier",

package/dist/errors.mjs CHANGED Viewed

@@ -168,7 +168,7 @@ const engineErrors = {
 		};
 	}
 };
-const ENGINE_ERROR_KINDS = new Set([
+const ENGINE_ERROR_KINDS = /* @__PURE__ */ new Set([
 	"analyzer-not-found",
 	"analyzer-capability-missing",
 	"invalid-sql-literal",

package/dist/iceberg/index.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
+import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
 import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
 type IcebergAppendSink = Sink;
 /**
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
  * with no rows never touches the network.
  */
 declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
-export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
+export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };

package/dist/iceberg/index.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 import { engineErrors } from "../errors.mjs";
-import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
+import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
 import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
 import { err, ok } from "gscdump/result";
 async function cacheGet(cache, key, now) {
@@ -35,21 +35,25 @@ function decodeInt(bytes) {
 	if (u == null) return null;
 	return new DataView(u.buffer, u.byteOffset, u.byteLength).getInt32(0, true);
 }
-function buildPartitionFilter(siteId, searchType, wantedMonths) {
+function buildPartitionFilter(siteId, searchType, wantedMonths, encoding = "string") {
 	return (partitions) => {
 		const parts = partitions;
 		if (!parts || parts.length === 0) return true;
-		const siteSummary = parts[SITE_ID_FIELD_INDEX];
-		if (siteSummary && (siteSummary.lower_bound != null || siteSummary.upper_bound != null)) {
-			const lo = decodeString(siteSummary.lower_bound);
-			const hi = decodeString(siteSummary.upper_bound);
-			if (lo != null && hi != null && (siteId < lo || siteId > hi)) return false;
-		}
-		const searchTypeSummary = parts[SEARCH_TYPE_FIELD_INDEX];
-		if (searchTypeSummary && (searchTypeSummary.lower_bound != null || searchTypeSummary.upper_bound != null)) {
-			const lo = decodeString(searchTypeSummary.lower_bound);
-			const hi = decodeString(searchTypeSummary.upper_bound);
-			if (lo != null && hi != null && (searchType < lo || searchType > hi)) return false;
+		if (encoding === "string") {
+			const siteStr = String(siteId);
+			const searchStr = String(searchType);
+			const siteSummary = parts[SITE_ID_FIELD_INDEX];
+			if (siteSummary && (siteSummary.lower_bound != null || siteSummary.upper_bound != null)) {
+				const lo = decodeString(siteSummary.lower_bound);
+				const hi = decodeString(siteSummary.upper_bound);
+				if (lo != null && hi != null && (siteStr < lo || siteStr > hi)) return false;
+			}
+			const searchTypeSummary = parts[SEARCH_TYPE_FIELD_INDEX];
+			if (searchTypeSummary && (searchTypeSummary.lower_bound != null || searchTypeSummary.upper_bound != null)) {
+				const lo = decodeString(searchTypeSummary.lower_bound);
+				const hi = decodeString(searchTypeSummary.upper_bound);
+				if (lo != null && hi != null && (searchStr < lo || searchStr > hi)) return false;
+			}
 		}
 		const monthSummary = parts[DATE_MONTH_FIELD_INDEX];
 		if (monthSummary && (monthSummary.lower_bound != null || monthSummary.upper_bound != null)) {
@@ -74,11 +78,11 @@ const ICEBERG_TYPE_MAP = {
 	DOUBLE: "double",
 	DATE: "date"
 };
-function icebergSchemaFor(table) {
+function icebergSchemaFor(table, encoding = "string") {
 	return {
 		"type": "struct",
 		"schema-id": 0,
-		"fields": ICEBERG_SCHEMAS[table].columns.map((col) => ({
+		"fields": icebergSchemasFor(encoding)[table].columns.map((col) => ({
 			id: col.fieldId,
 			name: col.name,
 			required: col.required,
@@ -86,8 +90,8 @@ function icebergSchemaFor(table) {
 		}))
 	};
 }
-function icebergPartitionSpecFor(table) {
-	const fields = ICEBERG_SCHEMAS[table].columns;
+function icebergPartitionSpecFor(table, encoding = "string") {
+	const fields = icebergSchemasFor(encoding)[table].columns;
 	const fieldId = (name) => {
 		const col = fields.find((c) => c.name === name);
 		if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
@@ -176,14 +180,14 @@ async function icebergAppendRetrying(args, options = {}) {
 async function ensureIcebergNamespace(conn) {
 	await restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace }).catch(() => {});
 }
-async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
+async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "string") {
 	const results = [];
 	for (const table of tables) await icebergCreateTable({
 		catalog: conn.catalog,
 		namespace: conn.namespace,
 		table,
-		schema: icebergSchemaFor(table),
-		partitionSpec: icebergPartitionSpecFor(table)
+		schema: icebergSchemaFor(table, encoding),
+		partitionSpec: icebergPartitionSpecFor(table, encoding)
 	}).then(() => results.push({
 		table,
 		outcome: ok(void 0)
@@ -275,20 +279,22 @@ async function listIcebergDataFiles(conn, opts) {
 		if (snapshotId == null || !metadata) return [];
 	}
 	const endWalk = profiler?.start("iceberg.walk");
-	const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths);
+	const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths, opts.encoding ?? "string");
 	const manifests = await icebergManifests({
 		metadata,
 		resolver: conn.resolver,
 		partitionFilter
 	});
+	const wantSite = String(opts.siteId);
+	const wantSearch = String(opts.searchType);
 	const out = [];
 	for (const m of manifests) for (const entry of m.entries) {
 		if (entry.status === 2) continue;
 		const df = entry.data_file;
 		if (df.content !== 0) continue;
 		const part = df.partition;
-		if (part.site_id !== opts.siteId) continue;
-		if (part.search_type !== opts.searchType) continue;
+		if (String(part.site_id) !== wantSite) continue;
+		if (String(part.search_type) !== wantSearch) continue;
 		const month = part.date_month;
 		if (typeof month !== "number" || !wantedMonths.has(month)) continue;
 		out.push({
@@ -326,6 +332,8 @@ async function dropIcebergTables(conn, tables) {
 	return results;
 }
 const DAY_MILLIS = 864e5;
+const INT32_MIN = -2147483648;
+const INT32_MAX = 2147483647;
 function toIcebergDate(value) {
 	if (typeof value === "string") {
 		const ms = Date.parse(`${value}T00:00:00Z`);
@@ -343,6 +351,14 @@ function coerceJsonSafe(value) {
 	if (typeof value === "bigint") return Number(value);
 	return value;
 }
+function toIntPartitionSiteId(value) {
+	if (value == null || typeof value === "string" && value.trim() === "") throw new TypeError("toRecords: slice.ctx.siteId is required for int partition encoding");
+	if (typeof value !== "string" && typeof value !== "number" && typeof value !== "bigint") throw new TypeError(`toRecords: int partition site_id must be a safe integer, got '${String(value)}'`);
+	const siteId = Number(value);
+	if (!Number.isSafeInteger(siteId)) throw new TypeError(`toRecords: int partition site_id must be a safe integer, got '${String(value)}'`);
+	if (siteId < INT32_MIN || siteId > INT32_MAX) throw new TypeError(`toRecords: int partition site_id must fit Iceberg INT, got '${String(value)}'`);
+	return siteId;
+}
 function dedupeByIdentity(table, records) {
 	if (records.length < 2) return records;
 	const key = ICEBERG_SCHEMAS[table].identityColumns;
@@ -353,19 +369,21 @@ function dedupeByIdentity(table, records) {
 	}
 	return seen.size === records.length ? records : [...seen.values()];
 }
-function toRecords(slice, rows) {
-	const siteId = slice.ctx.siteId ?? "";
+function toRecords(slice, rows, encoding) {
+	const siteVal = encoding === "int" ? toIntPartitionSiteId(slice.ctx.siteId) : slice.ctx.siteId ?? "";
+	const searchVal = encoding === "int" ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
 	return rows.map((row) => {
 		const out = {};
 		for (const k in row) out[k] = coerceJsonSafe(row[k]);
 		out.date = toIcebergDate(out.date);
-		out.site_id = siteId;
-		out.search_type = slice.searchType;
+		out.site_id = siteVal;
+		out.search_type = searchVal;
 		return out;
 	});
 }
 function createIcebergAppendSink(options) {
 	let connection;
+	const encoding = options.encoding ?? "string";
 	const buffers = /* @__PURE__ */ new Map();
 	function connect() {
 		connection ??= connectIcebergCatalog(options.catalog);
@@ -375,7 +393,7 @@ function createIcebergAppendSink(options) {
 		capabilities: { appendOnly: true },
 		async emit(slice, rows) {
 			if (rows.length === 0) return { rowCount: 0 };
-			const records = toRecords(slice, rows);
+			const records = toRecords(slice, rows, encoding);
 			const buffer = buffers.get(slice.table);
 			if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
 			else buffers.set(slice.table, records);
@@ -429,4 +447,4 @@ function createIcebergAppendSink(options) {
 		}
 	};
 }
-export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
+export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };

package/dist/ingest.mjs CHANGED Viewed

@@ -26,7 +26,7 @@ function toPath(gscUrl) {
 	}
 }
 function toSumPosition(apiPosition, impressions) {
-	return (apiPosition - 1) * Math.max(impressions, 1);
+	return ((apiPosition >= 1 ? apiPosition : 1) - 1) * Math.max(impressions, 1);
 }
 function transformGscRow(table, apiRow, options = {}) {
 	const keys = apiRow.keys;
@@ -76,8 +76,10 @@ function transformGscRow(table, apiRow, options = {}) {
 		};
 	}
 	if (table === "hourly_pages") {
-		const hour = String(keys[0] ?? "");
-		const date = hour.slice(0, 10);
+		const hourStamp = String(keys[0] ?? "");
+		const date = hourStamp.slice(0, 10);
+		const hour = Number.parseInt(hourStamp.slice(11, 13), 10);
+		if (!Number.isInteger(hour) || hour < 0 || hour > 23) throw new Error(`hourly_pages: cannot derive hour-of-day from '${hourStamp}'`);
 		return {
 			date,
 			row: {

package/dist/rollups.mjs CHANGED Viewed

@@ -560,7 +560,7 @@ const indexingMetadataRollup = {
 				if (!latestRemove || r.latestRemoveAt > latestRemove) latestRemove = r.latestRemoveAt;
 			}
 		}
-		const days = new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
+		const days = /* @__PURE__ */ new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
 		const perDay = Array.from(days).sort().map((day) => ({
 			day,
 			updates: updatesByDay.get(day) ?? 0,

package/dist/schema.d.mts CHANGED Viewed

@@ -1,2 +1,2 @@
-import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
-export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
+import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
+export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };

package/dist/schema.mjs CHANGED Viewed

@@ -1,2 +1,2 @@
-import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
-export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
+import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
+export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };

package/dist/sql-fragments.d.mts CHANGED Viewed

@@ -18,4 +18,27 @@ declare const METRIC_EXPR: Record<Metric, string>;
  * on the resolved column expression so drizzle can pass a column ref.
  */
 declare function topLevelPagePredicateSql(pathExpr: string): string;
-export { METRIC_EXPR, escapeLike, topLevelPagePredicateSql };
+/**
+ * How a canonicalized date column is emitted by {@link dateReplaceClause}:
+ * - `'date'` keeps a real `DATE` value (`CAST(col AS DATE)`). Right for views
+ *   and `.duckdb` exports the app re-queries, where the column type matters.
+ * - `'string'` emits an ISO `YYYY-MM-DD` string (`strftime(CAST(col AS DATE)…)`).
+ *   Right for row materialisation to JSON/CSV/NDJSON, where a `DATE` would
+ *   serialize as an opaque object / epoch.
+ */
+type DateCanonicalForm = 'date' | 'string';
+/**
+ * Build a `read_parquet` `REPLACE (…)` clause that canonicalizes legacy `date`
+ * columns. `date` lands as VARCHAR in older parquets (BYTE_ARRAY/UTF8, written
+ * before the schema enforced DATE); DuckDB infers the column type from the file,
+ * so without this every read path would expose VARCHAR despite SCHEMAS declaring
+ * DATE. The `CAST(col AS DATE)` is a no-op for already-DATE columns and
+ * vectorized parsing for VARCHAR ones, so output stays canonical either way.
+ *
+ * Pure: the caller passes the table's DATE column names (derived from `SCHEMAS`)
+ * so this fragment carries no schema/drizzle dependency. Returns `''` when the
+ * table has no DATE columns, so callers can interpolate it unconditionally:
+ *   `SELECT * ${dateReplaceClause(cols)} FROM read_parquet(…)`.
+ */
+declare function dateReplaceClause(dateColumns: readonly string[], form?: DateCanonicalForm): string;
+export { DateCanonicalForm, METRIC_EXPR, dateReplaceClause, escapeLike, topLevelPagePredicateSql };

package/dist/sql-fragments.mjs CHANGED Viewed

@@ -10,4 +10,9 @@ const METRIC_EXPR = {
 function topLevelPagePredicateSql(pathExpr) {
 	return `LENGTH(${pathExpr}) - LENGTH(REPLACE(${pathExpr}, '/', '')) <= 1`;
 }
-export { METRIC_EXPR, escapeLike, topLevelPagePredicateSql };
+function dateReplaceClause(dateColumns, form = "string") {
+	if (dateColumns.length === 0) return "";
+	const cast = (n) => form === "date" ? `CAST(${n} AS DATE) AS ${n}` : `strftime(CAST(${n} AS DATE), '%Y-%m-%d') AS ${n}`;
+	return `REPLACE (${dateColumns.map(cast).join(", ")})`;
+}
+export { METRIC_EXPR, dateReplaceClause, escapeLike, topLevelPagePredicateSql };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@gscdump/engine",
   "type": "module",
-  "version": "0.28.2",
+  "version": "0.29.0",
   "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
   "author": {
     "name": "Harlan Wilton",
@@ -172,8 +172,8 @@
   },
   "peerDependencies": {
     "@duckdb/duckdb-wasm": "^1.32.0",
-    "hyparquet": "^1.26.0",
-    "hyparquet-writer": "^0.15.6"
+    "hyparquet": "^1.26.1",
+    "hyparquet-writer": "^0.16.1"
   },
   "peerDependenciesMeta": {
     "@duckdb/duckdb-wasm": {
@@ -188,11 +188,11 @@
   },
   "dependencies": {
     "drizzle-orm": "1.0.0-rc.3",
-    "hyparquet": "^1.26.0",
-    "hyparquet-writer": "^0.15.6",
+    "hyparquet": "^1.26.1",
+    "hyparquet-writer": "^0.16.1",
     "proper-lockfile": "^4.1.2",
-    "@gscdump/contracts": "0.28.2",
-    "gscdump": "0.28.2"
+    "@gscdump/contracts": "0.29.0",
+    "gscdump": "0.29.0"
   },
   "devDependencies": {
     "@duckdb/duckdb-wasm": "^1.32.0",