npm - @gscdump/engine - Versions diffs - 0.20.2 → 0.21.0 - Mend

@gscdump/engine 0.20.2 → 0.21.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (23)

package/dist/_chunks/engine.mjs +1 -1
package/dist/_chunks/iceberg-schema.mjs +67 -0
package/dist/_chunks/registry.d.mts +1 -1
package/dist/_chunks/resolver.mjs +15 -21
package/dist/_chunks/schema.d.mts +452 -133
package/dist/_chunks/schema.mjs +50 -24
package/dist/_chunks/sink.d.mts +329 -0
package/dist/_chunks/storage.d.mts +4 -4
package/dist/adapters/duckdb-node.mjs +2 -2
package/dist/adapters/hyparquet.mjs +5 -5
package/dist/index.d.mts +39 -7
package/dist/index.mjs +272 -14
package/dist/ingest.d.mts +23 -3
package/dist/ingest.mjs +43 -18
package/dist/rollups.d.mts +16 -6
package/dist/rollups.mjs +42 -35
package/dist/schema.d.mts +2 -2
package/dist/schema.mjs +2 -2
package/dist/sink-node.d.mts +31 -0
package/dist/sink-node.mjs +76 -0
package/dist/vendor/hysnappy-purejs.d.mts +29 -0
package/dist/vendor/hysnappy-purejs.mjs +13 -0
package/package.json +14 -3

package/dist/rollups.mjs CHANGED Viewed

@@ -41,16 +41,18 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
 }
 async function rebuildRollups(opts) {
 	const now = opts.now ?? (() => Date.now());
+	const dataEndMs = opts.dataEndDate !== void 0 ? isoDateToUtcMs(opts.dataEndDate) : null;
 	const results = [];
 	for (const def of opts.defs) {
 		const builtAt = now();
+		const windowAnchorMs = dataEndMs ?? builtAt;
 		const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
 		try {
 			const payload = await def.build({
 				engine: opts.engine,
 				ctx: opts.ctx,
 				dataSource: opts.dataSource,
-				builtAt,
+				windowAnchorMs,
 				...defSearchType !== void 0 ? { searchType: defSearchType } : {}
 			});
 			if (def.format === "parquet") {
@@ -115,6 +117,11 @@ async function rebuildRollups(opts) {
 	}
 	return results;
 }
+function isoDateToUtcMs(iso) {
+	const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(iso);
+	if (!m) throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
+	return Date.UTC(Number(m[1]), Number(m[2]) - 1, Number(m[3]));
+}
 function utcDateMinusDays(at, days) {
 	const d = new Date(at - days * MS_PER_DAY);
 	return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
@@ -262,10 +269,10 @@ const dailyTotalsRollup = {
         ORDER BY date
       `
 		});
-		const keywordRows = await runWindowed({
+		const queryRows = await runWindowed({
 			engine,
 			ctx,
-			table: "keywords",
+			table: "queries",
 			...searchType !== void 0 ? { searchType } : {},
 			sqlFor: (w) => `
         SELECT
@@ -290,14 +297,14 @@ const dailyTotalsRollup = {
 			cur.sum_position += Number(r.sum_position);
 			pagesByDate.set(date, cur);
 		}
-		const keywordImpressionsByDate = /* @__PURE__ */ new Map();
-		for (const r of keywordRows) {
+		const queryImpressionsByDate = /* @__PURE__ */ new Map();
+		for (const r of queryRows) {
 			const date = String(r.date);
-			keywordImpressionsByDate.set(date, (keywordImpressionsByDate.get(date) ?? BigInt(0)) + BigInt(r.impressions));
+			queryImpressionsByDate.set(date, (queryImpressionsByDate.get(date) ?? BigInt(0)) + BigInt(r.impressions));
 		}
 		return Array.from(pagesByDate.values()).sort((a, b) => a.date < b.date ? -1 : 1).map((r) => {
 			const totalImpressions = BigInt(r.impressions);
-			const queryImpressions = keywordImpressionsByDate.get(String(r.date)) ?? BigInt(0);
+			const queryImpressions = queryImpressionsByDate.get(String(r.date)) ?? BigInt(0);
 			const anonymized = totalImpressions === BigInt(0) ? 0 : 1 - Number(queryImpressions) / Number(totalImpressions);
 			return {
 				date: r.date,
@@ -350,13 +357,13 @@ const weeklyTotalsRollup = {
 const topPages28dRollup = {
 	id: "top_pages_28d",
 	windowDays: 28,
-	async build({ engine, ctx, builtAt, searchType }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
+	async build({ engine, ctx, windowAnchorMs, searchType }) {
+		const cutoff = utcDateMinusDays(windowAnchorMs, 28);
 		const partitions = partitionsInRange(await engine.listPartitions({
 			ctx,
 			table: "pages",
 			...searchType !== void 0 ? { searchType } : {}
-		}), cutoff, utcDateMinusDays(builtAt, 0));
+		}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
 		if (partitions.length === 0) return [];
 		return (await engine.runSQL({
 			ctx,
@@ -389,13 +396,13 @@ const topPages28dRollup = {
 const topCountries28dRollup = {
 	id: "top_countries_28d",
 	windowDays: 28,
-	async build({ engine, ctx, builtAt, searchType }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
+	async build({ engine, ctx, windowAnchorMs, searchType }) {
+		const cutoff = utcDateMinusDays(windowAnchorMs, 28);
 		const partitions = partitionsInRange(await engine.listPartitions({
 			ctx,
 			table: "countries",
 			...searchType !== void 0 ? { searchType } : {}
-		}), cutoff, utcDateMinusDays(builtAt, 0));
+		}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
 		if (partitions.length === 0) return [];
 		return (await engine.runSQL({
 			ctx,
@@ -428,19 +435,19 @@ const topCountries28dRollup = {
 const topKeywords28dRollup = {
 	id: "top_keywords_28d",
 	windowDays: 28,
-	async build({ engine, ctx, builtAt, searchType }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
+	async build({ engine, ctx, windowAnchorMs, searchType }) {
+		const cutoff = utcDateMinusDays(windowAnchorMs, 28);
 		const partitions = partitionsInRange(await engine.listPartitions({
 			ctx,
-			table: "keywords",
+			table: "queries",
 			...searchType !== void 0 ? { searchType } : {}
-		}), cutoff, utcDateMinusDays(builtAt, 0));
+		}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
 		if (partitions.length === 0) return [];
 		return (await engine.runSQL({
 			ctx,
-			table: "keywords",
+			table: "queries",
 			fileSets: { FILES: {
-				table: "keywords",
+				table: "queries",
 				partitions
 			} },
 			...searchType !== void 0 ? { searchType } : {},
@@ -491,19 +498,19 @@ const topKeywords28dParquetRollup = {
 		}
 	],
 	parquetSortKey: ["clicks"],
-	async build({ engine, ctx, builtAt, searchType }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
+	async build({ engine, ctx, windowAnchorMs, searchType }) {
+		const cutoff = utcDateMinusDays(windowAnchorMs, 28);
 		const partitions = partitionsInRange(await engine.listPartitions({
 			ctx,
-			table: "keywords",
+			table: "queries",
 			...searchType !== void 0 ? { searchType } : {}
-		}), cutoff, utcDateMinusDays(builtAt, 0));
+		}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
 		if (partitions.length === 0) return [];
 		return (await engine.runSQL({
 			ctx,
-			table: "keywords",
+			table: "queries",
 			fileSets: { FILES: {
-				table: "keywords",
+				table: "queries",
 				partitions
 			} },
 			...searchType !== void 0 ? { searchType } : {},
@@ -575,7 +582,7 @@ const indexingHealthRollup = {
 	id: "indexing_health",
 	windowDays: 90,
 	sliceOrthogonal: true,
-	async build({ engine, ctx, dataSource, builtAt }) {
+	async build({ engine, ctx, dataSource, windowAnchorMs }) {
 		const key = inspectionParquetKey(ctx);
 		if (!await dataSource.head?.(key)) return { days: [] };
 		const sql = `
@@ -590,7 +597,7 @@ const indexingHealthRollup = {
         SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
         SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
       FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
-      WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
+      WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(windowAnchorMs, 90)}'
       GROUP BY 1
       ORDER BY 1
     `;
@@ -619,19 +626,19 @@ const indexPercentRollup = {
 	id: "index_percent",
 	windowDays: 90,
 	sliceOrthogonal: true,
-	async build({ engine, ctx, dataSource, builtAt, searchType }) {
+	async build({ engine, ctx, dataSource, windowAnchorMs, searchType }) {
 		const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
 		if (urlsKeys.length === 0) return {
 			totalSitemapUrls: 0,
 			days: []
 		};
-		const cutoff = utcDateMinusDays(builtAt, 90);
+		const cutoff = utcDateMinusDays(windowAnchorMs, 90);
 		const factSearchType = searchType ?? "web";
 		const pagesPartitions = partitionsInRange(await engine.listPartitions({
 			ctx,
 			table: "pages",
 			searchType: factSearchType
-		}), cutoff, utcDateMinusDays(builtAt, 0));
+		}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
 		const numerator = await engine.runSQL({
 			ctx,
 			table: "pages",
@@ -690,10 +697,10 @@ const sitemapHealthRollup = {
 	id: "sitemap_health",
 	windowDays: 90,
 	sliceOrthogonal: true,
-	async build({ dataSource, ctx, builtAt }) {
+	async build({ dataSource, ctx, windowAnchorMs }) {
 		const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
 		const records = Object.values(index.records);
-		const cutoff = utcDateMinusDays(builtAt, 90);
+		const cutoff = utcDateMinusDays(windowAnchorMs, 90);
 		const byDay = /* @__PURE__ */ new Map();
 		const feeds = [];
 		for (const r of records) {
@@ -734,10 +741,10 @@ const sitemapChanges28dRollup = {
 	id: "sitemap_changes_28d",
 	windowDays: 28,
 	sliceOrthogonal: true,
-	async build({ dataSource, ctx, builtAt }) {
+	async build({ dataSource, ctx, windowAnchorMs }) {
 		const store = createSitemapStore({ dataSource });
-		const from = utcDateMinusDays(builtAt, 28);
-		const to = utcDateMinusDays(builtAt, 0);
+		const from = utcDateMinusDays(windowAnchorMs, 28);
+		const to = utcDateMinusDays(windowAnchorMs, 0);
 		const counts = /* @__PURE__ */ new Map();
 		const addedTop = [];
 		const removedTop = [];

package/dist/schema.d.mts CHANGED Viewed

@@ -1,2 +1,2 @@
-import { _ as hourly_pages, a as allTables, b as pages, c as dimensionToColumn, d as schemaFor, f as DrizzleSchema, g as drizzleSchema, h as devices, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, s as dedupeByNaturalKey, t as ColumnDef, u as naturalKeyColumns, v as keywords, x as search_appearance, y as page_keywords } from "./_chunks/schema.mjs";
-export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dedupeByNaturalKey, devices, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, keywords, naturalKeyColumns, page_keywords, pages, schemaFor, search_appearance };
+import { _ as hourly_pages, a as allTables, b as queries, c as dimensionToColumn, d as schemaFor, f as DrizzleSchema, g as drizzleSchema, h as dates, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, s as dedupeByNaturalKey, t as ColumnDef, u as naturalKeyColumns, v as page_queries, x as search_appearance, y as pages } from "./_chunks/schema.mjs";
+export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance };

package/dist/schema.mjs CHANGED Viewed

@@ -1,2 +1,2 @@
-import { _ as search_appearance, a as dimensionToColumn, c as schemaFor, d as devices, f as drizzleSchema, g as pages, h as page_keywords, i as dedupeByNaturalKey, l as TABLE_METADATA, m as keywords, n as allTables, o as inferTable, p as hourly_pages, r as currentSchemaVersion, s as naturalKeyColumns, t as SCHEMAS, u as countries } from "./_chunks/schema.mjs";
-export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dedupeByNaturalKey, devices, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, keywords, naturalKeyColumns, page_keywords, pages, schemaFor, search_appearance };
+import { _ as search_appearance, a as dimensionToColumn, c as schemaFor, d as dates, f as drizzleSchema, g as queries, h as pages, i as dedupeByNaturalKey, l as TABLE_METADATA, m as page_queries, n as allTables, o as inferTable, p as hourly_pages, r as currentSchemaVersion, s as naturalKeyColumns, t as SCHEMAS, u as countries } from "./_chunks/schema.mjs";
+export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance };

package/dist/sink-node.d.mts ADDED Viewed

@@ -0,0 +1,31 @@
+import { n as LocalIcebergSinkOptions, r as Sink } from "./_chunks/sink.mjs";
+/** S3-compatible credentials for the warehouse (POC: MinIO). */
+interface LocalIcebergS3Config {
+  /** S3 endpoint host (POC MinIO: `localhost:9100`). */
+  endpoint: string;
+  accessKeyId: string;
+  secretAccessKey: string;
+  region?: string;
+}
+/** Full `LocalIcebergSink` options — extends the frozen contract options. */
+interface LocalIcebergSinkFullOptions extends LocalIcebergSinkOptions {
+  /** S3 credentials for the warehouse. Defaults to the POC MinIO creds. */
+  s3?: LocalIcebergS3Config;
+  /** Python interpreter. Defaults to `$GSCDUMP_ICEBERG_PYTHON` then `python3`. */
+  python?: string;
+  /** Override the writer-script path. Defaults to `scripts/iceberg-writer.py`. */
+  writerScript?: string;
+}
+interface LocalIcebergSink extends Sink {
+  /** The catalog namespace the 5 tables live under. */
+  readonly namespace: string;
+}
+/**
+ * Create a `LocalIcebergSink` pointed at a local Iceberg REST catalog.
+ *
+ * Requires the POC docker stack (`poc/iceberg/docker-compose.iceberg.yml`)
+ * running and a Python env with `pyiceberg` + `pyarrow` available. Tests that
+ * use this sink must skip when the stack is unreachable.
+ */
+declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
+export { type LocalIcebergS3Config, type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };

package/dist/sink-node.mjs ADDED Viewed

@@ -0,0 +1,76 @@
+import { i as ICEBERG_SCHEMAS } from "./_chunks/iceberg-schema.mjs";
+import { execFile } from "node:child_process";
+import { dirname, join } from "node:path";
+import process from "node:process";
+import { fileURLToPath } from "node:url";
+const POC_S3 = {
+	endpoint: "localhost:9100",
+	accessKeyId: "poc",
+	secretAccessKey: "pocpocpoc",
+	region: "us-east-1"
+};
+function resolveWriterScript(override) {
+	if (override) return override;
+	return join(dirname(fileURLToPath(import.meta.url)), "..", "..", "scripts", "iceberg-writer.py");
+}
+function runWriter(python, script, job) {
+	return new Promise((resolve, reject) => {
+		execFile(python, [script], { maxBuffer: 64 * 1024 * 1024 }, (err, stdout, stderr) => {
+			let parsed;
+			if (stdout.trim()) try {
+				parsed = JSON.parse(stdout);
+			} catch {}
+			if (parsed?.error) {
+				reject(/* @__PURE__ */ new Error(`LocalIcebergSink writer failed: ${parsed.error}`));
+				return;
+			}
+			if (err) {
+				reject(/* @__PURE__ */ new Error(`LocalIcebergSink writer process failed (${err.message})${stderr ? `: ${stderr}` : ""}`));
+				return;
+			}
+			if (!parsed) {
+				reject(/* @__PURE__ */ new Error(`LocalIcebergSink writer produced no parseable output: ${stdout || stderr}`));
+				return;
+			}
+			resolve(parsed);
+		}).stdin?.end(JSON.stringify(job));
+	});
+}
+function createLocalIcebergSink(options) {
+	const s3 = options.s3 ?? POC_S3;
+	const python = options.python ?? process.env.GSCDUMP_ICEBERG_PYTHON ?? "python3";
+	const script = resolveWriterScript(options.writerScript);
+	function buildJob(op, slice, rows) {
+		return {
+			op,
+			catalogUri: options.catalogUri,
+			namespace: options.namespace,
+			warehouse: options.warehouse,
+			s3,
+			table: slice.table,
+			spec: ICEBERG_SCHEMAS[slice.table],
+			siteId: slice.ctx.siteId ?? "",
+			searchType: slice.searchType,
+			date: slice.date,
+			rows
+		};
+	}
+	const touched = /* @__PURE__ */ new Set();
+	return {
+		namespace: options.namespace,
+		capabilities: { appendOnly: true },
+		async emit(slice, rows) {
+			if (rows.length === 0) return { rowCount: 0 };
+			const res = await runWriter(python, script, buildJob("emit", slice, rows));
+			touched.add(slice.table);
+			return { rowCount: res.rowCount ?? 0 };
+		},
+		async close() {
+			return {
+				flushed: [...touched],
+				failed: []
+			};
+		}
+	};
+}
+export { createLocalIcebergSink };

package/dist/vendor/hysnappy-purejs.d.mts ADDED Viewed

@@ -0,0 +1,29 @@
+/**
+ * Pure-JS drop-in replacement for `hysnappy`.
+ *
+ * `hysnappy`'s `snappyUncompressor()` eagerly compiles a WASM module
+ * (`new WebAssembly.Module(byteArray)`) — and `hyparquet-compressors`
+ * instantiates it at module top level (`SNAPPY: snappyUncompressor()`).
+ * Cloudflare's `workerd` forbids compiling WebAssembly from a runtime buffer
+ * (WASM must be a bundled module import), so any Worker bundle that imports
+ * `icebird` (→ `hyparquet-compressors`) fails to start with
+ * `CompileError: Wasm code generation disallowed by embedder`.
+ *
+ * `icebird`'s append path never actually decompresses a snappy data file — it
+ * only reads gzipped `metadata.json` and Avro manifests — so the snappy
+ * decompressor is instantiated but never invoked. This shim swaps the WASM
+ * codec for `hyparquet`'s pure-JS snappy decompressor (a vendored snappyjs),
+ * keeping the exact `hysnappy` API surface so `hyparquet-compressors` is
+ * unaware of the swap. Wired in via a build-time `hysnappy` alias.
+ *
+ * See `docs/plans/2026-05-22-icebird-ingest-writer-spike.md` (section e).
+ */
+/**
+ * Pure-JS stand-in for `hysnappy`'s `snappyUncompressor()`. Returns the
+ * decompressor immediately — no WASM compilation, so it is safe to call at
+ * module top level inside `workerd`.
+ */
+declare function snappyUncompressor(): (input: Uint8Array, outputLength: number) => Uint8Array;
+/** Pure-JS stand-in for `hysnappy`'s `snappyUncompress(input, outputLength)`. */
+declare function snappyUncompress(input: Uint8Array, outputLength: number): Uint8Array;
+export { snappyUncompress, snappyUncompressor };

package/dist/vendor/hysnappy-purejs.mjs ADDED Viewed

@@ -0,0 +1,13 @@
+import { snappyUncompress as snappyUncompress$1 } from "hyparquet/src/snappy.js";
+function decode(input, outputLength) {
+	const output = new Uint8Array(outputLength);
+	snappyUncompress$1(input, output);
+	return output;
+}
+function snappyUncompressor() {
+	return decode;
+}
+function snappyUncompress(input, outputLength) {
+	return decode(input, outputLength);
+}
+export { snappyUncompress, snappyUncompressor };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@gscdump/engine",
   "type": "module",
-  "version": "0.20.2",
+  "version": "0.21.0",
   "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
   "author": {
     "name": "Harlan Wilton",
@@ -51,6 +51,11 @@
       "import": "./dist/ingest.mjs",
       "default": "./dist/ingest.mjs"
     },
+    "./sink-node": {
+      "types": "./dist/sink-node.d.mts",
+      "import": "./dist/sink-node.mjs",
+      "default": "./dist/sink-node.mjs"
+    },
     "./sql": {
       "types": "./dist/sql-bind.d.mts",
       "import": "./dist/sql-bind.mjs",
@@ -140,6 +145,11 @@
       "types": "./dist/arrow-utils.d.mts",
       "import": "./dist/arrow-utils.mjs",
       "default": "./dist/arrow-utils.mjs"
+    },
+    "./vendor/hysnappy": {
+      "types": "./dist/vendor/hysnappy-purejs.d.mts",
+      "import": "./dist/vendor/hysnappy-purejs.mjs",
+      "default": "./dist/vendor/hysnappy-purejs.mjs"
     }
   },
   "main": "./dist/index.mjs",
@@ -168,9 +178,10 @@
   },
   "dependencies": {
     "drizzle-orm": "^0.45.2",
+    "icebird": "^0.8.5",
     "proper-lockfile": "^4.1.2",
-    "@gscdump/contracts": "0.20.2",
-    "gscdump": "0.20.2"
+    "@gscdump/contracts": "0.21.0",
+    "gscdump": "0.21.0"
   },
   "devDependencies": {
     "@duckdb/duckdb-wasm": "^1.32.0",