npm - @gscdump/engine - Versions diffs - 0.24.1 → 0.25.1 - Mend

@gscdump/engine 0.24.1 → 0.25.1

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (23) hide show

package/dist/_chunks/compaction.mjs +247 -0
package/dist/_chunks/engine.mjs +22 -4
package/dist/_chunks/parquet-plan.mjs +3 -248
package/dist/_chunks/resolver.mjs +3 -3
package/dist/_chunks/{iceberg-schema.mjs → schema2.mjs} +9 -2
package/dist/_chunks/sink.d.mts +11 -1
package/dist/_chunks/source.mjs +1 -1
package/dist/_chunks/storage.d.mts +24 -33
package/dist/adapters/filesystem.mjs +1 -1
package/dist/adapters/node.mjs +1 -1
package/dist/adapters/r2-manifest.mjs +1 -1
package/dist/compaction-public.d.mts +15 -0
package/dist/compaction-public.mjs +5 -0
package/dist/iceberg/index.d.mts +12 -0
package/dist/iceberg/index.mjs +269 -0
package/dist/index.d.mts +30 -29
package/dist/index.mjs +5 -272
package/dist/planner.mjs +2 -1
package/dist/rollups.mjs +1 -1
package/dist/sink-node.d.mts +1 -1
package/dist/sink-node.mjs +1 -1
package/package.json +13 -8
package/dist/_chunks/{storage.mjs → layout.mjs} +11 -11

package/dist/index.mjs CHANGED Viewed

@@ -1,189 +1,15 @@
 import { ENGINE_QUERY_CAPABILITIES, coerceRow, coerceRows, createSqlQuerySource } from "./_chunks/source.mjs";
+import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/layout.mjs";
 import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
-import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/storage.mjs";
-import { FILES_PLACEHOLDER, RAW_DAILY_COMPACT_THRESHOLD, countRawDailies, dedupeOverlappingTiers, enumeratePartitions, resolveParquetSQL, splitOverlappingTiers, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
+import { enumeratePartitions } from "./_chunks/compaction.mjs";
+import { FILES_PLACEHOLDER, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
 import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
-import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine, gcOrphansImpl } from "./_chunks/engine.mjs";
-import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, icebergTableSpec } from "./_chunks/iceberg-schema.mjs";
+import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "./_chunks/engine.mjs";
 import { assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
 import "./planner.mjs";
 import { createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "./_chunks/resolver.mjs";
 import { rebuildDailyFromHourly } from "./rollups.mjs";
 import { fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
-import { icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "icebird";
-const ICEBERG_TYPE_MAP = {
-	STRING: "string",
-	INT: "int",
-	LONG: "long",
-	DOUBLE: "double",
-	DATE: "date"
-};
-function icebergSchemaFor(table) {
-	return {
-		"type": "struct",
-		"schema-id": 0,
-		"fields": ICEBERG_SCHEMAS[table].columns.map((col) => ({
-			id: col.fieldId,
-			name: col.name,
-			required: col.required,
-			type: ICEBERG_TYPE_MAP[col.type]
-		}))
-	};
-}
-function icebergPartitionSpecFor(table) {
-	const fields = ICEBERG_SCHEMAS[table].columns;
-	const fieldId = (name) => {
-		const col = fields.find((c) => c.name === name);
-		if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
-		return col.fieldId;
-	};
-	return {
-		"spec-id": 0,
-		"fields": ICEBERG_PARTITION_SPEC.map((p, i) => ({
-			"source-id": fieldId(p.sourceColumn),
-			"field-id": 1e3 + i,
-			"name": p.name,
-			"transform": p.transform
-		}))
-	};
-}
-async function connectIcebergCatalog(config) {
-	return {
-		catalog: await restCatalogConnect({
-			url: config.catalogUri,
-			warehouse: config.warehouse,
-			requestInit: { headers: { Authorization: `Bearer ${config.catalogToken}` } }
-		}),
-		resolver: s3SignedResolver({
-			accessKeyId: config.s3.accessKeyId,
-			secretAccessKey: config.s3.secretAccessKey,
-			region: config.s3.region ?? "auto",
-			endpoint: config.s3.endpoint,
-			pathStyle: true
-		}),
-		namespace: config.namespace
-	};
-}
-function isCommitRateLimited(err) {
-	if (err && typeof err === "object" && err.status === 429) return true;
-	const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
-	return msg.includes("429") || msg.includes("too many commits") || msg.includes("rate limit");
-}
-function defaultCommitSleep(ms) {
-	return new Promise((resolve) => setTimeout(resolve, ms));
-}
-async function icebergAppendRetrying(args, options = {}) {
-	const maxAttempts = options.maxAttempts ?? 6;
-	const baseDelayMs = options.baseDelayMs ?? 1e3;
-	const maxDelayMs = options.maxDelayMs ?? 2e4;
-	const sleep = options.sleep ?? defaultCommitSleep;
-	const random = options.random ?? Math.random;
-	for (let attempt = 0; attempt < maxAttempts; attempt++) {
-		const err = await icebergAppend(args).then(() => void 0, (e) => e);
-		if (err === void 0) return;
-		if (!isCommitRateLimited(err) || attempt === maxAttempts - 1) throw err;
-		const ceiling = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt);
-		await sleep(Math.floor(random() * ceiling));
-	}
-}
-async function ensureIcebergNamespace(conn) {
-	await restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace }).catch(() => {});
-}
-async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
-	const results = [];
-	for (const table of tables) await icebergCreateTable({
-		catalog: conn.catalog,
-		namespace: conn.namespace,
-		table,
-		schema: icebergSchemaFor(table),
-		partitionSpec: icebergPartitionSpecFor(table)
-	}).then(() => results.push({
-		table,
-		ok: true
-	}), (e) => results.push({
-		table,
-		ok: false,
-		error: String(e)
-	}));
-	return results;
-}
-async function listIcebergTables(conn) {
-	return restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then((list) => list.map((t) => t.name).sort(), () => []);
-}
-function monthsInRange(range) {
-	const [sy, sm] = range.start.split("-").map(Number);
-	const [ey, em] = range.end.split("-").map(Number);
-	const out = [];
-	let y = sy;
-	let m = sm;
-	while (y < ey || y === ey && m <= em) {
-		out.push(`${y}-${String(m).padStart(2, "0")}`);
-		m++;
-		if (m > 12) {
-			m = 1;
-			y++;
-		}
-	}
-	return out;
-}
-function monthsSinceEpoch(ym) {
-	const [y, m] = ym.split("-").map(Number);
-	return (y - 1970) * 12 + (m - 1);
-}
-function stripBucket(filePath) {
-	if (!filePath.startsWith("s3://")) return filePath;
-	const rest = filePath.slice(5);
-	const slash = rest.indexOf("/");
-	return slash >= 0 ? rest.slice(slash + 1) : rest;
-}
-async function listIcebergDataFiles(conn, opts) {
-	const { metadata } = await restCatalogLoadTable(conn.catalog, {
-		namespace: conn.namespace,
-		table: opts.table
-	});
-	if (metadata["current-snapshot-id"] == null) return [];
-	const wantedMonths = new Set(monthsInRange(opts.range).map(monthsSinceEpoch));
-	const manifests = await icebergManifests({
-		metadata,
-		resolver: conn.resolver
-	});
-	const out = [];
-	for (const m of manifests) for (const entry of m.entries) {
-		if (entry.status === 2) continue;
-		const df = entry.data_file;
-		if (df.content !== 0) continue;
-		const part = df.partition;
-		if (part.site_id !== opts.siteId) continue;
-		if (part.search_type !== opts.searchType) continue;
-		const month = part.date_month;
-		if (typeof month !== "number" || !wantedMonths.has(month)) continue;
-		out.push({
-			filePath: df.file_path,
-			objectKey: stripBucket(df.file_path),
-			bytes: Number(df.file_size_in_bytes),
-			rowCount: Number(df.record_count)
-		});
-	}
-	return out;
-}
-async function dropIcebergTables(conn, tables) {
-	const targets = tables ?? await restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then((list) => list.map((t) => t.name), () => []);
-	const results = [];
-	for (const table of targets) await icebergDropTable({
-		catalog: conn.catalog,
-		namespace: conn.namespace,
-		table,
-		purgeRequested: true
-	}).then(() => results.push({
-		table,
-		ok: true
-	}), (e) => results.push({
-		table,
-		ok: false,
-		error: String(e)
-	}));
-	return results;
-}
 const NOOP_RESULT = {
 	flushed: 0,
 	recovered: 0,
@@ -299,99 +125,6 @@ function createIngestAccumulator(opts) {
 		}
 	};
 }
-const DAY_MILLIS = 864e5;
-function toIcebergDate(value) {
-	if (typeof value === "string") {
-		const ms = Date.parse(`${value}T00:00:00Z`);
-		if (Number.isNaN(ms)) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
-		return Math.floor(ms / DAY_MILLIS);
-	}
-	if (value instanceof Date) {
-		const ms = value.getTime();
-		if (Number.isNaN(ms)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
-		return Math.floor(ms / DAY_MILLIS);
-	}
-	return value;
-}
-function coerceJsonSafe(value) {
-	if (typeof value === "bigint") return Number(value);
-	return value;
-}
-function toRecords(slice, rows) {
-	const siteId = slice.ctx.siteId ?? "";
-	return rows.map((row) => {
-		const out = {};
-		for (const k in row) out[k] = coerceJsonSafe(row[k]);
-		out.date = toIcebergDate(out.date);
-		out.site_id = siteId;
-		out.search_type = slice.searchType;
-		return out;
-	});
-}
-function createIcebergAppendSink(options) {
-	let connection;
-	const buffers = /* @__PURE__ */ new Map();
-	function connect() {
-		connection ??= connectIcebergCatalog(options.catalog);
-		return connection;
-	}
-	return {
-		capabilities: { appendOnly: true },
-		async emit(slice, rows) {
-			if (rows.length === 0) return { rowCount: 0 };
-			const records = toRecords(slice, rows);
-			const buffer = buffers.get(slice.table);
-			if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
-			else buffers.set(slice.table, records);
-			return { rowCount: records.length };
-		},
-		async close() {
-			const flushed = [];
-			const failed = [];
-			if (buffers.size === 0) return {
-				flushed,
-				failed
-			};
-			const conn = await connect().then((c) => c, (err) => {
-				connection = void 0;
-				return { error: String(err) };
-			});
-			if ("error" in conn) {
-				for (const [table, records] of buffers) if (records.length > 0) failed.push({
-					table,
-					error: conn.error
-				});
-				buffers.clear();
-				return {
-					flushed,
-					failed
-				};
-			}
-			for (const [table, records] of buffers) {
-				if (records.length === 0) continue;
-				await icebergAppendRetrying({
-					catalog: conn.catalog,
-					namespace: conn.namespace,
-					table,
-					resolver: conn.resolver,
-					records
-				}, options.commitRetry).then(() => {
-					flushed.push(table);
-				}, (err) => {
-					failed.push({
-						table,
-						error: String(err)
-					});
-				});
-			}
-			buffers.clear();
-			return {
-				flushed,
-				failed
-			};
-		}
-	};
-}
 const KEY_SEP = "\0";
 function partitionKey(slice) {
 	return [
@@ -544,4 +277,4 @@ const MIN_SYNC_IMPRESSIONS = 1;
 const MIN_COUNTRY_IMPRESSIONS = 10;
 const MAX_SITEMAP_URLS_PER_SITE = 5e4;
 const MAX_TRACKED_URLS_PER_SITE = 2e5;
-export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergResolverAdapter, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergDataFiles, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
+export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };

package/dist/planner.mjs CHANGED Viewed

@@ -1,2 +1,3 @@
-import { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
+import { enumeratePartitions } from "./_chunks/compaction.mjs";
+import { FILES_PLACEHOLDER, compileLogicalQueryPlan, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
 export { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveParquetSQL, substituteNamedFiles };

package/dist/rollups.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import "./_chunks/storage.mjs";
+import "./_chunks/layout.mjs";
 import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
 import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
 import { MS_PER_DAY } from "gscdump";

package/dist/sink-node.d.mts CHANGED Viewed

@@ -20,4 +20,4 @@ interface LocalIcebergSink extends Sink {
  * use this sink must skip when the stack is unreachable.
  */
 declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
-export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
+export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, type LocalIcebergSinkOptions, createLocalIcebergSink };

package/dist/sink-node.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { ICEBERG_SCHEMAS } from "./_chunks/iceberg-schema.mjs";
+import { ICEBERG_SCHEMAS } from "./_chunks/schema2.mjs";
 import { execFile } from "node:child_process";
 import { dirname, join } from "node:path";
 import process from "node:process";

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@gscdump/engine",
   "type": "module",
-  "version": "0.24.1",
+  "version": "0.25.1",
   "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
   "author": {
     "name": "Harlan Wilton",
@@ -41,6 +41,11 @@
       "import": "./dist/planner.mjs",
       "default": "./dist/planner.mjs"
     },
+    "./compaction": {
+      "types": "./dist/compaction-public.d.mts",
+      "import": "./dist/compaction-public.mjs",
+      "default": "./dist/compaction-public.mjs"
+    },
     "./schema": {
       "types": "./dist/schema.d.mts",
       "import": "./dist/schema.mjs",
@@ -66,11 +71,6 @@
       "import": "./dist/sql-fragments.mjs",
       "default": "./dist/sql-fragments.mjs"
     },
-    "./schedule": {
-      "types": "./dist/schedule.d.mts",
-      "import": "./dist/schedule.mjs",
-      "default": "./dist/schedule.mjs"
-    },
     "./entities": {
       "types": "./dist/entities.d.mts",
       "import": "./dist/entities.mjs",
@@ -81,6 +81,11 @@
       "import": "./dist/rollups.mjs",
       "default": "./dist/rollups.mjs"
     },
+    "./iceberg": {
+      "types": "./dist/iceberg/index.d.mts",
+      "import": "./dist/iceberg/index.mjs",
+      "default": "./dist/iceberg/index.mjs"
+    },
     "./node": {
       "types": "./dist/adapters/node.d.mts",
       "import": "./dist/adapters/node.mjs",
@@ -180,8 +185,8 @@
     "drizzle-orm": "1.0.0-rc.3",
     "icebird": "^0.8.6",
     "proper-lockfile": "^4.1.2",
-    "@gscdump/contracts": "0.24.1",
-    "gscdump": "0.24.1"
+    "@gscdump/contracts": "0.25.1",
+    "gscdump": "0.25.1"
   },
   "devDependencies": {
     "@duckdb/duckdb-wasm": "^1.32.0",

package/dist/_chunks/{storage.mjs → layout.mjs} RENAMED Viewed

@@ -1,13 +1,4 @@
 import { MS_PER_DAY, toIsoDate } from "gscdump";
-const DEFAULT_SEARCH_TYPE = "web";
-function inferSearchType(entry) {
-	return entry.searchType ?? "web";
-}
-function inferLegacyTier(entry) {
-	if (entry.tier !== void 0) return entry.tier;
-	if (entry.partition.startsWith("daily/")) return "raw";
-	if (entry.partition.startsWith("monthly/")) return "d30";
-}
 function dayPartition(date) {
 	return `daily/${date}`;
 }
@@ -33,10 +24,19 @@ function quarterOfMonth(month) {
 	const [y, m] = month.split("-").map(Number);
 	return `${y}-Q${Math.floor((m - 1) / 3) + 1}`;
 }
+function tenantPrefix(ctx) {
+	return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
+}
+const DEFAULT_SEARCH_TYPE = "web";
 function objectKey(ctx, table, partition, version, searchType) {
 	return `${ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/${table}` : `u_${ctx.userId}/${table}`}/${searchType !== void 0 && searchType !== "web" ? `${searchType}/` : ""}${partition}__v${version}.parquet`;
 }
-function tenantPrefix(ctx) {
-	return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
+function inferSearchType(entry) {
+	return entry.searchType ?? "web";
+}
+function inferLegacyTier(entry) {
+	if (entry.tier !== void 0) return entry.tier;
+	if (entry.partition.startsWith("daily/")) return "raw";
+	if (entry.partition.startsWith("monthly/")) return "d30";
 }
 export { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, tenantPrefix, weekPartition };