npm - @gscdump/engine - Versions diffs - 0.20.3 → 0.21.1 - Mend

@gscdump/engine 0.20.3 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/dist/_chunks/engine.mjs +1 -1
package/dist/_chunks/iceberg-schema.mjs +67 -0
package/dist/_chunks/registry.d.mts +1 -1
package/dist/_chunks/resolver.mjs +15 -21
package/dist/_chunks/schema.d.mts +452 -133
package/dist/_chunks/schema.mjs +50 -24
package/dist/_chunks/sink.d.mts +329 -0
package/dist/_chunks/storage.d.mts +4 -4
package/dist/adapters/duckdb-node.mjs +2 -2
package/dist/adapters/hyparquet.mjs +5 -5
package/dist/index.d.mts +39 -7
package/dist/index.mjs +272 -14
package/dist/ingest.d.mts +23 -3
package/dist/ingest.mjs +43 -18
package/dist/rollups.mjs +12 -12
package/dist/schema.d.mts +2 -2
package/dist/schema.mjs +2 -2
package/dist/sink-node.d.mts +31 -0
package/dist/sink-node.mjs +76 -0
package/dist/vendor/hysnappy-purejs.d.mts +29 -0
package/dist/vendor/hysnappy-purejs.mjs +13 -0
package/package.json +14 -3

package/dist/_chunks/schema.mjs CHANGED Viewed

@@ -12,7 +12,7 @@ const pages = pgTable("pages", {
 	date: dateCol(),
 	...metricCols()
 });
-const keywords = pgTable("keywords", {
+const queries = pgTable("queries", {
 	query: varchar("query").notNull(),
 	query_canonical: varchar("query_canonical"),
 	date: dateCol(),
@@ -23,18 +23,29 @@ const countries = pgTable("countries", {
 	date: dateCol(),
 	...metricCols()
 });
-const devices = pgTable("devices", {
-	device: varchar("device").notNull(),
-	date: dateCol(),
-	...metricCols()
-});
-const page_keywords = pgTable("page_keywords", {
+const page_queries = pgTable("page_queries", {
 	url: varchar("url").notNull(),
 	query: varchar("query").notNull(),
 	query_canonical: varchar("query_canonical"),
 	date: dateCol(),
 	...metricCols()
 });
+const dates = pgTable("dates", {
+	date: dateCol(),
+	clicks: integer("clicks").notNull(),
+	impressions: integer("impressions").notNull(),
+	sum_position: doublePrecision("sum_position").notNull(),
+	anonymized_impressions_pct: doublePrecision("anonymized_impressions_pct").notNull(),
+	clicks_desktop: integer("clicks_desktop").notNull(),
+	clicks_mobile: integer("clicks_mobile").notNull(),
+	clicks_tablet: integer("clicks_tablet").notNull(),
+	impressions_desktop: integer("impressions_desktop").notNull(),
+	impressions_mobile: integer("impressions_mobile").notNull(),
+	impressions_tablet: integer("impressions_tablet").notNull(),
+	sum_position_desktop: doublePrecision("sum_position_desktop").notNull(),
+	sum_position_mobile: doublePrecision("sum_position_mobile").notNull(),
+	sum_position_tablet: doublePrecision("sum_position_tablet").notNull()
+});
 const search_appearance = pgTable("search_appearance", {
 	searchAppearance: varchar("searchAppearance").notNull(),
 	date: dateCol(),
@@ -48,40 +59,50 @@ const hourly_pages = pgTable("hourly_pages", {
 });
 const drizzleSchema = {
 	pages,
-	keywords,
+	queries,
 	countries,
-	devices,
-	page_keywords,
+	page_queries,
+	dates,
 	search_appearance,
 	hourly_pages
 };
 const TABLE_METADATA = {
 	pages: {
 		sortKey: ["date", "url"],
+		clusterKey: ["url", "date"],
 		version: 1
 	},
-	keywords: {
+	queries: {
 		sortKey: ["date", "query"],
+		clusterKey: ["query", "date"],
 		version: 2
 	},
 	countries: {
 		sortKey: ["date", "country"],
+		clusterKey: ["country", "date"],
 		version: 1
 	},
-	devices: {
-		sortKey: ["date", "device"],
-		version: 1
-	},
-	page_keywords: {
+	page_queries: {
 		sortKey: [
 			"date",
 			"url",
 			"query"
 		],
+		clusterKey: [
+			"url",
+			"query",
+			"date"
+		],
 		version: 2
 	},
+	dates: {
+		sortKey: ["date"],
+		clusterKey: ["date"],
+		version: 1
+	},
 	search_appearance: {
 		sortKey: ["date", "searchAppearance"],
+		clusterKey: ["searchAppearance", "date"],
 		version: 1
 	},
 	hourly_pages: {
@@ -90,6 +111,11 @@ const TABLE_METADATA = {
 			"hour",
 			"url"
 		],
+		clusterKey: [
+			"url",
+			"date",
+			"hour"
+		],
 		version: 1
 	}
 };
@@ -118,10 +144,10 @@ function tableSchemaFrom(tableName) {
 }
 const METRIC_TABLES = [
 	"pages",
-	"keywords",
+	"queries",
 	"countries",
-	"devices",
-	"page_keywords",
+	"page_queries",
+	"dates",
 	"search_appearance",
 	"hourly_pages"
 ];
@@ -139,13 +165,13 @@ function inferTable(dimensions) {
 	const dims = new Set(dimensions);
 	const hasPage = dims.has("page");
 	const hasQuery = dims.has("query");
-	if (hasPage && hasQuery) return "page_keywords";
-	if (hasQuery) return "keywords";
+	if (hasPage && hasQuery) return "page_queries";
+	if (hasQuery) return "queries";
 	if (hasPage) return "pages";
 	if (dims.has("country")) return "countries";
-	if (dims.has("device")) return "devices";
+	if (dims.has("device")) return "dates";
 	if (dims.has("searchAppearance")) return "search_appearance";
-	return "pages";
+	return "dates";
 }
 function naturalKeyColumns(table) {
 	return TABLE_METADATA[table].sortKey;
@@ -166,4 +192,4 @@ function dimensionToColumn(dim, _table) {
 	if (dim === "queryCanonical") return "query_canonical";
 	return dim;
 }
-export { search_appearance as _, dimensionToColumn as a, schemaFor as c, devices as d, drizzleSchema as f, pages as g, page_keywords as h, dedupeByNaturalKey as i, TABLE_METADATA as l, keywords as m, allTables as n, inferTable as o, hourly_pages as p, currentSchemaVersion as r, naturalKeyColumns as s, SCHEMAS as t, countries as u };
+export { search_appearance as _, dimensionToColumn as a, schemaFor as c, dates as d, drizzleSchema as f, queries as g, pages as h, dedupeByNaturalKey as i, TABLE_METADATA as l, page_queries as m, allTables as n, inferTable as o, hourly_pages as p, currentSchemaVersion as r, naturalKeyColumns as s, SCHEMAS as t, countries as u };

package/dist/_chunks/sink.d.mts ADDED Viewed

@@ -0,0 +1,329 @@
+import { C as Row$1, N as TenantCtx$1, T as SearchType } from "./storage.mjs";
+import { icebergAppend, restCatalogConnect, s3SignedResolver } from "icebird";
+import { TableName } from "@gscdump/contracts";
+/** The 5 fact tables that exist as global Iceberg tables. */
+type IcebergTableName = Extract<TableName, 'pages' | 'queries' | 'countries' | 'page_queries' | 'dates'>;
+/** The 5 Iceberg table names, in canonical order. */
+declare const ICEBERG_TABLES: readonly IcebergTableName[];
+/**
+ * Iceberg-native column type. Superset-mapped from the engine `ColumnType`;
+ * `LONG` is Iceberg's name for 64-bit integers, `STRING` for varchar.
+ */
+type IcebergColumnType = 'STRING' | 'INT' | 'LONG' | 'DOUBLE' | 'DATE';
+interface IcebergColumn {
+  /** Column name as written into the Iceberg table (snake_case). */
+  name: string;
+  type: IcebergColumnType;
+  /** Iceberg field nullability. Partition identity columns are never null. */
+  required: boolean;
+  /**
+   * Stable Iceberg field id. Field ids — not names — are the schema-evolution
+   * identity in Iceberg; never reuse or renumber an id once a table is live.
+   */
+  fieldId: number;
+}
+/** Iceberg partition transform applied to a source column. */
+type IcebergPartitionTransform = 'identity' | 'month';
+interface IcebergPartitionField {
+  /** Source column the transform reads. */
+  sourceColumn: 'site_id' | 'search_type' | 'date';
+  transform: IcebergPartitionTransform;
+  /** Partition field name as it appears in Iceberg metadata. */
+  name: string;
+}
+interface IcebergTableSpec {
+  table: IcebergTableName;
+  columns: readonly IcebergColumn[];
+  /**
+   * Partition spec — shared by all 5 tables: identity(site_id),
+   * identity(search_type), month(date).
+   */
+  partitionSpec: readonly IcebergPartitionField[];
+  /**
+   * Natural-key columns: a row is uniquely identified by this tuple within
+   * its partition. Drives partition-overwrite revision correctness and
+   * dedup. Mirrors `TABLE_METADATA[table].sortKey` plus `site_id` +
+   * `search_type`.
+   */
+  identityColumns: readonly string[];
+}
+/**
+ * The two partition-identity columns Iceberg rows carry that the legacy
+ * parquet path encoded in the object-key prefix instead. Field ids 1–2 are
+ * the first two columns; per-table metric/dimension columns follow
+ * contiguously from id 3 (see `ICEBERG_FIELD_ID_BASE`).
+ */
+declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
+/**
+ * First field id used for per-table (non-partition) columns — immediately
+ * after the two partition-identity columns (`site_id`=1, `search_type`=2).
+ *
+ * ADVISORY ONLY. The icebird spike (2026-05-22) established that R2 Data
+ * Catalog's `createTable` endpoint re-assigns field ids sequentially and does
+ * NOT preserve caller-supplied ids. The contiguous numbering here matches what
+ * the catalog produces (`site_id`=1, `search_type`=2, data columns 3, 4, …) so
+ * the contract describes reality, but ids are authoritatively assigned by the
+ * catalog. Iceberg still guarantees ids are stable once a table exists.
+ */
+declare const ICEBERG_FIELD_ID_BASE = 3;
+/** Shared partition spec — identical across all 5 tables. */
+declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
+/**
+ * Derive the full Iceberg table spec for a table from the engine `SCHEMAS`
+ * (drizzle-derived column set) plus the shared partition-identity columns.
+ * Field ids are assigned deterministically from `ICEBERG_FIELD_ID_BASE` in
+ * column declaration order so the same schema always yields the same ids.
+ *
+ * CONTRACT NOTE: implementation agents must treat the RETURNED VALUE as the
+ * source of truth — do not hand-list columns elsewhere.
+ */
+declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
+/** All 5 Iceberg table specs, keyed by table name. */
+declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
+/** icebird's lowercase Iceberg primitive types (subset we use). */
+type IcebergPrimitiveType = 'string' | 'int' | 'long' | 'double' | 'date';
+/** A field in an icebird table `Schema`. */
+interface IcebergSchemaField {
+  id: number;
+  name: string;
+  required: boolean;
+  type: IcebergPrimitiveType;
+}
+/** An icebird table `Schema` (Iceberg `struct`). */
+interface IcebergSchema {
+  'type': 'struct';
+  'schema-id': number;
+  'fields': IcebergSchemaField[];
+}
+/** A field in an icebird `PartitionSpec`. */
+interface IcebergPartitionSpecField {
+  'source-id': number;
+  'field-id': number;
+  'name': string;
+  'transform': 'identity' | 'month';
+}
+/** An icebird `PartitionSpec`. */
+interface IcebergPartitionSpec {
+  'spec-id': number;
+  'fields': IcebergPartitionSpecField[];
+}
+/** S3-compatible credentials for the R2 warehouse. */
+interface IcebergS3Config {
+  /** R2 S3 endpoint, e.g. `https://<account>.r2.cloudflarestorage.com`. */
+  endpoint: string;
+  accessKeyId: string;
+  secretAccessKey: string;
+  /** Defaults to `'auto'` (R2's region). */
+  region?: string;
+}
+/** Everything needed to talk to the R2 Data Catalog. */
+interface IcebergCatalogConfig {
+  /** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
+  catalogUri: string;
+  /** Warehouse identifier, e.g. `<acct>_gscdump-analytics`. */
+  warehouse: string;
+  /** Catalog namespace the 5 fact tables live under (e.g. `gsc`). */
+  namespace: string;
+  /** Bearer token for the REST catalog (`R2_CATALOG_TOKEN`). */
+  catalogToken: string;
+  /** R2 S3 credentials for the warehouse objects. */
+  s3: IcebergS3Config;
+}
+/** The connected catalog context + a signed S3 resolver — the icebird call inputs. */
+interface IcebergConnection {
+  /** icebird REST catalog context, passed as `{ catalog }` to icebird write fns. */
+  catalog: Awaited<ReturnType<typeof restCatalogConnect>>;
+  /** icebird S3 resolver, passed as `{ resolver }` to icebird write fns. */
+  resolver: ReturnType<typeof s3SignedResolver>;
+  /** The namespace the fact tables live under. */
+  namespace: string;
+}
+/**
+ * Build the icebird `Schema` for one of the 5 fact tables from the frozen
+ * `ICEBERG_SCHEMAS` contract. Field ids are advisory — R2 Data Catalog
+ * re-assigns them on `createTable` (see `ICEBERG_FIELD_ID_BASE`).
+ */
+declare function icebergSchemaFor(table: IcebergTableName): IcebergSchema;
+/**
+ * Build the icebird `PartitionSpec` for one of the 5 fact tables: the locked
+ * spec `identity(site_id) + identity(search_type) + month(date)`. Each
+ * partition field's `source-id` is resolved to the real column field id from
+ * {@link icebergSchemaFor}.
+ */
+declare function icebergPartitionSpecFor(table: IcebergTableName): IcebergPartitionSpec;
+/**
+ * Connect to the R2 Data Catalog: a REST catalog context + a signed S3
+ * resolver. Runs in Node and in `workerd` — SigV4 is Web Crypto, I/O is
+ * `fetch`, no node builtins.
+ */
+declare function connectIcebergCatalog(config: IcebergCatalogConfig): Promise<IcebergConnection>;
+/** Tunable retry policy for {@link icebergAppendRetrying}. */
+interface CommitRetryOptions {
+  /** Total attempts, including the first. Default 6. */
+  maxAttempts?: number;
+  /** Base (ms) for the exponential back-off ceiling. Default 1000. */
+  baseDelayMs?: number;
+  /** Hard cap (ms) on the back-off ceiling. Default 20_000. */
+  maxDelayMs?: number;
+  /** Injectable sleep — tests pass a synchronous no-op. */
+  sleep?: (ms: number) => Promise<void>;
+  /** Injectable RNG for the jitter — tests pass a deterministic value. */
+  random?: () => number;
+}
+/**
+ * True when `err` is an R2 Data Catalog commit rate-limit response
+ * (`429 too many commits to this table`). Matches a numeric `status` of 429
+ * or the message text, so it holds whether icebird surfaces the raw HTTP
+ * error or a wrapped `Error`.
+ */
+declare function isCommitRateLimited(err: unknown): boolean;
+/**
+ * `icebergAppend` wrapped with retry on R2 Data Catalog 429 commit
+ * rate-limits, using full-jitter exponential back-off. icebird already
+ * retries 412/409 internally; 429 is the gap this closes. Non-429 errors
+ * (and 429s that survive every attempt) propagate unchanged.
+ *
+ * RESIDUAL RISK — re-upload orphans. icebird's `icebergAppend` prepares the
+ * data + manifest files ONCE, outside its internal 412/409 retry loop, so
+ * those retries never re-upload data. A 429 that escapes that loop and is
+ * retried HERE re-runs the whole `icebergAppend` call, which re-prepares and
+ * re-uploads the data files; the previous attempt's parquet objects become
+ * orphans (referenced by no snapshot). 429s should be rare and clear within
+ * an attempt or two, so orphan volume is small, and R2 orphan-file cleanup
+ * reclaims them. Eliminating the 429 source entirely (per-table commit
+ * coalescing) is assessed in the Phase-1.5 report.
+ */
+declare function icebergAppendRetrying(args: Parameters<typeof icebergAppend>[0], options?: CommitRetryOptions): Promise<void>;
+/** Outcome of a single table create/drop. */
+interface IcebergTableOpResult {
+  table: string;
+  ok: boolean;
+  /** Present when `ok` is false. */
+  error?: string;
+}
+/**
+ * Ensure the catalog namespace exists. Idempotent — an "already exists"
+ * response from the REST catalog is swallowed.
+ */
+declare function ensureIcebergNamespace(conn: IcebergConnection): Promise<void>;
+/**
+ * Create the global Iceberg fact tables with the locked partition spec
+ * (`identity(site_id) + identity(search_type) + month(date)`) and the schema
+ * derived from {@link ICEBERG_SCHEMAS}. Per-table errors are captured rather
+ * than thrown so a partial run is observable; "table already exists" surfaces
+ * as a failed result. Used by the app's one-off provisioning script.
+ */
+declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[]): Promise<IcebergTableOpResult[]>;
+/** List the table names currently in the catalog namespace. */
+declare function listIcebergTables(conn: IcebergConnection): Promise<string[]>;
+/**
+ * Drop tables from the catalog namespace, purging their data objects.
+ * Defaults to every table currently in the namespace — used to clear the
+ * wrong-spec Pipelines-provisioned `gsc.*` tables before re-creating them.
+ */
+declare function dropIcebergTables(conn: IcebergConnection, tables?: readonly string[]): Promise<IcebergTableOpResult[]>;
+/**
+ * Identifies one fact slice — the atomic unit a sink emits.
+ * `(table, site, searchType, date)`. `userId` rides along on `ctx` for
+ * tenant-scoped sinks (local/in-memory); the prod Iceberg table is global
+ * and keys only on `siteId`.
+ */
+interface SinkSlice {
+  ctx: TenantCtx$1;
+  table: IcebergTableName;
+  /** GSC search-type partition. */
+  searchType: SearchType;
+  /** Calendar day (PT), `YYYY-MM-DD`. The slice's `month(date)` partition. */
+  date: string;
+}
+/**
+ * Outcome of a sink write. `rowCount` is the number of rows accepted;
+ * `bytes` is best-effort — `IcebergAppendSink` does not report it (undefined there).
+ */
+interface SinkWriteResult {
+  rowCount: number;
+  bytes?: number;
+}
+/**
+ * Static description of a sink. All sinks are append-only under the v5
+ * stability-cutoff model; `appendOnly` is therefore always `true` and kept
+ * only as an explicit, self-documenting marker.
+ */
+interface SinkCapabilities {
+  /** Always `true` — re-emitting a slice accumulates duplicate rows. */
+  appendOnly: true;
+}
+/**
+ * Outcome of `Sink.close()` — which tables' buffered rows reached durable
+ * storage and which failed.
+ *
+ * `IcebergAppendSink.emit` only BUFFERS; the durable Iceberg commit happens
+ * in `close()`, one `icebergAppend()` per table. The ingest ledger
+ * (`sinkAsIngestEngine`) records a `(site, table, searchType, date)` slice
+ * ONLY after the table holding it appears in `flushed` — a table in `failed`
+ * leaves its slices un-recorded so the next sync re-emits them. This is what
+ * keeps the D1 ledger from ever running ahead of Iceberg.
+ */
+interface SinkCloseResult {
+  /** Tables whose buffered rows committed durably. */
+  flushed: IcebergTableName[];
+  /** Tables whose flush failed — their slices must NOT be ledger-recorded. */
+  failed: {
+    table: IcebergTableName;
+    error: string;
+  }[];
+}
+interface Sink {
+  readonly capabilities: SinkCapabilities;
+  /**
+   * Emit the fact rows for one slice. Append semantics — for `IcebergAppendSink`
+   * this only buffers the rows; the durable Iceberg commit happens in `close()`.
+   * Re-emitting the same slice produces DUPLICATE rows; exactly-once is enforced
+   * upstream by the D1 ingested-days ledger, which only calls `emit` for a slice once.
+   *
+   * `rows` carry the table's data columns; the sink injects the partition
+   * identity columns (`site_id`, `search_type`) from `slice` — callers MUST
+   * NOT pre-populate them.
+   */
+  emit: (slice: SinkSlice, rows: readonly Row$1[]) => Promise<SinkWriteResult>;
+  /**
+   * Flush any buffered rows and release resources, returning which tables
+   * reached durable storage. Idempotent — a second `close()` after a flush
+   * reports empty `flushed`/`failed` (nothing left buffered).
+   *
+   * `IcebergAppendSink` buffers in `emit` and commits one `icebergAppend()`
+   * per table HERE; `flushed`/`failed` reflect those per-table commits.
+   * In-memory / local sinks write durably in `emit`, so they report every
+   * table they received rows for as `flushed`.
+   */
+  close: () => Promise<SinkCloseResult>;
+}
+/** Construction options shared by all sink implementations. */
+interface SinkOptions {
+  now?: () => number;
+}
+/**
+ * `IcebergAppendSink`-specific options — the coordinates of the R2 Data Catalog
+ * that the sink appends to via `icebird`. The catalog config shape (`IcebergCatalogConfig`)
+ * is type-only here so this contract file stays implementation-free.
+ */
+interface IcebergAppendSinkOptions extends SinkOptions {
+  /** R2 Data Catalog connection config (catalog URI, warehouse, namespace, token, S3 creds). */
+  catalog: IcebergCatalogConfig;
+  /**
+   * Retry policy for the per-table `icebergAppend()` commit, applied on R2
+   * Data Catalog 429 ("too many commits") rate-limits. Optional — production
+   * uses the defaults; tests inject a synchronous `sleep`.
+   */
+  commitRetry?: CommitRetryOptions;
+}
+/** `LocalIcebergSink` options — points at the local Iceberg REST catalog. */
+interface LocalIcebergSinkOptions extends SinkOptions {
+  /** Iceberg REST catalog URI (POC: `apache/iceberg-rest-fixture`). */
+  catalogUri: string;
+  /** Catalog namespace the 5 tables live under. */
+  namespace: string;
+  /** S3-compatible warehouse location (POC: MinIO). */
+  warehouse: string;
+}
+export { ICEBERG_PARTITION_SPEC as A, icebergAppendRetrying as C, listIcebergTables as D, isCommitRateLimited as E, IcebergPartitionField as F, IcebergPartitionTransform as I, IcebergTableName as L, ICEBERG_TABLES as M, IcebergColumn as N, ICEBERG_FIELD_ID_BASE as O, IcebergColumnType as P, IcebergTableSpec as R, ensureIcebergNamespace as S, icebergSchemaFor as T, IcebergSchemaField as _, SinkCloseResult as a, createIcebergTables as b, SinkWriteResult as c, IcebergConnection as d, IcebergPartitionSpec as f, IcebergSchema as g, IcebergS3Config as h, SinkCapabilities as i, ICEBERG_SCHEMAS as j, ICEBERG_PARTITION_COLUMNS as k, CommitRetryOptions as l, IcebergPrimitiveType as m, LocalIcebergSinkOptions as n, SinkOptions as o, IcebergPartitionSpecField as p, Sink as r, SinkSlice as s, IcebergAppendSinkOptions as t, IcebergCatalogConfig as u, IcebergTableOpResult as v, icebergPartitionSpecFor as w, dropIcebergTables as x, connectIcebergCatalog as y, icebergTableSpec as z };

package/dist/_chunks/storage.d.mts CHANGED Viewed

@@ -395,8 +395,8 @@ interface QueryExecuteOptions {
   /**
    * Per-placeholder table identity. Used by the executor to emit a
    * schema-correct empty fallback when a named file set is empty: an
-   * `extraFiles` placeholder against `page_keywords` should fall back to
-   * the page_keywords schema, not the analyzer's primary `table`.
+   * `extraFiles` placeholder against `page_queries` should fall back to
+   * the page_queries schema, not the analyzer's primary `table`.
    */
   placeholderTables?: Record<string, TableName>;
   dataSource: DataSource;
@@ -512,8 +512,8 @@ interface StorageEngine {
   /**
    * GDPR URL-matcher purge. Deletes rows whose `url` column matches one of
    * `urls` across every live parquet entry for the tenant in tables that
-   * carry a `url` column (`pages`, `page_keywords`). Tables without a `url`
-   * column (`keywords`, `countries`, `devices`, `search_appearance`) are
+   * carry a `url` column (`pages`, `page_queries`). Tables without a `url`
+   * column (`queries`, `countries`, `dates`, `search_appearance`) are
    * untouched — they never store per-URL data.
    *
    * For each affected entry the engine reads the file, filters the matching

package/dist/adapters/duckdb-node.mjs CHANGED Viewed

@@ -1,10 +1,10 @@
 import { arrowToRows } from "../arrow-utils.mjs";
 import { createRequire } from "node:module";
-import { unlinkSync } from "node:fs";
-import { tmpdir } from "node:os";
 import { join } from "node:path";
 import process from "node:process";
 import { fileURLToPath } from "node:url";
+import { unlinkSync } from "node:fs";
+import { tmpdir } from "node:os";
 import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
 const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
 let singleton = null;

package/dist/adapters/hyparquet.mjs CHANGED Viewed

@@ -37,12 +37,12 @@ function compareValues(a, b) {
 	if (typeof a === "number" && typeof b === "number") return a - b;
 	return String(a) < String(b) ? -1 : 1;
 }
-function sortRowsBySortKey(table, rows) {
-	const sortKey = TABLE_METADATA[table].sortKey;
-	if (sortKey.length === 0 || rows.length <= 1) return rows;
+function sortRowsByClusterKey(table, rows) {
+	const clusterKey = TABLE_METADATA[table].clusterKey;
+	if (clusterKey.length === 0 || rows.length <= 1) return rows;
 	const copy = rows.slice();
 	copy.sort((a, b) => {
-		for (const col of sortKey) {
+		for (const col of clusterKey) {
 			const cmp = compareValues(a[col], b[col]);
 			if (cmp !== 0) return cmp;
 		}
@@ -52,7 +52,7 @@ function sortRowsBySortKey(table, rows) {
 }
 function encodeRowsToParquet(table, rows) {
 	const schema = SCHEMAS[table];
-	const sorted = sortRowsBySortKey(table, rows);
+	const sorted = sortRowsByClusterKey(table, rows);
 	const buffer = parquetWriteBuffer({
 		columnData: schema.columns.map((col) => {
 			const type = basicTypeFor(col.type);

package/dist/index.d.mts CHANGED Viewed

@@ -1,8 +1,9 @@
 import { A as SyncStateKind, B as hourPartition, C as Row, D as SyncState, E as StorageEngine, F as WatermarkFilter, G as RAW_DAILY_COMPACT_THRESHOLD, H as inferSearchType, I as WatermarkScope, J as enumeratePartitions, K as countRawDailies, L as WriteCtx, M as TableName, N as TenantCtx, O as SyncStateDetail, P as Watermark, R as WriteResult, S as QueryResult, T as SearchType, U as objectKey, V as inferLegacyTier, W as CompactionThresholds, Y as splitOverlappingTiers, _ as PurgeUrlsResult, a as EngineOptions, b as QueryExecuteResult, c as Grain, d as ManifestEntry, f as ManifestPurgeResult, g as PurgeResult, h as PurgeFilter, i as DataSource, j as SyncStateScope, k as SyncStateFilter, l as ListLiveFilter, m as ParquetCodec, n as CompactionTier, o as FileSetRef, p as ManifestStore, q as dedupeOverlappingTiers, r as DEFAULT_SEARCH_TYPE, s as GcCtx, t as CodecCtx, u as LockScope, v as QueryCtx, w as RunSQLOptions, x as QueryExecutor, y as QueryExecuteOptions, z as dayPartition } from "./_chunks/storage.mjs";
 import { a as createDuckDBExecutor, i as createDuckDBCodec, n as DuckDBHandle, r as canonicalEmptyParquetSchema, t as DuckDBFactory } from "./_chunks/duckdb.mjs";
-import { _ as hourly_pages, a as allTables, b as pages, c as dimensionToColumn, f as DrizzleSchema, g as drizzleSchema, h as devices, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, t as ColumnDef, v as keywords, y as page_keywords } from "./_chunks/schema.mjs";
+import { _ as hourly_pages, a as allTables, b as queries, c as dimensionToColumn, f as DrizzleSchema, g as drizzleSchema, h as dates, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, t as ColumnDef, v as page_queries, y as pages } from "./_chunks/schema.mjs";
 import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
-import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
+import { A as ICEBERG_PARTITION_SPEC, C as icebergAppendRetrying, D as listIcebergTables, E as isCommitRateLimited, F as IcebergPartitionField, I as IcebergPartitionTransform, L as IcebergTableName, M as ICEBERG_TABLES, N as IcebergColumn, O as ICEBERG_FIELD_ID_BASE, P as IcebergColumnType, R as IcebergTableSpec, S as ensureIcebergNamespace, T as icebergSchemaFor, _ as IcebergSchemaField, a as SinkCloseResult, b as createIcebergTables, c as SinkWriteResult, d as IcebergConnection, f as IcebergPartitionSpec, g as IcebergSchema, h as IcebergS3Config, i as SinkCapabilities, j as ICEBERG_SCHEMAS, k as ICEBERG_PARTITION_COLUMNS, l as CommitRetryOptions, m as IcebergPrimitiveType, n as LocalIcebergSinkOptions, o as SinkOptions, p as IcebergPartitionSpecField, r as Sink, s as SinkSlice, t as IcebergAppendSinkOptions, u as IcebergCatalogConfig, v as IcebergTableOpResult, w as icebergPartitionSpecFor, x as dropIcebergTables, y as connectIcebergCatalog, z as icebergTableSpec } from "./_chunks/sink.mjs";
+import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
 import { a as substituteNamedFiles, i as resolveParquetSQL, n as ResolvedQuery, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
 import { rebuildDailyFromHourly } from "./rollups.mjs";
 import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
@@ -122,16 +123,47 @@ interface CreateIngestAccumulatorOptions extends RowAccumulatorOptions {
 }
 declare function createNoopIngestAccumulator(): IngestAccumulator;
 declare function createIngestAccumulator(opts: CreateIngestAccumulatorOptions): IngestAccumulator;
-type SyncTableName = Extract<TableName$1, 'pages' | 'keywords' | 'countries' | 'devices' | 'page_keywords'>;
+type IcebergAppendSink = Sink;
+/**
+ * Create an `IcebergAppendSink` over the R2 Data Catalog.
+ *
+ * `emit` buffers; `close()` commits one `icebergAppend()` per table touched.
+ * The catalog connection (REST context + signed S3 resolver) is established
+ * lazily on the first flush and reused — a sink that is opened and closed
+ * with no rows never touches the network.
+ */
+declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
+/** A row as stored by the fake — data columns plus the injected identity columns. */
+type StoredRow = Row & {
+  site_id: string;
+  search_type: string;
+};
+interface InMemorySink extends Sink {
+  /** All rows accepted, in partition order. */
+  readonly rows: readonly StoredRow[];
+  /** Rows for one table. */
+  rowsFor: (table: IcebergTableName) => StoredRow[];
+  /** Rows for one exact slice partition. */
+  rowsForSlice: (slice: SinkSlice) => StoredRow[];
+  /** `true` once `close()` has run. */
+  readonly closed: boolean;
+  /** Drop everything — handy between test cases. */
+  reset: () => void;
+}
+/**
+ * Create an in-memory append-only `Sink`.
+ */
+declare function createInMemorySink(): InMemorySink;
+type SyncTableName = Extract<TableName$1, 'pages' | 'queries' | 'countries' | 'page_queries' | 'dates'>;
 declare const TABLES_BY_SEARCH_TYPE: Record<SearchType, readonly SyncTableName[]>;
 declare function parseEnabledSearchTypes(raw: string | null | undefined): SearchType[];
 declare function validateEnabledSearchTypes(value: unknown): SearchType[];
 declare const TABLE_TIERS: {
   readonly pages: "critical";
-  readonly keywords: "critical";
+  readonly queries: "critical";
   readonly countries: "standard";
-  readonly devices: "standard";
-  readonly page_keywords: "extended";
+  readonly dates: "standard";
+  readonly page_queries: "extended";
 };
 type TieredTableName = keyof typeof TABLE_TIERS;
 type TableTier = 'critical' | 'standard' | 'extended';
@@ -147,4 +179,4 @@ declare const MIN_SYNC_IMPRESSIONS = 1;
 declare const MIN_COUNTRY_IMPRESSIONS = 10;
 declare const MAX_SITEMAP_URLS_PER_SITE = 50000;
 declare const MAX_TRACKED_URLS_PER_SITE = 200000;
-export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIngestAccumulator, createNoopIngestAccumulator, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, dedupeOverlappingTiers, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, 
getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, parseEnabledSearchTypes, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
+export { type CodecCtx, type ColumnDef, type ColumnType, type CommitRetryOptions, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LocalIcebergSinkOptions, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type 
SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createRowAccumulator, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };