@gscdump/engine 0.20.3 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ const pages = pgTable("pages", {
12
12
  date: dateCol(),
13
13
  ...metricCols()
14
14
  });
15
- const keywords = pgTable("keywords", {
15
+ const queries = pgTable("queries", {
16
16
  query: varchar("query").notNull(),
17
17
  query_canonical: varchar("query_canonical"),
18
18
  date: dateCol(),
@@ -23,18 +23,29 @@ const countries = pgTable("countries", {
23
23
  date: dateCol(),
24
24
  ...metricCols()
25
25
  });
26
- const devices = pgTable("devices", {
27
- device: varchar("device").notNull(),
28
- date: dateCol(),
29
- ...metricCols()
30
- });
31
- const page_keywords = pgTable("page_keywords", {
26
+ const page_queries = pgTable("page_queries", {
32
27
  url: varchar("url").notNull(),
33
28
  query: varchar("query").notNull(),
34
29
  query_canonical: varchar("query_canonical"),
35
30
  date: dateCol(),
36
31
  ...metricCols()
37
32
  });
33
+ const dates = pgTable("dates", {
34
+ date: dateCol(),
35
+ clicks: integer("clicks").notNull(),
36
+ impressions: integer("impressions").notNull(),
37
+ sum_position: doublePrecision("sum_position").notNull(),
38
+ anonymized_impressions_pct: doublePrecision("anonymized_impressions_pct").notNull(),
39
+ clicks_desktop: integer("clicks_desktop").notNull(),
40
+ clicks_mobile: integer("clicks_mobile").notNull(),
41
+ clicks_tablet: integer("clicks_tablet").notNull(),
42
+ impressions_desktop: integer("impressions_desktop").notNull(),
43
+ impressions_mobile: integer("impressions_mobile").notNull(),
44
+ impressions_tablet: integer("impressions_tablet").notNull(),
45
+ sum_position_desktop: doublePrecision("sum_position_desktop").notNull(),
46
+ sum_position_mobile: doublePrecision("sum_position_mobile").notNull(),
47
+ sum_position_tablet: doublePrecision("sum_position_tablet").notNull()
48
+ });
38
49
  const search_appearance = pgTable("search_appearance", {
39
50
  searchAppearance: varchar("searchAppearance").notNull(),
40
51
  date: dateCol(),
@@ -48,40 +59,50 @@ const hourly_pages = pgTable("hourly_pages", {
48
59
  });
49
60
  const drizzleSchema = {
50
61
  pages,
51
- keywords,
62
+ queries,
52
63
  countries,
53
- devices,
54
- page_keywords,
64
+ page_queries,
65
+ dates,
55
66
  search_appearance,
56
67
  hourly_pages
57
68
  };
58
69
  const TABLE_METADATA = {
59
70
  pages: {
60
71
  sortKey: ["date", "url"],
72
+ clusterKey: ["url", "date"],
61
73
  version: 1
62
74
  },
63
- keywords: {
75
+ queries: {
64
76
  sortKey: ["date", "query"],
77
+ clusterKey: ["query", "date"],
65
78
  version: 2
66
79
  },
67
80
  countries: {
68
81
  sortKey: ["date", "country"],
82
+ clusterKey: ["country", "date"],
69
83
  version: 1
70
84
  },
71
- devices: {
72
- sortKey: ["date", "device"],
73
- version: 1
74
- },
75
- page_keywords: {
85
+ page_queries: {
76
86
  sortKey: [
77
87
  "date",
78
88
  "url",
79
89
  "query"
80
90
  ],
91
+ clusterKey: [
92
+ "url",
93
+ "query",
94
+ "date"
95
+ ],
81
96
  version: 2
82
97
  },
98
+ dates: {
99
+ sortKey: ["date"],
100
+ clusterKey: ["date"],
101
+ version: 1
102
+ },
83
103
  search_appearance: {
84
104
  sortKey: ["date", "searchAppearance"],
105
+ clusterKey: ["searchAppearance", "date"],
85
106
  version: 1
86
107
  },
87
108
  hourly_pages: {
@@ -90,6 +111,11 @@ const TABLE_METADATA = {
90
111
  "hour",
91
112
  "url"
92
113
  ],
114
+ clusterKey: [
115
+ "url",
116
+ "date",
117
+ "hour"
118
+ ],
93
119
  version: 1
94
120
  }
95
121
  };
@@ -118,10 +144,10 @@ function tableSchemaFrom(tableName) {
118
144
  }
119
145
  const METRIC_TABLES = [
120
146
  "pages",
121
- "keywords",
147
+ "queries",
122
148
  "countries",
123
- "devices",
124
- "page_keywords",
149
+ "page_queries",
150
+ "dates",
125
151
  "search_appearance",
126
152
  "hourly_pages"
127
153
  ];
@@ -139,13 +165,13 @@ function inferTable(dimensions) {
139
165
  const dims = new Set(dimensions);
140
166
  const hasPage = dims.has("page");
141
167
  const hasQuery = dims.has("query");
142
- if (hasPage && hasQuery) return "page_keywords";
143
- if (hasQuery) return "keywords";
168
+ if (hasPage && hasQuery) return "page_queries";
169
+ if (hasQuery) return "queries";
144
170
  if (hasPage) return "pages";
145
171
  if (dims.has("country")) return "countries";
146
- if (dims.has("device")) return "devices";
172
+ if (dims.has("device")) return "dates";
147
173
  if (dims.has("searchAppearance")) return "search_appearance";
148
- return "pages";
174
+ return "dates";
149
175
  }
150
176
  function naturalKeyColumns(table) {
151
177
  return TABLE_METADATA[table].sortKey;
@@ -166,4 +192,4 @@ function dimensionToColumn(dim, _table) {
166
192
  if (dim === "queryCanonical") return "query_canonical";
167
193
  return dim;
168
194
  }
169
- export { search_appearance as _, dimensionToColumn as a, schemaFor as c, devices as d, drizzleSchema as f, pages as g, page_keywords as h, dedupeByNaturalKey as i, TABLE_METADATA as l, keywords as m, allTables as n, inferTable as o, hourly_pages as p, currentSchemaVersion as r, naturalKeyColumns as s, SCHEMAS as t, countries as u };
195
+ export { search_appearance as _, dimensionToColumn as a, schemaFor as c, dates as d, drizzleSchema as f, queries as g, pages as h, dedupeByNaturalKey as i, TABLE_METADATA as l, page_queries as m, allTables as n, inferTable as o, hourly_pages as p, currentSchemaVersion as r, naturalKeyColumns as s, SCHEMAS as t, countries as u };
@@ -0,0 +1,329 @@
1
+ import { C as Row$1, N as TenantCtx$1, T as SearchType } from "./storage.mjs";
2
+ import { icebergAppend, restCatalogConnect, s3SignedResolver } from "icebird";
3
+ import { TableName } from "@gscdump/contracts";
4
+ /** The 5 fact tables that exist as global Iceberg tables. */
5
+ type IcebergTableName = Extract<TableName, 'pages' | 'queries' | 'countries' | 'page_queries' | 'dates'>;
6
+ /** The 5 Iceberg table names, in canonical order. */
7
+ declare const ICEBERG_TABLES: readonly IcebergTableName[];
8
+ /**
9
+ * Iceberg-native column type. Superset-mapped from the engine `ColumnType`;
10
+ * `LONG` is Iceberg's name for 64-bit integers, `STRING` for varchar.
11
+ */
12
+ type IcebergColumnType = 'STRING' | 'INT' | 'LONG' | 'DOUBLE' | 'DATE';
13
+ interface IcebergColumn {
14
+ /** Column name as written into the Iceberg table (snake_case). */
15
+ name: string;
16
+ type: IcebergColumnType;
17
+ /** Iceberg field nullability. Partition identity columns are never null. */
18
+ required: boolean;
19
+ /**
20
+ * Stable Iceberg field id. Field ids — not names — are the schema-evolution
21
+ * identity in Iceberg; never reuse or renumber an id once a table is live.
22
+ */
23
+ fieldId: number;
24
+ }
25
+ /** Iceberg partition transform applied to a source column. */
26
+ type IcebergPartitionTransform = 'identity' | 'month';
27
+ interface IcebergPartitionField {
28
+ /** Source column the transform reads. */
29
+ sourceColumn: 'site_id' | 'search_type' | 'date';
30
+ transform: IcebergPartitionTransform;
31
+ /** Partition field name as it appears in Iceberg metadata. */
32
+ name: string;
33
+ }
34
+ interface IcebergTableSpec {
35
+ table: IcebergTableName;
36
+ columns: readonly IcebergColumn[];
37
+ /**
38
+ * Partition spec — shared by all 5 tables: identity(site_id),
39
+ * identity(search_type), month(date).
40
+ */
41
+ partitionSpec: readonly IcebergPartitionField[];
42
+ /**
43
+ * Natural-key columns: a row is uniquely identified by this tuple within
44
+ * its partition. Drives partition-overwrite revision correctness and
45
+ * dedup. Mirrors `TABLE_METADATA[table].sortKey` plus `site_id` +
46
+ * `search_type`.
47
+ */
48
+ identityColumns: readonly string[];
49
+ }
50
+ /**
51
+ * The two partition-identity columns Iceberg rows carry that the legacy
52
+ * parquet path encoded in the object-key prefix instead. Field ids 1–2 are
53
+ * the first two columns; per-table metric/dimension columns follow
54
+ * contiguously from id 3 (see `ICEBERG_FIELD_ID_BASE`).
55
+ */
56
+ declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
57
+ /**
58
+ * First field id used for per-table (non-partition) columns — immediately
59
+ * after the two partition-identity columns (`site_id`=1, `search_type`=2).
60
+ *
61
+ * ADVISORY ONLY. The icebird spike (2026-05-22) established that R2 Data
62
+ * Catalog's `createTable` endpoint re-assigns field ids sequentially and does
63
+ * NOT preserve caller-supplied ids. The contiguous numbering here matches what
64
+ * the catalog produces (`site_id`=1, `search_type`=2, data columns 3, 4, …) so
65
+ * the contract describes reality, but ids are authoritatively assigned by the
66
+ * catalog. Iceberg still guarantees ids are stable once a table exists.
67
+ */
68
+ declare const ICEBERG_FIELD_ID_BASE = 3;
69
+ /** Shared partition spec — identical across all 5 tables. */
70
+ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
71
+ /**
72
+ * Derive the full Iceberg table spec for a table from the engine `SCHEMAS`
73
+ * (drizzle-derived column set) plus the shared partition-identity columns.
74
+ * Field ids are assigned deterministically from `ICEBERG_FIELD_ID_BASE` in
75
+ * column declaration order so the same schema always yields the same ids.
76
+ *
77
+ * CONTRACT NOTE: implementation agents must treat the RETURNED VALUE as the
78
+ * source of truth — do not hand-list columns elsewhere.
79
+ */
80
+ declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
81
+ /** All 5 Iceberg table specs, keyed by table name. */
82
+ declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
83
+ /** icebird's lowercase Iceberg primitive types (subset we use). */
84
+ type IcebergPrimitiveType = 'string' | 'int' | 'long' | 'double' | 'date';
85
+ /** A field in an icebird table `Schema`. */
86
+ interface IcebergSchemaField {
87
+ id: number;
88
+ name: string;
89
+ required: boolean;
90
+ type: IcebergPrimitiveType;
91
+ }
92
+ /** An icebird table `Schema` (Iceberg `struct`). */
93
+ interface IcebergSchema {
94
+ 'type': 'struct';
95
+ 'schema-id': number;
96
+ 'fields': IcebergSchemaField[];
97
+ }
98
+ /** A field in an icebird `PartitionSpec`. */
99
+ interface IcebergPartitionSpecField {
100
+ 'source-id': number;
101
+ 'field-id': number;
102
+ 'name': string;
103
+ 'transform': 'identity' | 'month';
104
+ }
105
+ /** An icebird `PartitionSpec`. */
106
+ interface IcebergPartitionSpec {
107
+ 'spec-id': number;
108
+ 'fields': IcebergPartitionSpecField[];
109
+ }
110
+ /** S3-compatible credentials for the R2 warehouse. */
111
+ interface IcebergS3Config {
112
+ /** R2 S3 endpoint, e.g. `https://<account>.r2.cloudflarestorage.com`. */
113
+ endpoint: string;
114
+ accessKeyId: string;
115
+ secretAccessKey: string;
116
+ /** Defaults to `'auto'` (R2's region). */
117
+ region?: string;
118
+ }
119
+ /** Everything needed to talk to the R2 Data Catalog. */
120
+ interface IcebergCatalogConfig {
121
+ /** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
122
+ catalogUri: string;
123
+ /** Warehouse identifier, e.g. `<acct>_gscdump-analytics`. */
124
+ warehouse: string;
125
+ /** Catalog namespace the 5 fact tables live under (e.g. `gsc`). */
126
+ namespace: string;
127
+ /** Bearer token for the REST catalog (`R2_CATALOG_TOKEN`). */
128
+ catalogToken: string;
129
+ /** R2 S3 credentials for the warehouse objects. */
130
+ s3: IcebergS3Config;
131
+ }
132
+ /** The connected catalog context + a signed S3 resolver — the icebird call inputs. */
133
+ interface IcebergConnection {
134
+ /** icebird REST catalog context, passed as `{ catalog }` to icebird write fns. */
135
+ catalog: Awaited<ReturnType<typeof restCatalogConnect>>;
136
+ /** icebird S3 resolver, passed as `{ resolver }` to icebird write fns. */
137
+ resolver: ReturnType<typeof s3SignedResolver>;
138
+ /** The namespace the fact tables live under. */
139
+ namespace: string;
140
+ }
141
+ /**
142
+ * Build the icebird `Schema` for one of the 5 fact tables from the frozen
143
+ * `ICEBERG_SCHEMAS` contract. Field ids are advisory — R2 Data Catalog
144
+ * re-assigns them on `createTable` (see `ICEBERG_FIELD_ID_BASE`).
145
+ */
146
+ declare function icebergSchemaFor(table: IcebergTableName): IcebergSchema;
147
+ /**
148
+ * Build the icebird `PartitionSpec` for one of the 5 fact tables: the locked
149
+ * spec `identity(site_id) + identity(search_type) + month(date)`. Each
150
+ * partition field's `source-id` is resolved to the real column field id from
151
+ * {@link icebergSchemaFor}.
152
+ */
153
+ declare function icebergPartitionSpecFor(table: IcebergTableName): IcebergPartitionSpec;
154
+ /**
155
+ * Connect to the R2 Data Catalog: a REST catalog context + a signed S3
156
+ * resolver. Runs in Node and in `workerd` — SigV4 is Web Crypto, I/O is
157
+ * `fetch`, no node builtins.
158
+ */
159
+ declare function connectIcebergCatalog(config: IcebergCatalogConfig): Promise<IcebergConnection>;
160
+ /** Tunable retry policy for {@link icebergAppendRetrying}. */
161
+ interface CommitRetryOptions {
162
+ /** Total attempts, including the first. Default 6. */
163
+ maxAttempts?: number;
164
+ /** Base (ms) for the exponential back-off ceiling. Default 1000. */
165
+ baseDelayMs?: number;
166
+ /** Hard cap (ms) on the back-off ceiling. Default 20_000. */
167
+ maxDelayMs?: number;
168
+ /** Injectable sleep — tests pass a synchronous no-op. */
169
+ sleep?: (ms: number) => Promise<void>;
170
+ /** Injectable RNG for the jitter — tests pass a deterministic value. */
171
+ random?: () => number;
172
+ }
173
+ /**
174
+ * True when `err` is an R2 Data Catalog commit rate-limit response
175
+ * (`429 too many commits to this table`). Matches a numeric `status` of 429
176
+ * or the message text, so it holds whether icebird surfaces the raw HTTP
177
+ * error or a wrapped `Error`.
178
+ */
179
+ declare function isCommitRateLimited(err: unknown): boolean;
180
+ /**
181
+ * `icebergAppend` wrapped with retry on R2 Data Catalog 429 commit
182
+ * rate-limits, using full-jitter exponential back-off. icebird already
183
+ * retries 412/409 internally; 429 is the gap this closes. Non-429 errors
184
+ * (and 429s that survive every attempt) propagate unchanged.
185
+ *
186
+ * RESIDUAL RISK — re-upload orphans. icebird's `icebergAppend` prepares the
187
+ * data + manifest files ONCE, outside its internal 412/409 retry loop, so
188
+ * those retries never re-upload data. A 429 that escapes that loop and is
189
+ * retried HERE re-runs the whole `icebergAppend` call, which re-prepares and
190
+ * re-uploads the data files; the previous attempt's parquet objects become
191
+ * orphans (referenced by no snapshot). 429s should be rare and clear within
192
+ * an attempt or two, so orphan volume is small, and R2 orphan-file cleanup
193
+ * reclaims them. Eliminating the 429 source entirely (per-table commit
194
+ * coalescing) is assessed in the Phase-1.5 report.
195
+ */
196
+ declare function icebergAppendRetrying(args: Parameters<typeof icebergAppend>[0], options?: CommitRetryOptions): Promise<void>;
197
+ /** Outcome of a single table create/drop. */
198
+ interface IcebergTableOpResult {
199
+ table: string;
200
+ ok: boolean;
201
+ /** Present when `ok` is false. */
202
+ error?: string;
203
+ }
204
+ /**
205
+ * Ensure the catalog namespace exists. Idempotent — an "already exists"
206
+ * response from the REST catalog is swallowed.
207
+ */
208
+ declare function ensureIcebergNamespace(conn: IcebergConnection): Promise<void>;
209
+ /**
210
+ * Create the global Iceberg fact tables with the locked partition spec
211
+ * (`identity(site_id) + identity(search_type) + month(date)`) and the schema
212
+ * derived from {@link ICEBERG_SCHEMAS}. Per-table errors are captured rather
213
+ * than thrown so a partial run is observable; "table already exists" surfaces
214
+ * as a failed result. Used by the app's one-off provisioning script.
215
+ */
216
+ declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[]): Promise<IcebergTableOpResult[]>;
217
+ /** List the table names currently in the catalog namespace. */
218
+ declare function listIcebergTables(conn: IcebergConnection): Promise<string[]>;
219
+ /**
220
+ * Drop tables from the catalog namespace, purging their data objects.
221
+ * Defaults to every table currently in the namespace — used to clear the
222
+ * wrong-spec Pipelines-provisioned `gsc.*` tables before re-creating them.
223
+ */
224
+ declare function dropIcebergTables(conn: IcebergConnection, tables?: readonly string[]): Promise<IcebergTableOpResult[]>;
225
+ /**
226
+ * Identifies one fact slice — the atomic unit a sink emits.
227
+ * `(table, site, searchType, date)`. `userId` rides along on `ctx` for
228
+ * tenant-scoped sinks (local/in-memory); the prod Iceberg table is global
229
+ * and keys only on `siteId`.
230
+ */
231
+ interface SinkSlice {
232
+ ctx: TenantCtx$1;
233
+ table: IcebergTableName;
234
+ /** GSC search-type partition. */
235
+ searchType: SearchType;
236
+ /** Calendar day (PT), `YYYY-MM-DD`. Determines the slice's `month(date)` partition. */
237
+ date: string;
238
+ }
239
+ /**
240
+ * Outcome of a sink write. `rowCount` is the number of rows accepted;
241
+ * `bytes` is best-effort — `IcebergAppendSink` does not report it (undefined there).
242
+ */
243
+ interface SinkWriteResult {
244
+ rowCount: number;
245
+ bytes?: number;
246
+ }
247
+ /**
248
+ * Static description of a sink. All sinks are append-only under the v5
249
+ * stability-cutoff model; `appendOnly` is therefore always `true` and kept
250
+ * only as an explicit, self-documenting marker.
251
+ */
252
+ interface SinkCapabilities {
253
+ /** Always `true` — re-emitting a slice accumulates duplicate rows. */
254
+ appendOnly: true;
255
+ }
256
+ /**
257
+ * Outcome of `Sink.close()` — which tables' buffered rows reached durable
258
+ * storage and which failed.
259
+ *
260
+ * `IcebergAppendSink.emit` only BUFFERS; the durable Iceberg commit happens
261
+ * in `close()`, one `icebergAppend()` per table. The ingest ledger
262
+ * (`sinkAsIngestEngine`) records a `(site, table, searchType, date)` slice
263
+ * ONLY after the table holding it appears in `flushed` — a table in `failed`
264
+ * leaves its slices un-recorded so the next sync re-emits them. This is what
265
+ * keeps the D1 ledger from ever running ahead of Iceberg.
266
+ */
267
+ interface SinkCloseResult {
268
+ /** Tables whose buffered rows committed durably. */
269
+ flushed: IcebergTableName[];
270
+ /** Tables whose flush failed — their slices must NOT be ledger-recorded. */
271
+ failed: {
272
+ table: IcebergTableName;
273
+ error: string;
274
+ }[];
275
+ }
276
+ interface Sink {
277
+ readonly capabilities: SinkCapabilities;
278
+ /**
279
+ * Emit the fact rows for one slice. Append semantics — for `IcebergAppendSink`
280
+ * this only buffers the rows; the Iceberg commit happens in `close()`. Re-emitting the same slice produces
281
+ * DUPLICATE rows; exactly-once is enforced upstream by the D1 ingested-days
282
+ * ledger, which only calls `emit` for a slice once.
283
+ *
284
+ * `rows` carry the table's data columns; the sink injects the partition
285
+ * identity columns (`site_id`, `search_type`) from `slice` — callers MUST
286
+ * NOT pre-populate them.
287
+ */
288
+ emit: (slice: SinkSlice, rows: readonly Row$1[]) => Promise<SinkWriteResult>;
289
+ /**
290
+ * Flush any buffered rows and release resources, returning which tables
291
+ * reached durable storage. Idempotent — a second `close()` after a flush
292
+ * reports empty `flushed`/`failed` (nothing left buffered).
293
+ *
294
+ * `IcebergAppendSink` buffers in `emit` and commits one `icebergAppend()`
295
+ * per table HERE; `flushed`/`failed` reflect those per-table commits.
296
+ * In-memory / local sinks write durably in `emit`, so they report every
297
+ * table they received rows for as `flushed`.
298
+ */
299
+ close: () => Promise<SinkCloseResult>;
300
+ }
301
+ /** Construction options shared by all sink implementations. */
302
+ interface SinkOptions {
303
+ now?: () => number;
304
+ }
305
+ /**
306
+ * `IcebergAppendSink`-specific options — the R2 Data Catalog coordinates the
307
+ * sink appends to via `icebird`. The catalog config shape (`IcebergCatalogConfig`)
308
+ * is type-only here so this contract file stays implementation-free.
309
+ */
310
+ interface IcebergAppendSinkOptions extends SinkOptions {
311
+ /** R2 Data Catalog connection config (catalog URI, warehouse, namespace, token, S3 creds). */
312
+ catalog: IcebergCatalogConfig;
313
+ /**
314
+ * Retry policy for the per-table `icebergAppend()` commit, applied on R2
315
+ * Data Catalog 429 ("too many commits") rate-limits. Optional — production
316
+ * uses the defaults; tests inject a synchronous `sleep`.
317
+ */
318
+ commitRetry?: CommitRetryOptions;
319
+ }
320
+ /** `LocalIcebergSink` options — points at the local Iceberg REST catalog. */
321
+ interface LocalIcebergSinkOptions extends SinkOptions {
322
+ /** Iceberg REST catalog URI (POC: `apache/iceberg-rest-fixture`). */
323
+ catalogUri: string;
324
+ /** Catalog namespace the 5 tables live under. */
325
+ namespace: string;
326
+ /** S3-compatible warehouse location (POC: MinIO). */
327
+ warehouse: string;
328
+ }
329
+ export { ICEBERG_PARTITION_SPEC as A, icebergAppendRetrying as C, listIcebergTables as D, isCommitRateLimited as E, IcebergPartitionField as F, IcebergPartitionTransform as I, IcebergTableName as L, ICEBERG_TABLES as M, IcebergColumn as N, ICEBERG_FIELD_ID_BASE as O, IcebergColumnType as P, IcebergTableSpec as R, ensureIcebergNamespace as S, icebergSchemaFor as T, IcebergSchemaField as _, SinkCloseResult as a, createIcebergTables as b, SinkWriteResult as c, IcebergConnection as d, IcebergPartitionSpec as f, IcebergSchema as g, IcebergS3Config as h, SinkCapabilities as i, ICEBERG_SCHEMAS as j, ICEBERG_PARTITION_COLUMNS as k, CommitRetryOptions as l, IcebergPrimitiveType as m, LocalIcebergSinkOptions as n, SinkOptions as o, IcebergPartitionSpecField as p, Sink as r, SinkSlice as s, IcebergAppendSinkOptions as t, IcebergCatalogConfig as u, IcebergTableOpResult as v, icebergPartitionSpecFor as w, dropIcebergTables as x, connectIcebergCatalog as y, icebergTableSpec as z };
@@ -395,8 +395,8 @@ interface QueryExecuteOptions {
395
395
  /**
396
396
  * Per-placeholder table identity. Used by the executor to emit a
397
397
  * schema-correct empty fallback when a named file set is empty: an
398
- * `extraFiles` placeholder against `page_keywords` should fall back to
399
- * the page_keywords schema, not the analyzer's primary `table`.
398
+ * `extraFiles` placeholder against `page_queries` should fall back to
399
+ * the page_queries schema, not the analyzer's primary `table`.
400
400
  */
401
401
  placeholderTables?: Record<string, TableName>;
402
402
  dataSource: DataSource;
@@ -512,8 +512,8 @@ interface StorageEngine {
512
512
  /**
513
513
  * GDPR URL-matcher purge. Deletes rows whose `url` column matches one of
514
514
  * `urls` across every live parquet entry for the tenant in tables that
515
- * carry a `url` column (`pages`, `page_keywords`). Tables without a `url`
516
- * column (`keywords`, `countries`, `devices`, `search_appearance`) are
515
+ * carry a `url` column (`pages`, `page_queries`). Tables without a `url`
516
+ * column (`queries`, `countries`, `dates`, `search_appearance`) are
517
517
  * untouched — they never store per-URL data.
518
518
  *
519
519
  * For each affected entry the engine reads the file, filters the matching
@@ -1,10 +1,10 @@
1
1
  import { arrowToRows } from "../arrow-utils.mjs";
2
2
  import { createRequire } from "node:module";
3
- import { unlinkSync } from "node:fs";
4
- import { tmpdir } from "node:os";
5
3
  import { join } from "node:path";
6
4
  import process from "node:process";
7
5
  import { fileURLToPath } from "node:url";
6
+ import { unlinkSync } from "node:fs";
7
+ import { tmpdir } from "node:os";
8
8
  import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
9
9
  const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
10
10
  let singleton = null;
@@ -37,12 +37,12 @@ function compareValues(a, b) {
37
37
  if (typeof a === "number" && typeof b === "number") return a - b;
38
38
  return String(a) < String(b) ? -1 : 1;
39
39
  }
40
- function sortRowsBySortKey(table, rows) {
41
- const sortKey = TABLE_METADATA[table].sortKey;
42
- if (sortKey.length === 0 || rows.length <= 1) return rows;
40
+ function sortRowsByClusterKey(table, rows) {
41
+ const clusterKey = TABLE_METADATA[table].clusterKey;
42
+ if (clusterKey.length === 0 || rows.length <= 1) return rows;
43
43
  const copy = rows.slice();
44
44
  copy.sort((a, b) => {
45
- for (const col of sortKey) {
45
+ for (const col of clusterKey) {
46
46
  const cmp = compareValues(a[col], b[col]);
47
47
  if (cmp !== 0) return cmp;
48
48
  }
@@ -52,7 +52,7 @@ function sortRowsBySortKey(table, rows) {
52
52
  }
53
53
  function encodeRowsToParquet(table, rows) {
54
54
  const schema = SCHEMAS[table];
55
- const sorted = sortRowsBySortKey(table, rows);
55
+ const sorted = sortRowsByClusterKey(table, rows);
56
56
  const buffer = parquetWriteBuffer({
57
57
  columnData: schema.columns.map((col) => {
58
58
  const type = basicTypeFor(col.type);
package/dist/index.d.mts CHANGED
@@ -1,8 +1,9 @@
1
1
  import { A as SyncStateKind, B as hourPartition, C as Row, D as SyncState, E as StorageEngine, F as WatermarkFilter, G as RAW_DAILY_COMPACT_THRESHOLD, H as inferSearchType, I as WatermarkScope, J as enumeratePartitions, K as countRawDailies, L as WriteCtx, M as TableName, N as TenantCtx, O as SyncStateDetail, P as Watermark, R as WriteResult, S as QueryResult, T as SearchType, U as objectKey, V as inferLegacyTier, W as CompactionThresholds, Y as splitOverlappingTiers, _ as PurgeUrlsResult, a as EngineOptions, b as QueryExecuteResult, c as Grain, d as ManifestEntry, f as ManifestPurgeResult, g as PurgeResult, h as PurgeFilter, i as DataSource, j as SyncStateScope, k as SyncStateFilter, l as ListLiveFilter, m as ParquetCodec, n as CompactionTier, o as FileSetRef, p as ManifestStore, q as dedupeOverlappingTiers, r as DEFAULT_SEARCH_TYPE, s as GcCtx, t as CodecCtx, u as LockScope, v as QueryCtx, w as RunSQLOptions, x as QueryExecutor, y as QueryExecuteOptions, z as dayPartition } from "./_chunks/storage.mjs";
2
2
  import { a as createDuckDBExecutor, i as createDuckDBCodec, n as DuckDBHandle, r as canonicalEmptyParquetSchema, t as DuckDBFactory } from "./_chunks/duckdb.mjs";
3
- import { _ as hourly_pages, a as allTables, b as pages, c as dimensionToColumn, f as DrizzleSchema, g as drizzleSchema, h as devices, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, t as ColumnDef, v as keywords, y as page_keywords } from "./_chunks/schema.mjs";
3
+ import { _ as hourly_pages, a as allTables, b as queries, c as dimensionToColumn, f as DrizzleSchema, g as drizzleSchema, h as dates, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, t as ColumnDef, v as page_queries, y as pages } from "./_chunks/schema.mjs";
4
4
  import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
5
- import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
5
+ import { A as ICEBERG_PARTITION_SPEC, C as icebergAppendRetrying, D as listIcebergTables, E as isCommitRateLimited, F as IcebergPartitionField, I as IcebergPartitionTransform, L as IcebergTableName, M as ICEBERG_TABLES, N as IcebergColumn, O as ICEBERG_FIELD_ID_BASE, P as IcebergColumnType, R as IcebergTableSpec, S as ensureIcebergNamespace, T as icebergSchemaFor, _ as IcebergSchemaField, a as SinkCloseResult, b as createIcebergTables, c as SinkWriteResult, d as IcebergConnection, f as IcebergPartitionSpec, g as IcebergSchema, h as IcebergS3Config, i as SinkCapabilities, j as ICEBERG_SCHEMAS, k as ICEBERG_PARTITION_COLUMNS, l as CommitRetryOptions, m as IcebergPrimitiveType, n as LocalIcebergSinkOptions, o as SinkOptions, p as IcebergPartitionSpecField, r as Sink, s as SinkSlice, t as IcebergAppendSinkOptions, u as IcebergCatalogConfig, v as IcebergTableOpResult, w as icebergPartitionSpecFor, x as dropIcebergTables, y as connectIcebergCatalog, z as icebergTableSpec } from "./_chunks/sink.mjs";
6
+ import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
6
7
  import { a as substituteNamedFiles, i as resolveParquetSQL, n as ResolvedQuery, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
7
8
  import { rebuildDailyFromHourly } from "./rollups.mjs";
8
9
  import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
@@ -122,16 +123,47 @@ interface CreateIngestAccumulatorOptions extends RowAccumulatorOptions {
122
123
  }
123
124
  declare function createNoopIngestAccumulator(): IngestAccumulator;
124
125
  declare function createIngestAccumulator(opts: CreateIngestAccumulatorOptions): IngestAccumulator;
125
- type SyncTableName = Extract<TableName$1, 'pages' | 'keywords' | 'countries' | 'devices' | 'page_keywords'>;
126
+ type IcebergAppendSink = Sink;
127
+ /**
128
+ * Create an `IcebergAppendSink` over the R2 Data Catalog.
129
+ *
130
+ * `emit` buffers; `close()` commits one `icebergAppend()` per table touched.
131
+ * The catalog connection (REST context + signed S3 resolver) is established
132
+ * lazily on the first flush and reused — a sink that is opened and closed
133
+ * with no rows never touches the network.
134
+ */
135
+ declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
136
+ /** A row as stored by the fake — data columns plus the injected identity columns. */
137
+ type StoredRow = Row & {
138
+ site_id: string;
139
+ search_type: string;
140
+ };
141
+ interface InMemorySink extends Sink {
142
+ /** All rows accepted, in partition order. */
143
+ readonly rows: readonly StoredRow[];
144
+ /** Rows for one table. */
145
+ rowsFor: (table: IcebergTableName) => StoredRow[];
146
+ /** Rows for one exact slice partition. */
147
+ rowsForSlice: (slice: SinkSlice) => StoredRow[];
148
+ /** `true` once `close()` has run. */
149
+ readonly closed: boolean;
150
+ /** Drop everything — handy between test cases. */
151
+ reset: () => void;
152
+ }
153
+ /**
154
+ * Create an in-memory append-only `Sink`.
155
+ */
156
+ declare function createInMemorySink(): InMemorySink;
157
+ type SyncTableName = Extract<TableName$1, 'pages' | 'queries' | 'countries' | 'page_queries' | 'dates'>;
126
158
  declare const TABLES_BY_SEARCH_TYPE: Record<SearchType, readonly SyncTableName[]>;
127
159
  declare function parseEnabledSearchTypes(raw: string | null | undefined): SearchType[];
128
160
  declare function validateEnabledSearchTypes(value: unknown): SearchType[];
129
161
  declare const TABLE_TIERS: {
130
162
  readonly pages: "critical";
131
- readonly keywords: "critical";
163
+ readonly queries: "critical";
132
164
  readonly countries: "standard";
133
- readonly devices: "standard";
134
- readonly page_keywords: "extended";
165
+ readonly dates: "standard";
166
+ readonly page_queries: "extended";
135
167
  };
136
168
  type TieredTableName = keyof typeof TABLE_TIERS;
137
169
  type TableTier = 'critical' | 'standard' | 'extended';
@@ -147,4 +179,4 @@ declare const MIN_SYNC_IMPRESSIONS = 1;
147
179
  declare const MIN_COUNTRY_IMPRESSIONS = 10; // NOTE(review): presumably an impressions floor applied to country rows; where it is enforced is not visible here
148
180
  declare const MAX_SITEMAP_URLS_PER_SITE = 50000; // NOTE(review): presumably a per-site cap on sitemap URLs; enforcement is not visible here
149
181
  declare const MAX_TRACKED_URLS_PER_SITE = 200000; // NOTE(review): presumably a per-site cap on tracked URLs; enforcement is not visible here
150
- export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIngestAccumulator, createNoopIngestAccumulator, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, dedupeOverlappingTiers, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, 
getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, parseEnabledSearchTypes, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
182
+ export { type CodecCtx, type ColumnDef, type ColumnType, type CommitRetryOptions, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LocalIcebergSinkOptions, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type 
SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createRowAccumulator, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };