@gscdump/engine 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
+ import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
1
2
  import { SCHEMAS, currentSchemaVersion, dedupeByNaturalKey } from "./schema.mjs";
2
- import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./storage.mjs";
3
3
  import { compactTieredImpl, compileLogicalQueryPlan, dedupeOverlappingTiers, substituteNamedFiles } from "./parquet-plan.mjs";
4
4
  import { sqlEscape } from "../sql-bind.mjs";
5
5
  import { buildLogicalPlan } from "gscdump/query/plan";
6
- import { normalizeUrl } from "gscdump/normalize";
6
+ import { normalizeUrl } from "gscdump";
7
7
  async function encodeBytes(db, table, rows) {
8
8
  const inName = db.makeTempPath("json");
9
9
  const outName = db.makeTempPath("parquet");
@@ -601,4 +601,4 @@ function createStorageEngine(opts) {
601
601
  readObject: (key) => dataSource.read(key)
602
602
  };
603
603
  }
604
- export { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine, gcOrphansImpl };
604
+ export { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine };
@@ -1,8 +1,8 @@
1
+ import { dayPartition, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, weekPartition } from "./layout.mjs";
1
2
  import { currentSchemaVersion, dimensionToColumn } from "./schema.mjs";
2
- import { dayPartition, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, weekPartition } from "./storage.mjs";
3
3
  import { METRIC_EXPR, escapeLike, topLevelPagePredicateSql } from "../sql-fragments.mjs";
4
- import { MS_PER_DAY } from "gscdump";
5
4
  import { buildLogicalPlan } from "gscdump/query/plan";
5
+ import { MS_PER_DAY } from "gscdump";
6
6
  const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
7
7
  const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
8
8
  const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
@@ -12,10 +12,6 @@ const DEFAULT_THRESHOLDS = {
12
12
  d7: 30,
13
13
  d30: 90
14
14
  };
15
- const RAW_DAILY_COMPACT_THRESHOLD = 7;
16
- function countRawDailies(entries) {
17
- return entries.filter((e) => e.tier === "raw" || e.tier == null && e.partition.startsWith("daily/")).length;
18
- }
19
15
  const PENDING_WINDOW_DAYS = 4;
20
16
  const STAGES = [
21
17
  {
@@ -381,4 +377,4 @@ function substituteNamedFiles(sql, sets) {
381
377
  for (const [name, keys] of Object.entries(sets)) out = out.replace(new RegExp(`\\{\\{${name}\\}\\}`, "g"), fileList(keys));
382
378
  return out;
383
379
  }
384
- export { FILES_PLACEHOLDER, RAW_DAILY_COMPACT_THRESHOLD, compactTieredImpl, compileLogicalQueryPlan, countRawDailies, dedupeOverlappingTiers, enumeratePartitions, resolveParquetSQL, splitOverlappingTiers, substituteNamedFiles };
380
+ export { FILES_PLACEHOLDER, compactTieredImpl, compileLogicalQueryPlan, dedupeOverlappingTiers, enumeratePartitions, resolveParquetSQL, substituteNamedFiles };
@@ -2,9 +2,9 @@ import { SCHEMAS, drizzleSchema } from "./schema.mjs";
2
2
  import { enumeratePartitions } from "./parquet-plan.mjs";
3
3
  import { escapeLike } from "../sql-fragments.mjs";
4
4
  import "../planner.mjs";
5
- import { PgDialect, pgTable, varchar } from "drizzle-orm/pg-core";
6
5
  import { UnresolvableDatasetError, buildLogicalComparisonPlan, buildLogicalPlan, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
7
- import { normalizeUrl } from "gscdump/normalize";
6
+ import { normalizeUrl } from "gscdump";
7
+ import { PgDialect, pgTable, varchar } from "drizzle-orm/pg-core";
8
8
  import { sql } from "drizzle-orm";
9
9
  const DIMENSION_SURFACES = {
10
10
  page: ["api", "stored"],
@@ -68,5 +68,12 @@ function icebergTableSpec(table) {
68
68
  };
69
69
  }
70
70
  const ICEBERG_SCHEMAS = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t)]));
71
- new Set(ICEBERG_TABLES);
72
- export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, icebergTableSpec };
71
/** Membership index over `ICEBERG_TABLES`, built once when the module loads. */
const ICEBERG_TABLE_SET = new Set(ICEBERG_TABLES);

/**
 * Report whether `table` is one of the names listed in `ICEBERG_TABLES`.
 *
 * @param table Candidate table name.
 * @returns `true` when the name is present in the canonical set.
 */
function isIcebergTable(table) {
	const known = ICEBERG_TABLE_SET.has(table);
	return known;
}

/**
 * Pass `table` through unchanged when it is a canonical Iceberg table name.
 *
 * @param table Candidate table name.
 * @returns The same `table` value.
 * @throws {Error} When the name is not in `ICEBERG_TABLES`; the message lists
 *   every accepted name.
 */
function assertIcebergTable(table) {
	if (isIcebergTable(table)) return table;
	throw new Error(`Unknown Iceberg table '${table}'. Expected one of: ${ICEBERG_TABLES.join(", ")}`);
}
79
+ export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable };
@@ -94,6 +94,16 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
94
94
  declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
95
95
  /** All Iceberg table specs, keyed by table name. */
96
96
  declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
97
+ /** True when `table` is one of the canonical {@link ICEBERG_TABLES}. */
98
+ declare function isIcebergTable(table: string): table is IcebergTableName;
99
+ /**
100
+ * Narrow an arbitrary table name to a canonical {@link IcebergTableName},
101
+ * throwing a clear error otherwise. Guards write paths that index
102
+ * `ICEBERG_SCHEMAS` (a `Record<IcebergTableName, …>`) — a non-canonical name
103
+ * silently yields `undefined` there, propagating a corrupt/empty spec into the
104
+ * Iceberg job instead of failing loudly.
105
+ */
106
+ declare function assertIcebergTable(table: string): IcebergTableName;
97
107
  /** icebird's lowercase Iceberg primitive types (subset we use). */
98
108
  type IcebergPrimitiveType = 'string' | 'int' | 'long' | 'double' | 'date';
99
109
  /** A field in an icebird table `Schema`. */
@@ -370,4 +380,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
370
380
  /** S3-compatible warehouse location (POC: MinIO). */
371
381
  warehouse: string;
372
382
  }
373
- export { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, listIcebergDataFiles, listIcebergTables };
383
+ export { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
@@ -1,4 +1,4 @@
1
- import "./storage.mjs";
1
+ import "./layout.mjs";
2
2
  import { assertDimensionsSupported, getFilterDimensions, pgResolverAdapter, resolveToSQL } from "./resolver.mjs";
3
3
  import { runAnalyzerFromSource } from "./dispatch.mjs";
4
4
  function coerceRow(row) {
@@ -11,60 +11,7 @@ interface CompactionThresholds {
11
11
  d7?: number;
12
12
  d30?: number;
13
13
  }
14
- declare const RAW_DAILY_COMPACT_THRESHOLD = 7;
15
- declare function countRawDailies(entries: ReadonlyArray<{
16
- tier?: string | null;
17
- partition: string;
18
- }>): number;
19
14
  declare function enumeratePartitions(startDate: string, endDate: string): string[];
20
- /**
21
- * Split manifest entries into the set worth reading (`kept`) and the set whose
22
- * every covered day is already served by a finer-or-newer live entry
23
- * (`subsumed`).
24
- *
25
- * Tiered compaction (daily→weekly→monthly→quarterly) is meant to retire its
26
- * inputs, but coarse files can outlive their finer counterparts: a D1→R2
27
- * backfill writes daily files that compact to monthly while a later re-sync
28
- * writes fresh daily/weekly for the same dates, and same-partition re-writes
29
- * leave a stale prior version live. All stay live, the resolver unions every
30
- * live tier whose partition intersects the range, and `union_by_name` sums the
31
- * overlap — impressions/clicks double-count.
32
- *
33
- * Entries are walked finest-tier-first, newest-first within a tier, so a
34
- * coarse or stale file is dropped only when every day it covers is already
35
- * claimed. Subsumption is evaluated per searchType — a `web` monthly never
36
- * cancels a `discover` weekly, they cover disjoint data. Partial
37
- * month-boundary overlap (a weekly straddling two months alongside a kept
38
- * monthly) still double-counts those boundary days — eliminating that needs
39
- * per-file date predicates in the SQL, tracked separately. Unrecognised
40
- * partition shapes (`hourly/`, sidecar keys) are always kept.
41
- *
42
- * `queryRange` clamps every entry's day-span to the window the caller will
43
- * actually read. This is required when `entries` came from a partition-
44
- * filtered `listLive` (`runSQL` enumerates only the partitions intersecting
45
- * the query): a `monthly/2026-04` whose Apr 27-30 falls past the query end
46
- * must not be judged "unsubsumed" just because `weekly/2026-04-27` wasn't
47
- * enumerated — those out-of-window days are SQL-filtered to nothing anyway.
48
- * Omit `queryRange` when `entries` is the full manifest (e.g. analysis-sources).
49
- */
50
- declare function splitOverlappingTiers(entries: ManifestEntry[], queryRange?: {
51
- start: string;
52
- end: string;
53
- }): {
54
- kept: ManifestEntry[];
55
- subsumed: ManifestEntry[];
56
- };
57
- /** Entries worth reading — see {@link splitOverlappingTiers}. */
58
- declare function dedupeOverlappingTiers(entries: ManifestEntry[], queryRange?: {
59
- start: string;
60
- end: string;
61
- }): ManifestEntry[];
62
- /**
63
- * Default `searchType` for entries written before the field landed and for
64
- * sync paths that don't request a specific type. GSC's own default; the
65
- * vast majority of stored data is web-search.
66
- */
67
- declare const DEFAULT_SEARCH_TYPE: SearchType;
68
15
  interface WriteCtx extends TenantCtx {
69
16
  table: TableName;
70
17
  date?: string;
@@ -152,18 +99,6 @@ interface ManifestEntry {
152
99
  */
153
100
  searchType?: SearchType;
154
101
  }
155
- /**
156
- * Resolve the search type for an entry, defaulting legacy entries to `web`.
157
- * Use this anywhere code needs to bucket entries by searchType.
158
- */
159
- declare function inferSearchType(entry: Pick<ManifestEntry, 'searchType'>): SearchType;
160
- /**
161
- * Infer the tier for an entry that pre-dates the `tier` field. Daily files
162
- * are `raw`; monthly files are `d30`. Anything else (already migrated, or
163
- * a partition shape we haven't seen) returns undefined and the caller must
164
- * decide how to handle it.
165
- */
166
- declare function inferLegacyTier(entry: Pick<ManifestEntry, 'partition' | 'tier'>): CompactionTier | undefined;
167
102
  interface ListLiveFilter {
168
103
  userId: string;
169
104
  siteId?: string;
@@ -541,13 +476,4 @@ interface EngineOptions {
541
476
  executor: QueryExecutor;
542
477
  now?: () => number;
543
478
  }
544
- declare function dayPartition(date: string): string;
545
- /**
546
- * Hourly partition keyed by the PT calendar day (`YYYY-MM-DD`). One parquet
547
- * per day holds 24 hourly buckets — read-merge-write keeps `(url, hour)`
548
- * idempotency across retries. Names sort lexically alongside daily ones but
549
- * never collide because of the `hourly/` prefix.
550
- */
551
- declare function hourPartition(date: string): string;
552
- declare function objectKey(ctx: TenantCtx, table: TableName, partition: string, version: number, searchType?: SearchType): string;
553
- export { CodecCtx, CompactionThresholds, CompactionTier, DEFAULT_SEARCH_TYPE, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, RAW_DAILY_COMPACT_THRESHOLD, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, countRawDailies, dayPartition, dedupeOverlappingTiers, enumeratePartitions, hourPartition, inferLegacyTier, inferSearchType, objectKey, splitOverlappingTiers };
479
+ export { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, enumeratePartitions };
@@ -1,4 +1,4 @@
1
- import { inferLegacyTier, inferSearchType } from "../_chunks/storage.mjs";
1
+ import { inferLegacyTier, inferSearchType } from "../_chunks/layout.mjs";
2
2
  import { dirname, join, resolve } from "node:path";
3
3
  import { Buffer } from "node:buffer";
4
4
  import { randomBytes } from "node:crypto";
@@ -1,8 +1,8 @@
1
1
  import { createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "../_chunks/engine.mjs";
2
2
  import { createNodeDuckDBHandle, resetNodeDuckDB } from "./duckdb-node.mjs";
3
3
  import { createFilesystemDataSource, createFilesystemManifestStore } from "./filesystem.mjs";
4
+ import { encodeSiteId } from "gscdump";
4
5
  import path from "node:path";
5
- import { encodeSiteId } from "gscdump/tenant";
6
6
  function createNodeHarness(opts) {
7
7
  const dataDir = opts.dataDir;
8
8
  const userId = opts.userId ?? "local";
@@ -1,4 +1,4 @@
1
- import { inferLegacyTier, inferSearchType } from "../_chunks/storage.mjs";
1
+ import { inferLegacyTier, inferSearchType } from "../_chunks/layout.mjs";
2
2
  const SHARD_RE = /^u_[^/]+\/manifest\/(?<siteId>[^/]+)\/(?<table>[^/]+)\/HEAD$/;
3
3
  const CAS_BACKOFF_BASE_MS = 5;
4
4
  const CAS_BACKOFF_CAP_MS = 250;
@@ -0,0 +1,12 @@
1
+ import { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
2
+ type IcebergAppendSink = Sink;
3
+ /**
4
+ * Create an `IcebergAppendSink` over the R2 Data Catalog.
5
+ *
6
+ * `emit` buffers; `close()` commits one `icebergAppend()` per table touched.
7
+ * The catalog connection (REST context + signed S3 resolver) is established
8
+ * lazily on the first flush and reused — a sink that is opened and closed
9
+ * with no rows never touches the network.
10
+ */
11
+ declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
12
+ export { type CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
@@ -0,0 +1,269 @@
1
+ import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
2
+ import { icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "icebird";
3
/**
 * Translates the upper-case column type tags used by the table specs into the
 * lower-case primitive type names written into the Iceberg schema.
 */
const ICEBERG_TYPE_MAP = {
	STRING: "string",
	INT: "int",
	LONG: "long",
	DOUBLE: "double",
	DATE: "date"
};

/**
 * Build the Iceberg struct schema (schema id 0) for one table from its entry
 * in `ICEBERG_SCHEMAS`.
 *
 * @param table Canonical Iceberg table name.
 * @returns `{ type: "struct", "schema-id": 0, fields }` with one field per
 *   column (id, name, required flag, mapped primitive type).
 * @throws {Error} When `table` is not a canonical name. Without the guard an
 *   unknown name would index `ICEBERG_SCHEMAS` to `undefined` and surface as
 *   an opaque "cannot read properties of undefined" TypeError.
 */
function icebergSchemaFor(table) {
	const { columns } = ICEBERG_SCHEMAS[assertIcebergTable(table)];
	return {
		"type": "struct",
		"schema-id": 0,
		"fields": columns.map((col) => ({
			id: col.fieldId,
			name: col.name,
			required: col.required,
			type: ICEBERG_TYPE_MAP[col.type]
		}))
	};
}
22
/**
 * Build the Iceberg partition spec (spec id 0) for one table by resolving each
 * entry of `ICEBERG_PARTITION_SPEC` to the field id of its source column.
 *
 * @param table Canonical Iceberg table name.
 * @returns `{ "spec-id": 0, fields }` with one partition field per spec entry.
 * @throws {Error} When `table` is not a canonical name, or when the table has
 *   no column matching a partition source column.
 */
function icebergPartitionSpecFor(table) {
	// Validate first: an unknown name would otherwise index ICEBERG_SCHEMAS to
	// `undefined` and fail with an uninformative TypeError.
	const columns = ICEBERG_SCHEMAS[assertIcebergTable(table)].columns;
	const fieldId = (name) => {
		const col = columns.find((c) => c.name === name);
		if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
		return col.fieldId;
	};
	return {
		"spec-id": 0,
		"fields": ICEBERG_PARTITION_SPEC.map((p, i) => ({
			"source-id": fieldId(p.sourceColumn),
			// Partition field ids are numbered from 1000 upward, in spec order.
			"field-id": 1e3 + i,
			"name": p.name,
			"transform": p.transform
		}))
	};
}
39
/**
 * Open a connection bundle for the Iceberg REST catalog.
 *
 * The REST catalog handshake is awaited first; the signed S3 resolver is only
 * constructed once that has succeeded.
 *
 * @param config Catalog settings: `catalogUri`, `warehouse`, `catalogToken`,
 *   `namespace`, and `s3` credentials (`accessKeyId`, `secretAccessKey`,
 *   `endpoint`, optional `region`).
 * @returns `{ catalog, resolver, namespace }`.
 */
async function connectIcebergCatalog(config) {
	const catalog = await restCatalogConnect({
		url: config.catalogUri,
		warehouse: config.warehouse,
		requestInit: { headers: { Authorization: `Bearer ${config.catalogToken}` } }
	});
	const { s3 } = config;
	const resolver = s3SignedResolver({
		accessKeyId: s3.accessKeyId,
		secretAccessKey: s3.secretAccessKey,
		// The region falls back to "auto" when none is configured.
		region: s3.region ?? "auto",
		endpoint: s3.endpoint,
		pathStyle: true
	});
	return {
		catalog,
		resolver,
		namespace: config.namespace
	};
}
56
/**
 * Decide whether an error from a catalog commit looks like rate limiting and
 * is therefore worth retrying.
 *
 * An error qualifies when it is an object with `status === 429`, or when its
 * message (case-insensitive) contains a standalone `429`, "too many commits",
 * or "rate limit".
 *
 * @param err Any thrown or rejected value.
 * @returns `true` when the error looks like a rate-limit response.
 */
function isCommitRateLimited(err) {
	const isObject = err !== null && typeof err === "object";
	if (isObject && err.status === 429) return true;
	// Prefer a string `message` on any object (not only Error instances);
	// String() of a plain object would collapse to "[object Object]".
	const text = isObject && typeof err.message === "string" ? err.message : String(err);
	const msg = text.toLowerCase();
	// Require 429 to stand alone as a number so ids such as "14290" do not
	// trigger pointless retries of a non-retryable failure.
	return /(?<!\d)429(?!\d)/.test(msg) || msg.includes("too many commits") || msg.includes("rate limit");
}
61
/** Default delay used between commit retries: resolve after `ms` milliseconds. */
function defaultCommitSleep(ms) {
	return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Run `icebergAppend(args)`, retrying rate-limited failures with jittered
 * exponential backoff.
 *
 * At least one attempt is always made. A `maxAttempts` of zero, a negative
 * number or NaN is treated as 1; otherwise the loop could finish without ever
 * committing and the caller would see a successful return for data that was
 * never written. Failures that are not rate limits, and the failure on the
 * last attempt, are rethrown unchanged. A rejection whose reason is
 * `undefined` counts as a failure.
 *
 * @param args Arguments forwarded verbatim to `icebergAppend`.
 * @param options Retry tuning: `maxAttempts` (default 6), `baseDelayMs`
 *   (default 1000), `maxDelayMs` (default 20000), `sleep` (default
 *   `setTimeout`-based), `random` (default `Math.random`).
 * @returns Resolves with no value once an append succeeds.
 */
async function icebergAppendRetrying(args, options = {}) {
	const requestedAttempts = options.maxAttempts ?? 6;
	// `>= 1` is false for NaN, zero and negatives, so all of them collapse to 1.
	const maxAttempts = requestedAttempts >= 1 ? requestedAttempts : 1;
	const baseDelayMs = options.baseDelayMs ?? 1e3;
	const maxDelayMs = options.maxDelayMs ?? 2e4;
	const sleep = options.sleep ?? defaultCommitSleep;
	const random = options.random ?? Math.random;
	for (let attempt = 0; ; attempt++) {
		try {
			await icebergAppend(args);
			return;
		} catch (err) {
			// `attempt + 1 >= maxAttempts` also terminates on fractional limits.
			if (!isCommitRateLimited(err) || attempt + 1 >= maxAttempts) throw err;
		}
		// Full jitter: uniform in [0, min(maxDelayMs, baseDelayMs * 2^attempt)).
		const ceiling = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt);
		await sleep(Math.floor(random() * ceiling));
	}
}
78
/**
 * Best-effort creation of the connection's namespace in the REST catalog.
 *
 * Every rejection from the create call is ignored.
 * NOTE(review): presumably this is to tolerate "namespace already exists";
 * it also hides auth and network failures — confirm that is intended.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 */
async function ensureIcebergNamespace(conn) {
	const creation = restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace });
	try {
		await creation;
	} catch {
		// Deliberately ignored: the namespace only needs to exist afterwards.
	}
}
81
/**
 * Create the given Iceberg tables one after another and report a per-table
 * outcome instead of throwing.
 *
 * Building the schema and partition spec happens inside the `try`: a table
 * name with no spec makes those builders throw synchronously, and that must
 * be recorded as `ok: false` for that table rather than aborting the whole
 * batch and losing the results already collected.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 * @param tables Table names to create; defaults to `ICEBERG_TABLES`.
 * @returns One `{ table, ok: true }` or `{ table, ok: false, error }` entry
 *   per requested table, in request order.
 */
async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
	const results = [];
	for (const table of tables) {
		try {
			await icebergCreateTable({
				catalog: conn.catalog,
				namespace: conn.namespace,
				table,
				schema: icebergSchemaFor(table),
				partitionSpec: icebergPartitionSpecFor(table)
			});
			results.push({
				table,
				ok: true
			});
		} catch (e) {
			results.push({
				table,
				ok: false,
				error: String(e)
			});
		}
	}
	return results;
}
99
/**
 * List the table names in the connection's namespace, sorted ascending.
 *
 * A rejected listing yields an empty array rather than an error.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 * @returns Sorted table names, or `[]` when the catalog listing rejects.
 */
async function listIcebergTables(conn) {
	const listing = restCatalogListTables(conn.catalog, { namespace: conn.namespace });
	let tables;
	try {
		tables = await listing;
	} catch {
		return [];
	}
	const names = tables.map((t) => t.name);
	return names.sort();
}
102
/**
 * Enumerate every calendar month touched by an inclusive date range.
 *
 * Only the year and month parts of `range.start` / `range.end` are read.
 *
 * @param range `{ start, end }` as `YYYY-MM[-DD]` strings.
 * @returns `YYYY-MM` strings from the start month through the end month;
 *   empty when the end precedes the start.
 */
function monthsInRange(range) {
	const [startYear, startMonth] = range.start.split("-").map(Number);
	const [endYear, endMonth] = range.end.split("-").map(Number);
	const months = [];
	for (let year = startYear, month = startMonth; year < endYear || (year === endYear && month <= endMonth);) {
		months.push(`${year}-${String(month).padStart(2, "0")}`);
		const next = month + 1;
		if (next > 12) {
			// Roll over from December to January of the following year.
			month = 1;
			year += 1;
		} else {
			month = next;
		}
	}
	return months;
}
118
/**
 * Convert a `YYYY-MM` string to the number of whole months since 1970-01.
 *
 * @param ym Month as `YYYY-MM`.
 * @returns `0` for `1970-01`, `1` for `1970-02`, and so on.
 */
function monthsSinceEpoch(ym) {
	const parts = ym.split("-").map(Number);
	const year = parts[0];
	const month = parts[1];
	const wholeYears = year - 1970;
	return wholeYears * 12 + (month - 1);
}
122
/**
 * Drop the `s3://<bucket>/` prefix from a file path, leaving the object key.
 *
 * Paths without the `s3://` scheme are returned untouched. An `s3://` path
 * with no slash after the bucket returns the text after the scheme.
 *
 * @param filePath Path as recorded in the manifest.
 * @returns The path with scheme and bucket removed where present.
 */
function stripBucket(filePath) {
	const scheme = "s3://";
	if (!filePath.startsWith(scheme)) return filePath;
	const afterScheme = filePath.slice(scheme.length);
	const bucketEnd = afterScheme.indexOf("/");
	if (bucketEnd < 0) return afterScheme;
	return afterScheme.slice(bucketEnd + 1);
}
128
/**
 * List the live data files of one Iceberg table that belong to a given site,
 * search type and month range, by reading the table's manifests.
 *
 * @param conn Connection bundle providing `catalog`, `namespace`, `resolver`.
 * @param opts `{ table, siteId, searchType, range: { start, end } }`.
 * @returns One `{ filePath, objectKey, bytes, rowCount }` per matching file;
 *   empty when the table has no current snapshot.
 */
async function listIcebergDataFiles(conn, opts) {
	const loaded = await restCatalogLoadTable(conn.catalog, {
		namespace: conn.namespace,
		table: opts.table
	});
	const metadata = loaded.metadata;
	// A table that has never been committed to has no snapshot and no files.
	if (metadata["current-snapshot-id"] == null) return [];
	// Compared against the `date_month` partition value, so the requested
	// months are converted to month ordinals counted from 1970-01.
	const wantedMonths = new Set(monthsInRange(opts.range).map(monthsSinceEpoch));
	const manifests = await icebergManifests({
		metadata,
		resolver: conn.resolver
	});
	const isWanted = (entry) => {
		// NOTE(review): status 2 is "deleted" and content 0 is "data" per the
		// Iceberg manifest spec — confirm icebird surfaces the raw codes.
		if (entry.status === 2) return false;
		const file = entry.data_file;
		if (file.content !== 0) return false;
		const partition = file.partition;
		if (partition.site_id !== opts.siteId) return false;
		if (partition.search_type !== opts.searchType) return false;
		const month = partition.date_month;
		return typeof month === "number" && wantedMonths.has(month);
	};
	const out = [];
	for (const manifest of manifests) {
		for (const entry of manifest.entries) {
			if (!isWanted(entry)) continue;
			const file = entry.data_file;
			out.push({
				filePath: file.file_path,
				objectKey: stripBucket(file.file_path),
				bytes: Number(file.file_size_in_bytes),
				rowCount: Number(file.record_count)
			});
		}
	}
	return out;
}
158
/**
 * Drop Iceberg tables (with purge requested) one after another and report a
 * per-table outcome instead of throwing.
 *
 * @param conn Connection bundle providing `catalog` and `namespace`.
 * @param tables Table names to drop. When omitted (`null`/`undefined`), every
 *   table listed in the namespace is targeted; a rejected listing means no
 *   targets.
 * @returns One `{ table, ok: true }` or `{ table, ok: false, error }` entry
 *   per target, in order.
 */
async function dropIcebergTables(conn, tables) {
	let targets = tables;
	if (targets == null) {
		const listing = restCatalogListTables(conn.catalog, { namespace: conn.namespace });
		let listed;
		let listingFailed = false;
		try {
			listed = await listing;
		} catch {
			listingFailed = true;
		}
		targets = listingFailed ? [] : listed.map((t) => t.name);
	}
	const results = [];
	for (const table of targets) {
		const dropping = icebergDropTable({
			catalog: conn.catalog,
			namespace: conn.namespace,
			table,
			purgeRequested: true
		});
		let outcome;
		try {
			await dropping;
			outcome = {
				table,
				ok: true
			};
		} catch (e) {
			outcome = {
				table,
				ok: false,
				error: String(e)
			};
		}
		results.push(outcome);
	}
	return results;
}
176
/** Milliseconds in one UTC day. */
const DAY_MILLIS = 864e5;

/**
 * Convert a date value to whole days since 1970-01-01 (UTC).
 *
 * - `YYYY-MM-DD` strings are parsed as UTC midnight. The parsed instant must
 *   format back to the same string, so impossible calendar dates such as
 *   `2026-02-31` are rejected instead of being allowed to roll over into a
 *   different day.
 * - `Date` instances are floored to their UTC day.
 * - Anything else (already a day number, `null`, `undefined`) is returned
 *   unchanged.
 *
 * @param value Date string, `Date`, or an already-converted value.
 * @returns Day count since the epoch, or `value` itself for other types.
 * @throws {TypeError} For unparseable or impossible date strings and for
 *   invalid `Date` objects.
 */
function toIcebergDate(value) {
	if (typeof value === "string") {
		const ms = Date.parse(`${value}T00:00:00Z`);
		if (Number.isNaN(ms) || new Date(ms).toISOString().slice(0, 10) !== value) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
		return Math.floor(ms / DAY_MILLIS);
	}
	if (value instanceof Date) {
		const ms = value.getTime();
		if (Number.isNaN(ms)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
		return Math.floor(ms / DAY_MILLIS);
	}
	return value;
}

/**
 * Convert `bigint` values to `number`; every other value passes through.
 * NOTE(review): values beyond 2^53 lose precision in this conversion.
 */
function coerceJsonSafe(value) {
	return typeof value === "bigint" ? Number(value) : value;
}

/**
 * Turn the rows of a sink slice into records ready to append.
 *
 * Each record is a copy of the row's own enumerable properties (inherited
 * keys are not copied) with bigints coerced to numbers, `date` converted via
 * `toIcebergDate`, and `site_id` / `search_type` stamped from the slice.
 *
 * @param slice Slice descriptor; reads `slice.ctx.siteId` (defaults to `""`)
 *   and `slice.searchType`.
 * @param rows Source rows.
 * @returns A new array of new record objects; the input rows are not mutated.
 */
function toRecords(slice, rows) {
	const siteId = slice.ctx.siteId ?? "";
	return rows.map((row) => {
		// `row ?? {}` keeps a null/undefined row producing a bare record.
		const out = Object.fromEntries(Object.entries(row ?? {}).map(([key, value]) => [key, coerceJsonSafe(value)]));
		out.date = toIcebergDate(out.date);
		out.site_id = siteId;
		out.search_type = slice.searchType;
		return out;
	});
}
205
/**
 * Create an append-only sink that buffers rows per table and commits them to
 * the Iceberg catalog when closed.
 *
 * `emit` only buffers. `close` connects to the catalog (reusing a previous
 * connection attempt if one exists) and issues one retrying append per table
 * that has buffered records. Buffers are cleared after `close` whether the
 * commits succeeded or not.
 *
 * @param options `{ catalog, commitRetry }` — catalog connection settings and
 *   optional retry tuning passed to `icebergAppendRetrying`.
 * @returns A sink with `capabilities`, `emit(slice, rows)` and `close()`.
 */
function createIcebergAppendSink(options) {
	let connection;
	const buffers = /* @__PURE__ */ new Map();
	// Memoise the connection promise so repeated flushes share one handshake.
	const connect = () => {
		if (connection == null) connection = connectIcebergCatalog(options.catalog);
		return connection;
	};
	return {
		capabilities: { appendOnly: true },
		/** Convert `rows` to records and queue them under `slice.table`. */
		async emit(slice, rows) {
			if (rows.length === 0) return { rowCount: 0 };
			const records = toRecords(slice, rows);
			const pending = buffers.get(slice.table);
			if (pending) {
				// Push one by one rather than spreading a possibly huge array.
				for (const record of records) pending.push(record);
			} else {
				buffers.set(slice.table, records);
			}
			return { rowCount: records.length };
		},
		/** Commit every buffered table; resolves to `{ flushed, failed }`. */
		async close() {
			const flushed = [];
			const failed = [];
			if (buffers.size === 0) return {
				flushed,
				failed
			};
			const connecting = connect();
			let conn;
			try {
				conn = await connecting;
			} catch (err) {
				// Forget the rejected promise so a later close() reconnects.
				connection = void 0;
				const error = String(err);
				for (const [table, records] of buffers) {
					if (records.length > 0) failed.push({
						table,
						error
					});
				}
				buffers.clear();
				return {
					flushed,
					failed
				};
			}
			for (const [table, records] of buffers) {
				if (records.length === 0) continue;
				const committing = icebergAppendRetrying({
					catalog: conn.catalog,
					namespace: conn.namespace,
					table,
					resolver: conn.resolver,
					records
				}, options.commitRetry);
				let committed = true;
				let failure;
				try {
					await committing;
				} catch (err) {
					committed = false;
					failure = err;
				}
				if (committed) flushed.push(table);
				else failed.push({
					table,
					error: String(failure)
				});
			}
			buffers.clear();
			return {
				flushed,
				failed
			};
		}
	};
}
269
+ export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
package/dist/index.d.mts CHANGED
@@ -1,36 +1,20 @@
1
- import { CodecCtx, CompactionThresholds, CompactionTier, DEFAULT_SEARCH_TYPE, DataSource, EngineOptions, FileSetRef, GcCtx, Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, RAW_DAILY_COMPACT_THRESHOLD, Row, RunSQLOptions, SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, TableName, TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, countRawDailies, dayPartition, dedupeOverlappingTiers, enumeratePartitions, hourPartition, inferLegacyTier, inferSearchType, objectKey, splitOverlappingTiers } from "./_chunks/storage.mjs";
1
+ import { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, Row, RunSQLOptions, SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, TableName, TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, enumeratePartitions } from "./_chunks/storage.mjs";
2
2
  import { DuckDBFactory, DuckDBHandle, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor } from "./_chunks/duckdb.mjs";
3
3
  import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
4
4
  import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
5
- import { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, listIcebergDataFiles, listIcebergTables } from "./_chunks/sink.mjs";
5
+ import { IcebergTableName, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult } from "./_chunks/sink.mjs";
6
6
  import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
7
7
  import { FILES_PLACEHOLDER, ResolvedQuery, resolveParquetSQL, substituteNamedFiles } from "./_chunks/planner.mjs";
8
8
  import { createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "./_chunks/pg-adapter.mjs";
9
9
  import { rebuildDailyFromHourly } from "./rollups.mjs";
10
10
  import { ENGINE_QUERY_CAPABILITIES, createSqlQuerySource } from "./_chunks/index.mjs";
11
11
  import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
12
- import { Grain as Grain$1, Row as Row$1, TableName as TableName$1 } from "@gscdump/contracts";
12
+ import { Grain as Grain$1, Row as Row$1, TableName as TableName$1, TenantCtx as TenantCtx$1 } from "@gscdump/contracts";
13
+ import { SearchType as SearchType$1 } from "gscdump/query";
13
14
  declare function coerceRow(row: Row$1): Row$1;
14
15
  declare function coerceRows(rows: readonly Row$1[]): Row$1[];
15
16
  declare const MAX_DAY_BYTES: number;
16
17
  declare function createStorageEngine(opts: EngineOptions): StorageEngine;
17
- interface GcDeps {
18
- dataSource: DataSource;
19
- manifestStore: ManifestStore;
20
- }
21
- interface GcOptions {
22
- userId?: string;
23
- siteId?: string;
24
- /**
25
- * Retention for hourly partitions (`hourly/{date}`) in milliseconds.
26
- * Defaults to 90 days; entries with `createdAt < now - hourlyRetentionMs`
27
- * are retired and their bytes deleted alongside ordinary orphan sweeping.
28
- */
29
- hourlyRetentionMs?: number;
30
- }
31
- declare function gcOrphansImpl(deps: GcDeps, now: number, graceMs: number, opts?: GcOptions): Promise<{
32
- deleted: number;
33
- }>;
34
18
  interface IngestAccumulatorEngine {
35
19
  writeDay: (scope: TenantCtx & {
36
20
  table: TableName$1;
@@ -125,16 +109,33 @@ interface CreateIngestAccumulatorOptions extends RowAccumulatorOptions {
125
109
  }
126
110
  declare function createNoopIngestAccumulator(): IngestAccumulator;
127
111
  declare function createIngestAccumulator(opts: CreateIngestAccumulatorOptions): IngestAccumulator;
128
- type IcebergAppendSink = Sink;
112
+ declare function dayPartition(date: string): string;
113
+ /**
114
+ * Hourly partition keyed by the PT calendar day (`YYYY-MM-DD`). One parquet
115
+ * per day holds 24 hourly buckets — read-merge-write keeps `(url, hour)`
116
+ * idempotency across retries. Names sort lexically alongside daily ones but
117
+ * never collide because of the `hourly/` prefix.
118
+ */
119
+ declare function hourPartition(date: string): string;
120
+ /**
121
+ * Default `searchType` for entries written before the field landed and for
122
+ * sync paths that don't request a specific type. GSC's own default; the
123
+ * vast majority of stored data is web-search.
124
+ */
125
+ declare const DEFAULT_SEARCH_TYPE: SearchType$1;
126
+ declare function objectKey(ctx: TenantCtx$1, table: TableName$1, partition: string, version: number, searchType?: SearchType$1): string;
127
+ /**
128
+ * Resolve the search type for an entry, defaulting legacy entries to `web`.
129
+ * Use this anywhere code needs to bucket entries by searchType.
130
+ */
131
+ declare function inferSearchType(entry: Pick<ManifestEntry, 'searchType'>): SearchType$1;
129
132
  /**
130
- * Create an `IcebergAppendSink` over the R2 Data Catalog.
131
- *
132
- * `emit` buffers; `close()` commits one `icebergAppend()` per table touched.
133
- * The catalog connection (REST context + signed S3 resolver) is established
134
- * lazily on the first flush and reused — a sink that is opened and closed
135
- * with no rows never touches the network.
133
+ * Infer the tier for an entry that pre-dates the `tier` field. Daily files
134
+ * are `raw`; monthly files are `d30`. Anything else (already migrated, or
135
+ * a partition shape we haven't seen) returns undefined and the caller must
136
+ * decide how to handle it.
136
137
  */
137
- declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
138
+ declare function inferLegacyTier(entry: Pick<ManifestEntry, 'partition' | 'tier'>): CompactionTier | undefined;
138
139
  /** A row as stored by the fake — data columns plus the injected identity columns. */
139
140
  type StoredRow = Row & {
140
141
  site_id: string;
@@ -181,4 +182,4 @@ declare const MIN_SYNC_IMPRESSIONS = 1;
181
182
  declare const MIN_COUNTRY_IMPRESSIONS = 10;
182
183
  declare const MAX_SITEMAP_URLS_PER_SITE = 50000;
183
184
  declare const MAX_TRACKED_URLS_PER_SITE = 200000;
184
- export { type CodecCtx, type ColumnDef, type ColumnType, type CommitRetryOptions, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, ENGINE_QUERY_CAPABILITIES, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListIcebergDataFilesOptions, type ListLiveFilter, type LocalIcebergSinkOptions, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, 
type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergResolverAdapter, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergDataFiles, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
185
+ export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type CreateIngestAccumulatorOptions, DEFAULT_SEARCH_TYPE, type DataSource, type DateWeight, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, ENGINE_QUERY_CAPABILITIES, type EngineOptions, FILES_PLACEHOLDER, type FileSetRef, type FinalizeOptions, type FinalizeResult, type GcCtx, type Grain, type GscApiRow, type InMemorySink, type IngestAccumulator, type IngestAccumulatorCtx, type IngestAccumulatorEngine, type IngestAccumulatorHooks, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, ROW_LIMIT_R2, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type Sink, type SinkCapabilities, type SinkCloseResult, type SinkOptions, type SinkSlice, type SinkWriteResult, type StorageEngine, type StoredRow, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, type SyncTableName, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, type TableName, type TableSchema, type TableTier, type TenantCtx, type TieredTableName, WEIGHT_PRIORITY, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, 
createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
package/dist/index.mjs CHANGED
@@ -1,189 +1,14 @@
1
1
  import { ENGINE_QUERY_CAPABILITIES, coerceRow, coerceRows, createSqlQuerySource } from "./_chunks/source.mjs";
2
+ import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/layout.mjs";
2
3
  import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
3
- import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/storage.mjs";
4
- import { FILES_PLACEHOLDER, RAW_DAILY_COMPACT_THRESHOLD, countRawDailies, dedupeOverlappingTiers, enumeratePartitions, resolveParquetSQL, splitOverlappingTiers, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
4
+ import { FILES_PLACEHOLDER, enumeratePartitions, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
5
5
  import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
6
- import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine, gcOrphansImpl } from "./_chunks/engine.mjs";
7
- import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, icebergTableSpec } from "./_chunks/iceberg-schema.mjs";
6
+ import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "./_chunks/engine.mjs";
8
7
  import { assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
9
8
  import "./planner.mjs";
10
9
  import { createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "./_chunks/resolver.mjs";
11
10
  import { rebuildDailyFromHourly } from "./rollups.mjs";
12
11
  import { fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
13
- import { icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "icebird";
14
- const ICEBERG_TYPE_MAP = {
15
- STRING: "string",
16
- INT: "int",
17
- LONG: "long",
18
- DOUBLE: "double",
19
- DATE: "date"
20
- };
21
- function icebergSchemaFor(table) {
22
- return {
23
- "type": "struct",
24
- "schema-id": 0,
25
- "fields": ICEBERG_SCHEMAS[table].columns.map((col) => ({
26
- id: col.fieldId,
27
- name: col.name,
28
- required: col.required,
29
- type: ICEBERG_TYPE_MAP[col.type]
30
- }))
31
- };
32
- }
33
- function icebergPartitionSpecFor(table) {
34
- const fields = ICEBERG_SCHEMAS[table].columns;
35
- const fieldId = (name) => {
36
- const col = fields.find((c) => c.name === name);
37
- if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
38
- return col.fieldId;
39
- };
40
- return {
41
- "spec-id": 0,
42
- "fields": ICEBERG_PARTITION_SPEC.map((p, i) => ({
43
- "source-id": fieldId(p.sourceColumn),
44
- "field-id": 1e3 + i,
45
- "name": p.name,
46
- "transform": p.transform
47
- }))
48
- };
49
- }
50
- async function connectIcebergCatalog(config) {
51
- return {
52
- catalog: await restCatalogConnect({
53
- url: config.catalogUri,
54
- warehouse: config.warehouse,
55
- requestInit: { headers: { Authorization: `Bearer ${config.catalogToken}` } }
56
- }),
57
- resolver: s3SignedResolver({
58
- accessKeyId: config.s3.accessKeyId,
59
- secretAccessKey: config.s3.secretAccessKey,
60
- region: config.s3.region ?? "auto",
61
- endpoint: config.s3.endpoint,
62
- pathStyle: true
63
- }),
64
- namespace: config.namespace
65
- };
66
- }
67
- function isCommitRateLimited(err) {
68
- if (err && typeof err === "object" && err.status === 429) return true;
69
- const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
70
- return msg.includes("429") || msg.includes("too many commits") || msg.includes("rate limit");
71
- }
72
- function defaultCommitSleep(ms) {
73
- return new Promise((resolve) => setTimeout(resolve, ms));
74
- }
75
- async function icebergAppendRetrying(args, options = {}) {
76
- const maxAttempts = options.maxAttempts ?? 6;
77
- const baseDelayMs = options.baseDelayMs ?? 1e3;
78
- const maxDelayMs = options.maxDelayMs ?? 2e4;
79
- const sleep = options.sleep ?? defaultCommitSleep;
80
- const random = options.random ?? Math.random;
81
- for (let attempt = 0; attempt < maxAttempts; attempt++) {
82
- const err = await icebergAppend(args).then(() => void 0, (e) => e);
83
- if (err === void 0) return;
84
- if (!isCommitRateLimited(err) || attempt === maxAttempts - 1) throw err;
85
- const ceiling = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt);
86
- await sleep(Math.floor(random() * ceiling));
87
- }
88
- }
89
- async function ensureIcebergNamespace(conn) {
90
- await restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace }).catch(() => {});
91
- }
92
- async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
93
- const results = [];
94
- for (const table of tables) await icebergCreateTable({
95
- catalog: conn.catalog,
96
- namespace: conn.namespace,
97
- table,
98
- schema: icebergSchemaFor(table),
99
- partitionSpec: icebergPartitionSpecFor(table)
100
- }).then(() => results.push({
101
- table,
102
- ok: true
103
- }), (e) => results.push({
104
- table,
105
- ok: false,
106
- error: String(e)
107
- }));
108
- return results;
109
- }
110
- async function listIcebergTables(conn) {
111
- return restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then((list) => list.map((t) => t.name).sort(), () => []);
112
- }
113
- function monthsInRange(range) {
114
- const [sy, sm] = range.start.split("-").map(Number);
115
- const [ey, em] = range.end.split("-").map(Number);
116
- const out = [];
117
- let y = sy;
118
- let m = sm;
119
- while (y < ey || y === ey && m <= em) {
120
- out.push(`${y}-${String(m).padStart(2, "0")}`);
121
- m++;
122
- if (m > 12) {
123
- m = 1;
124
- y++;
125
- }
126
- }
127
- return out;
128
- }
129
- function monthsSinceEpoch(ym) {
130
- const [y, m] = ym.split("-").map(Number);
131
- return (y - 1970) * 12 + (m - 1);
132
- }
133
- function stripBucket(filePath) {
134
- if (!filePath.startsWith("s3://")) return filePath;
135
- const rest = filePath.slice(5);
136
- const slash = rest.indexOf("/");
137
- return slash >= 0 ? rest.slice(slash + 1) : rest;
138
- }
139
- async function listIcebergDataFiles(conn, opts) {
140
- const { metadata } = await restCatalogLoadTable(conn.catalog, {
141
- namespace: conn.namespace,
142
- table: opts.table
143
- });
144
- if (metadata["current-snapshot-id"] == null) return [];
145
- const wantedMonths = new Set(monthsInRange(opts.range).map(monthsSinceEpoch));
146
- const manifests = await icebergManifests({
147
- metadata,
148
- resolver: conn.resolver
149
- });
150
- const out = [];
151
- for (const m of manifests) for (const entry of m.entries) {
152
- if (entry.status === 2) continue;
153
- const df = entry.data_file;
154
- if (df.content !== 0) continue;
155
- const part = df.partition;
156
- if (part.site_id !== opts.siteId) continue;
157
- if (part.search_type !== opts.searchType) continue;
158
- const month = part.date_month;
159
- if (typeof month !== "number" || !wantedMonths.has(month)) continue;
160
- out.push({
161
- filePath: df.file_path,
162
- objectKey: stripBucket(df.file_path),
163
- bytes: Number(df.file_size_in_bytes),
164
- rowCount: Number(df.record_count)
165
- });
166
- }
167
- return out;
168
- }
169
- async function dropIcebergTables(conn, tables) {
170
- const targets = tables ?? await restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then((list) => list.map((t) => t.name), () => []);
171
- const results = [];
172
- for (const table of targets) await icebergDropTable({
173
- catalog: conn.catalog,
174
- namespace: conn.namespace,
175
- table,
176
- purgeRequested: true
177
- }).then(() => results.push({
178
- table,
179
- ok: true
180
- }), (e) => results.push({
181
- table,
182
- ok: false,
183
- error: String(e)
184
- }));
185
- return results;
186
- }
187
12
  const NOOP_RESULT = {
188
13
  flushed: 0,
189
14
  recovered: 0,
@@ -299,99 +124,6 @@ function createIngestAccumulator(opts) {
299
124
  }
300
125
  };
301
126
  }
302
- const DAY_MILLIS = 864e5;
303
- function toIcebergDate(value) {
304
- if (typeof value === "string") {
305
- const ms = Date.parse(`${value}T00:00:00Z`);
306
- if (Number.isNaN(ms)) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
307
- return Math.floor(ms / DAY_MILLIS);
308
- }
309
- if (value instanceof Date) {
310
- const ms = value.getTime();
311
- if (Number.isNaN(ms)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
312
- return Math.floor(ms / DAY_MILLIS);
313
- }
314
- return value;
315
- }
316
- function coerceJsonSafe(value) {
317
- if (typeof value === "bigint") return Number(value);
318
- return value;
319
- }
320
- function toRecords(slice, rows) {
321
- const siteId = slice.ctx.siteId ?? "";
322
- return rows.map((row) => {
323
- const out = {};
324
- for (const k in row) out[k] = coerceJsonSafe(row[k]);
325
- out.date = toIcebergDate(out.date);
326
- out.site_id = siteId;
327
- out.search_type = slice.searchType;
328
- return out;
329
- });
330
- }
331
- function createIcebergAppendSink(options) {
332
- let connection;
333
- const buffers = /* @__PURE__ */ new Map();
334
- function connect() {
335
- connection ??= connectIcebergCatalog(options.catalog);
336
- return connection;
337
- }
338
- return {
339
- capabilities: { appendOnly: true },
340
- async emit(slice, rows) {
341
- if (rows.length === 0) return { rowCount: 0 };
342
- const records = toRecords(slice, rows);
343
- const buffer = buffers.get(slice.table);
344
- if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
345
- else buffers.set(slice.table, records);
346
- return { rowCount: records.length };
347
- },
348
- async close() {
349
- const flushed = [];
350
- const failed = [];
351
- if (buffers.size === 0) return {
352
- flushed,
353
- failed
354
- };
355
- const conn = await connect().then((c) => c, (err) => {
356
- connection = void 0;
357
- return { error: String(err) };
358
- });
359
- if ("error" in conn) {
360
- for (const [table, records] of buffers) if (records.length > 0) failed.push({
361
- table,
362
- error: conn.error
363
- });
364
- buffers.clear();
365
- return {
366
- flushed,
367
- failed
368
- };
369
- }
370
- for (const [table, records] of buffers) {
371
- if (records.length === 0) continue;
372
- await icebergAppendRetrying({
373
- catalog: conn.catalog,
374
- namespace: conn.namespace,
375
- table,
376
- resolver: conn.resolver,
377
- records
378
- }, options.commitRetry).then(() => {
379
- flushed.push(table);
380
- }, (err) => {
381
- failed.push({
382
- table,
383
- error: String(err)
384
- });
385
- });
386
- }
387
- buffers.clear();
388
- return {
389
- flushed,
390
- failed
391
- };
392
- }
393
- };
394
- }
395
127
  const KEY_SEP = "\0";
396
128
  function partitionKey(slice) {
397
129
  return [
@@ -544,4 +276,4 @@ const MIN_SYNC_IMPRESSIONS = 1;
544
276
  const MIN_COUNTRY_IMPRESSIONS = 10;
545
277
  const MAX_SITEMAP_URLS_PER_SITE = 5e4;
546
278
  const MAX_TRACKED_URLS_PER_SITE = 2e5;
547
- export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergResolverAdapter, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergDataFiles, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
279
+ export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
package/dist/rollups.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import "./_chunks/storage.mjs";
1
+ import "./_chunks/layout.mjs";
2
2
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
3
3
  import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
4
4
  import { MS_PER_DAY } from "gscdump";
@@ -20,4 +20,4 @@ interface LocalIcebergSink extends Sink {
20
20
  * use this sink must skip when the stack is unreachable.
21
21
  */
22
22
  declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
23
- export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
23
+ export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, type LocalIcebergSinkOptions, createLocalIcebergSink };
@@ -1,4 +1,4 @@
1
- import { ICEBERG_SCHEMAS } from "./_chunks/iceberg-schema.mjs";
1
+ import { ICEBERG_SCHEMAS } from "./_chunks/schema2.mjs";
2
2
  import { execFile } from "node:child_process";
3
3
  import { dirname, join } from "node:path";
4
4
  import process from "node:process";
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.24.0",
4
+ "version": "0.25.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -66,11 +66,6 @@
66
66
  "import": "./dist/sql-fragments.mjs",
67
67
  "default": "./dist/sql-fragments.mjs"
68
68
  },
69
- "./schedule": {
70
- "types": "./dist/schedule.d.mts",
71
- "import": "./dist/schedule.mjs",
72
- "default": "./dist/schedule.mjs"
73
- },
74
69
  "./entities": {
75
70
  "types": "./dist/entities.d.mts",
76
71
  "import": "./dist/entities.mjs",
@@ -81,6 +76,11 @@
81
76
  "import": "./dist/rollups.mjs",
82
77
  "default": "./dist/rollups.mjs"
83
78
  },
79
+ "./iceberg": {
80
+ "types": "./dist/iceberg/index.d.mts",
81
+ "import": "./dist/iceberg/index.mjs",
82
+ "default": "./dist/iceberg/index.mjs"
83
+ },
84
84
  "./node": {
85
85
  "types": "./dist/adapters/node.d.mts",
86
86
  "import": "./dist/adapters/node.mjs",
@@ -180,8 +180,8 @@
180
180
  "drizzle-orm": "1.0.0-rc.3",
181
181
  "icebird": "^0.8.6",
182
182
  "proper-lockfile": "^4.1.2",
183
- "@gscdump/contracts": "0.24.0",
184
- "gscdump": "0.24.0"
183
+ "gscdump": "0.25.0",
184
+ "@gscdump/contracts": "0.25.0"
185
185
  },
186
186
  "devDependencies": {
187
187
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -1,13 +1,4 @@
1
1
  import { MS_PER_DAY, toIsoDate } from "gscdump";
2
- const DEFAULT_SEARCH_TYPE = "web";
3
- function inferSearchType(entry) {
4
- return entry.searchType ?? "web";
5
- }
6
- function inferLegacyTier(entry) {
7
- if (entry.tier !== void 0) return entry.tier;
8
- if (entry.partition.startsWith("daily/")) return "raw";
9
- if (entry.partition.startsWith("monthly/")) return "d30";
10
- }
11
2
  function dayPartition(date) {
12
3
  return `daily/${date}`;
13
4
  }
@@ -33,10 +24,19 @@ function quarterOfMonth(month) {
33
24
  const [y, m] = month.split("-").map(Number);
34
25
  return `${y}-Q${Math.floor((m - 1) / 3) + 1}`;
35
26
  }
27
+ function tenantPrefix(ctx) {
28
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
29
+ }
30
+ const DEFAULT_SEARCH_TYPE = "web";
36
31
  function objectKey(ctx, table, partition, version, searchType) {
37
32
  return `${ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/${table}` : `u_${ctx.userId}/${table}`}/${searchType !== void 0 && searchType !== "web" ? `${searchType}/` : ""}${partition}__v${version}.parquet`;
38
33
  }
39
- function tenantPrefix(ctx) {
40
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
34
+ function inferSearchType(entry) {
35
+ return entry.searchType ?? "web";
36
+ }
37
+ function inferLegacyTier(entry) {
38
+ if (entry.tier !== void 0) return entry.tier;
39
+ if (entry.partition.startsWith("daily/")) return "raw";
40
+ if (entry.partition.startsWith("monthly/")) return "d30";
41
41
  }
42
42
  export { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, tenantPrefix, weekPartition };