@gscdump/engine 0.24.1 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -1,189 +1,15 @@
1
1
  import { ENGINE_QUERY_CAPABILITIES, coerceRow, coerceRows, createSqlQuerySource } from "./_chunks/source.mjs";
2
+ import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/layout.mjs";
2
3
  import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
3
- import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/storage.mjs";
4
- import { FILES_PLACEHOLDER, RAW_DAILY_COMPACT_THRESHOLD, countRawDailies, dedupeOverlappingTiers, enumeratePartitions, resolveParquetSQL, splitOverlappingTiers, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
4
+ import { enumeratePartitions } from "./_chunks/compaction.mjs";
5
+ import { FILES_PLACEHOLDER, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
5
6
  import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
6
- import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine, gcOrphansImpl } from "./_chunks/engine.mjs";
7
- import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, icebergTableSpec } from "./_chunks/iceberg-schema.mjs";
7
+ import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "./_chunks/engine.mjs";
8
8
  import { assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
9
9
  import "./planner.mjs";
10
10
  import { createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "./_chunks/resolver.mjs";
11
11
  import { rebuildDailyFromHourly } from "./rollups.mjs";
12
12
  import { fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
13
- import { icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "icebird";
14
- const ICEBERG_TYPE_MAP = {
15
- STRING: "string",
16
- INT: "int",
17
- LONG: "long",
18
- DOUBLE: "double",
19
- DATE: "date"
20
- };
21
- function icebergSchemaFor(table) {
22
- return {
23
- "type": "struct",
24
- "schema-id": 0,
25
- "fields": ICEBERG_SCHEMAS[table].columns.map((col) => ({
26
- id: col.fieldId,
27
- name: col.name,
28
- required: col.required,
29
- type: ICEBERG_TYPE_MAP[col.type]
30
- }))
31
- };
32
- }
33
- function icebergPartitionSpecFor(table) {
34
- const fields = ICEBERG_SCHEMAS[table].columns;
35
- const fieldId = (name) => {
36
- const col = fields.find((c) => c.name === name);
37
- if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
38
- return col.fieldId;
39
- };
40
- return {
41
- "spec-id": 0,
42
- "fields": ICEBERG_PARTITION_SPEC.map((p, i) => ({
43
- "source-id": fieldId(p.sourceColumn),
44
- "field-id": 1e3 + i,
45
- "name": p.name,
46
- "transform": p.transform
47
- }))
48
- };
49
- }
50
- async function connectIcebergCatalog(config) {
51
- return {
52
- catalog: await restCatalogConnect({
53
- url: config.catalogUri,
54
- warehouse: config.warehouse,
55
- requestInit: { headers: { Authorization: `Bearer ${config.catalogToken}` } }
56
- }),
57
- resolver: s3SignedResolver({
58
- accessKeyId: config.s3.accessKeyId,
59
- secretAccessKey: config.s3.secretAccessKey,
60
- region: config.s3.region ?? "auto",
61
- endpoint: config.s3.endpoint,
62
- pathStyle: true
63
- }),
64
- namespace: config.namespace
65
- };
66
- }
67
- function isCommitRateLimited(err) {
68
- if (err && typeof err === "object" && err.status === 429) return true;
69
- const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
70
- return msg.includes("429") || msg.includes("too many commits") || msg.includes("rate limit");
71
- }
72
- function defaultCommitSleep(ms) {
73
- return new Promise((resolve) => setTimeout(resolve, ms));
74
- }
75
- async function icebergAppendRetrying(args, options = {}) {
76
- const maxAttempts = options.maxAttempts ?? 6;
77
- const baseDelayMs = options.baseDelayMs ?? 1e3;
78
- const maxDelayMs = options.maxDelayMs ?? 2e4;
79
- const sleep = options.sleep ?? defaultCommitSleep;
80
- const random = options.random ?? Math.random;
81
- for (let attempt = 0; attempt < maxAttempts; attempt++) {
82
- const err = await icebergAppend(args).then(() => void 0, (e) => e);
83
- if (err === void 0) return;
84
- if (!isCommitRateLimited(err) || attempt === maxAttempts - 1) throw err;
85
- const ceiling = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt);
86
- await sleep(Math.floor(random() * ceiling));
87
- }
88
- }
89
- async function ensureIcebergNamespace(conn) {
90
- await restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace }).catch(() => {});
91
- }
92
- async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
93
- const results = [];
94
- for (const table of tables) await icebergCreateTable({
95
- catalog: conn.catalog,
96
- namespace: conn.namespace,
97
- table,
98
- schema: icebergSchemaFor(table),
99
- partitionSpec: icebergPartitionSpecFor(table)
100
- }).then(() => results.push({
101
- table,
102
- ok: true
103
- }), (e) => results.push({
104
- table,
105
- ok: false,
106
- error: String(e)
107
- }));
108
- return results;
109
- }
110
- async function listIcebergTables(conn) {
111
- return restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then((list) => list.map((t) => t.name).sort(), () => []);
112
- }
113
- function monthsInRange(range) {
114
- const [sy, sm] = range.start.split("-").map(Number);
115
- const [ey, em] = range.end.split("-").map(Number);
116
- const out = [];
117
- let y = sy;
118
- let m = sm;
119
- while (y < ey || y === ey && m <= em) {
120
- out.push(`${y}-${String(m).padStart(2, "0")}`);
121
- m++;
122
- if (m > 12) {
123
- m = 1;
124
- y++;
125
- }
126
- }
127
- return out;
128
- }
129
- function monthsSinceEpoch(ym) {
130
- const [y, m] = ym.split("-").map(Number);
131
- return (y - 1970) * 12 + (m - 1);
132
- }
133
- function stripBucket(filePath) {
134
- if (!filePath.startsWith("s3://")) return filePath;
135
- const rest = filePath.slice(5);
136
- const slash = rest.indexOf("/");
137
- return slash >= 0 ? rest.slice(slash + 1) : rest;
138
- }
139
- async function listIcebergDataFiles(conn, opts) {
140
- const { metadata } = await restCatalogLoadTable(conn.catalog, {
141
- namespace: conn.namespace,
142
- table: opts.table
143
- });
144
- if (metadata["current-snapshot-id"] == null) return [];
145
- const wantedMonths = new Set(monthsInRange(opts.range).map(monthsSinceEpoch));
146
- const manifests = await icebergManifests({
147
- metadata,
148
- resolver: conn.resolver
149
- });
150
- const out = [];
151
- for (const m of manifests) for (const entry of m.entries) {
152
- if (entry.status === 2) continue;
153
- const df = entry.data_file;
154
- if (df.content !== 0) continue;
155
- const part = df.partition;
156
- if (part.site_id !== opts.siteId) continue;
157
- if (part.search_type !== opts.searchType) continue;
158
- const month = part.date_month;
159
- if (typeof month !== "number" || !wantedMonths.has(month)) continue;
160
- out.push({
161
- filePath: df.file_path,
162
- objectKey: stripBucket(df.file_path),
163
- bytes: Number(df.file_size_in_bytes),
164
- rowCount: Number(df.record_count)
165
- });
166
- }
167
- return out;
168
- }
169
- async function dropIcebergTables(conn, tables) {
170
- const targets = tables ?? await restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then((list) => list.map((t) => t.name), () => []);
171
- const results = [];
172
- for (const table of targets) await icebergDropTable({
173
- catalog: conn.catalog,
174
- namespace: conn.namespace,
175
- table,
176
- purgeRequested: true
177
- }).then(() => results.push({
178
- table,
179
- ok: true
180
- }), (e) => results.push({
181
- table,
182
- ok: false,
183
- error: String(e)
184
- }));
185
- return results;
186
- }
187
13
  const NOOP_RESULT = {
188
14
  flushed: 0,
189
15
  recovered: 0,
@@ -299,99 +125,6 @@ function createIngestAccumulator(opts) {
299
125
  }
300
126
  };
301
127
  }
302
- const DAY_MILLIS = 864e5;
303
- function toIcebergDate(value) {
304
- if (typeof value === "string") {
305
- const ms = Date.parse(`${value}T00:00:00Z`);
306
- if (Number.isNaN(ms)) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
307
- return Math.floor(ms / DAY_MILLIS);
308
- }
309
- if (value instanceof Date) {
310
- const ms = value.getTime();
311
- if (Number.isNaN(ms)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
312
- return Math.floor(ms / DAY_MILLIS);
313
- }
314
- return value;
315
- }
316
- function coerceJsonSafe(value) {
317
- if (typeof value === "bigint") return Number(value);
318
- return value;
319
- }
320
- function toRecords(slice, rows) {
321
- const siteId = slice.ctx.siteId ?? "";
322
- return rows.map((row) => {
323
- const out = {};
324
- for (const k in row) out[k] = coerceJsonSafe(row[k]);
325
- out.date = toIcebergDate(out.date);
326
- out.site_id = siteId;
327
- out.search_type = slice.searchType;
328
- return out;
329
- });
330
- }
331
- function createIcebergAppendSink(options) {
332
- let connection;
333
- const buffers = /* @__PURE__ */ new Map();
334
- function connect() {
335
- connection ??= connectIcebergCatalog(options.catalog);
336
- return connection;
337
- }
338
- return {
339
- capabilities: { appendOnly: true },
340
- async emit(slice, rows) {
341
- if (rows.length === 0) return { rowCount: 0 };
342
- const records = toRecords(slice, rows);
343
- const buffer = buffers.get(slice.table);
344
- if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
345
- else buffers.set(slice.table, records);
346
- return { rowCount: records.length };
347
- },
348
- async close() {
349
- const flushed = [];
350
- const failed = [];
351
- if (buffers.size === 0) return {
352
- flushed,
353
- failed
354
- };
355
- const conn = await connect().then((c) => c, (err) => {
356
- connection = void 0;
357
- return { error: String(err) };
358
- });
359
- if ("error" in conn) {
360
- for (const [table, records] of buffers) if (records.length > 0) failed.push({
361
- table,
362
- error: conn.error
363
- });
364
- buffers.clear();
365
- return {
366
- flushed,
367
- failed
368
- };
369
- }
370
- for (const [table, records] of buffers) {
371
- if (records.length === 0) continue;
372
- await icebergAppendRetrying({
373
- catalog: conn.catalog,
374
- namespace: conn.namespace,
375
- table,
376
- resolver: conn.resolver,
377
- records
378
- }, options.commitRetry).then(() => {
379
- flushed.push(table);
380
- }, (err) => {
381
- failed.push({
382
- table,
383
- error: String(err)
384
- });
385
- });
386
- }
387
- buffers.clear();
388
- return {
389
- flushed,
390
- failed
391
- };
392
- }
393
- };
394
- }
395
128
  const KEY_SEP = "\0";
396
129
  function partitionKey(slice) {
397
130
  return [
@@ -544,4 +277,4 @@ const MIN_SYNC_IMPRESSIONS = 1;
544
277
  const MIN_COUNTRY_IMPRESSIONS = 10;
545
278
  const MAX_SITEMAP_URLS_PER_SITE = 5e4;
546
279
  const MAX_TRACKED_URLS_PER_SITE = 2e5;
547
- export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergResolverAdapter, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergDataFiles, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
280
+ export { DEFAULT_SEARCH_TYPE, ENGINE_QUERY_CAPABILITIES, FILES_PLACEHOLDER, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergResolverAdapter, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createParquetResolverAdapter, createRowAccumulator, createSqlQuerySource, createStorageEngine, currentSchemaVersion, dates, dayPartition, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, objectKey, page_queries, pages, parseEnabledSearchTypes, pgResolverAdapter, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
package/dist/planner.mjs CHANGED
@@ -1,2 +1,3 @@
1
- import { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
1
+ import { enumeratePartitions } from "./_chunks/compaction.mjs";
2
+ import { FILES_PLACEHOLDER, compileLogicalQueryPlan, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
2
3
  export { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveParquetSQL, substituteNamedFiles };
package/dist/rollups.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import "./_chunks/storage.mjs";
1
+ import "./_chunks/layout.mjs";
2
2
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
3
3
  import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
4
4
  import { MS_PER_DAY } from "gscdump";
@@ -20,4 +20,4 @@ interface LocalIcebergSink extends Sink {
20
20
  * use this sink must skip when the stack is unreachable.
21
21
  */
22
22
  declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
23
- export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
23
+ export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, type LocalIcebergSinkOptions, createLocalIcebergSink };
@@ -1,4 +1,4 @@
1
- import { ICEBERG_SCHEMAS } from "./_chunks/iceberg-schema.mjs";
1
+ import { ICEBERG_SCHEMAS } from "./_chunks/schema2.mjs";
2
2
  import { execFile } from "node:child_process";
3
3
  import { dirname, join } from "node:path";
4
4
  import process from "node:process";
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.24.1",
4
+ "version": "0.25.1",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -41,6 +41,11 @@
41
41
  "import": "./dist/planner.mjs",
42
42
  "default": "./dist/planner.mjs"
43
43
  },
44
+ "./compaction": {
45
+ "types": "./dist/compaction-public.d.mts",
46
+ "import": "./dist/compaction-public.mjs",
47
+ "default": "./dist/compaction-public.mjs"
48
+ },
44
49
  "./schema": {
45
50
  "types": "./dist/schema.d.mts",
46
51
  "import": "./dist/schema.mjs",
@@ -66,11 +71,6 @@
66
71
  "import": "./dist/sql-fragments.mjs",
67
72
  "default": "./dist/sql-fragments.mjs"
68
73
  },
69
- "./schedule": {
70
- "types": "./dist/schedule.d.mts",
71
- "import": "./dist/schedule.mjs",
72
- "default": "./dist/schedule.mjs"
73
- },
74
74
  "./entities": {
75
75
  "types": "./dist/entities.d.mts",
76
76
  "import": "./dist/entities.mjs",
@@ -81,6 +81,11 @@
81
81
  "import": "./dist/rollups.mjs",
82
82
  "default": "./dist/rollups.mjs"
83
83
  },
84
+ "./iceberg": {
85
+ "types": "./dist/iceberg/index.d.mts",
86
+ "import": "./dist/iceberg/index.mjs",
87
+ "default": "./dist/iceberg/index.mjs"
88
+ },
84
89
  "./node": {
85
90
  "types": "./dist/adapters/node.d.mts",
86
91
  "import": "./dist/adapters/node.mjs",
@@ -180,8 +185,8 @@
180
185
  "drizzle-orm": "1.0.0-rc.3",
181
186
  "icebird": "^0.8.6",
182
187
  "proper-lockfile": "^4.1.2",
183
- "@gscdump/contracts": "0.24.1",
184
- "gscdump": "0.24.1"
188
+ "@gscdump/contracts": "0.25.1",
189
+ "gscdump": "0.25.1"
185
190
  },
186
191
  "devDependencies": {
187
192
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -1,13 +1,4 @@
1
1
  import { MS_PER_DAY, toIsoDate } from "gscdump";
2
- const DEFAULT_SEARCH_TYPE = "web";
3
- function inferSearchType(entry) {
4
- return entry.searchType ?? "web";
5
- }
6
- function inferLegacyTier(entry) {
7
- if (entry.tier !== void 0) return entry.tier;
8
- if (entry.partition.startsWith("daily/")) return "raw";
9
- if (entry.partition.startsWith("monthly/")) return "d30";
10
- }
11
2
  function dayPartition(date) {
12
3
  return `daily/${date}`;
13
4
  }
@@ -33,10 +24,19 @@ function quarterOfMonth(month) {
33
24
  const [y, m] = month.split("-").map(Number);
34
25
  return `${y}-Q${Math.floor((m - 1) / 3) + 1}`;
35
26
  }
27
+ function tenantPrefix(ctx) {
28
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
29
+ }
30
+ const DEFAULT_SEARCH_TYPE = "web";
36
31
  function objectKey(ctx, table, partition, version, searchType) {
37
32
  return `${ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/${table}` : `u_${ctx.userId}/${table}`}/${searchType !== void 0 && searchType !== "web" ? `${searchType}/` : ""}${partition}__v${version}.parquet`;
38
33
  }
39
- function tenantPrefix(ctx) {
40
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
34
+ function inferSearchType(entry) {
35
+ return entry.searchType ?? "web";
36
+ }
37
+ function inferLegacyTier(entry) {
38
+ if (entry.tier !== void 0) return entry.tier;
39
+ if (entry.partition.startsWith("daily/")) return "raw";
40
+ if (entry.partition.startsWith("monthly/")) return "d30";
41
41
  }
42
42
  export { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, tenantPrefix, weekPartition };