@gscdump/engine 0.20.3 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/engine.mjs +1 -1
- package/dist/_chunks/iceberg-schema.mjs +67 -0
- package/dist/_chunks/registry.d.mts +1 -1
- package/dist/_chunks/resolver.mjs +15 -21
- package/dist/_chunks/schema.d.mts +452 -133
- package/dist/_chunks/schema.mjs +50 -24
- package/dist/_chunks/sink.d.mts +329 -0
- package/dist/_chunks/storage.d.mts +4 -4
- package/dist/adapters/duckdb-node.mjs +2 -2
- package/dist/adapters/hyparquet.mjs +5 -5
- package/dist/index.d.mts +39 -7
- package/dist/index.mjs +272 -14
- package/dist/ingest.d.mts +23 -3
- package/dist/ingest.mjs +43 -18
- package/dist/rollups.mjs +12 -12
- package/dist/schema.d.mts +2 -2
- package/dist/schema.mjs +2 -2
- package/dist/sink-node.d.mts +31 -0
- package/dist/sink-node.mjs +76 -0
- package/dist/vendor/hysnappy-purejs.d.mts +29 -0
- package/dist/vendor/hysnappy-purejs.mjs +13 -0
- package/package.json +14 -3
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,132 @@
|
|
|
1
1
|
import { n as coerceRows, t as coerceRow } from "./_chunks/coerce.mjs";
|
|
2
|
-
import { a as dimensionToColumn, d as
|
|
2
|
+
import { a as dimensionToColumn, d as dates, f as drizzleSchema, g as queries, h as pages, l as TABLE_METADATA, m as page_queries, n as allTables, o as inferTable, p as hourly_pages, r as currentSchemaVersion, t as SCHEMAS, u as countries } from "./_chunks/schema.mjs";
|
|
3
3
|
import { a as inferSearchType, c as objectKey, i as inferLegacyTier, n as dayPartition, r as hourPartition, t as DEFAULT_SEARCH_TYPE } from "./_chunks/storage.mjs";
|
|
4
4
|
import { a as RAW_DAILY_COMPACT_THRESHOLD, c as dedupeOverlappingTiers, i as substituteNamedFiles, l as enumeratePartitions, r as resolveParquetSQL, s as countRawDailies, t as FILES_PLACEHOLDER, u as splitOverlappingTiers } from "./_chunks/parquet-plan.mjs";
|
|
5
5
|
import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
6
6
|
import { a as createDuckDBCodec, i as canonicalEmptyParquetSchema, n as createStorageEngine, o as createDuckDBExecutor, r as gcOrphansImpl, t as MAX_DAY_BYTES } from "./_chunks/engine.mjs";
|
|
7
|
-
import {
|
|
7
|
+
import { a as ICEBERG_TABLES, i as ICEBERG_SCHEMAS, n as ICEBERG_PARTITION_COLUMNS, o as icebergTableSpec, r as ICEBERG_PARTITION_SPEC, t as ICEBERG_FIELD_ID_BASE } from "./_chunks/iceberg-schema.mjs";
|
|
8
|
+
import { assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
|
8
9
|
import "./planner.mjs";
|
|
9
10
|
import { rebuildDailyFromHourly } from "./rollups.mjs";
|
|
10
11
|
import { fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
|
|
12
|
+
import { icebergAppend, icebergCreateTable, icebergDropTable, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, s3SignedResolver } from "icebird";
|
|
13
|
+
/** Maps the column type tags used by `ICEBERG_SCHEMAS` to Iceberg primitive type names. */
const ICEBERG_TYPE_MAP = {
  STRING: "string",
  INT: "int",
  LONG: "long",
  DOUBLE: "double",
  DATE: "date"
};

/**
 * Build the Iceberg struct schema (schema-id 0) for one of the tables
 * described in `ICEBERG_SCHEMAS`.
 *
 * @param {string} table - Key into `ICEBERG_SCHEMAS`.
 * @returns {{type: "struct", "schema-id": 0, fields: Array<{id: *, name: *, required: *, type: string}>}}
 *   One field per column, carrying the column's `fieldId`, `name`, `required`
 *   flag and its mapped Iceberg type name.
 * @throws {Error} When `table` is not in `ICEBERG_SCHEMAS`, or when a column
 *   uses a type tag that `ICEBERG_TYPE_MAP` does not cover.
 */
function icebergSchemaFor(table) {
  const spec = ICEBERG_SCHEMAS[table];
  if (!spec) throw new Error(`iceberg-catalog: unknown table '${table}'`);
  return {
    "type": "struct",
    "schema-id": 0,
    "fields": spec.columns.map((col) => {
      // Fail loudly here instead of handing the catalog a field whose type is
      // `undefined` (or an inherited Object.prototype member).
      if (!Object.hasOwn(ICEBERG_TYPE_MAP, col.type)) {
        throw new Error(`iceberg-catalog: column '${table}.${col.name}' has unsupported type '${col.type}'`);
      }
      return {
        id: col.fieldId,
        name: col.name,
        required: col.required,
        type: ICEBERG_TYPE_MAP[col.type]
      };
    })
  };
}
|
|
32
|
+
/**
 * Build the Iceberg partition spec (spec-id 0) for a table by resolving each
 * entry of `ICEBERG_PARTITION_SPEC` against that table's columns.
 *
 * @param {string} table - Key into `ICEBERG_SCHEMAS`.
 * @returns {{"spec-id": 0, fields: Array<{"source-id": *, "field-id": number, name: *, transform: *}>}}
 * @throws {Error} When `table` is not in `ICEBERG_SCHEMAS`, or when a
 *   partition entry names a source column the table does not have.
 */
function icebergPartitionSpecFor(table) {
  const spec = ICEBERG_SCHEMAS[table];
  if (!spec) throw new Error(`iceberg-catalog: unknown table '${table}'`);
  // Index columns by name once; the first occurrence wins, matching a
  // front-to-back search over the column list.
  const fieldIdByName = new Map();
  for (const col of spec.columns) {
    if (!fieldIdByName.has(col.name)) fieldIdByName.set(col.name, col.fieldId);
  }
  const fieldId = (name) => {
    if (!fieldIdByName.has(name)) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
    return fieldIdByName.get(name);
  };
  return {
    "spec-id": 0,
    "fields": ICEBERG_PARTITION_SPEC.map((p, i) => ({
      "source-id": fieldId(p.sourceColumn),
      // Partition field ids are numbered from 1000 upward.
      // NOTE(review): presumably follows the Iceberg convention for partition field ids - confirm against the spec.
      "field-id": 1e3 + i,
      "name": p.name,
      "transform": p.transform
    }))
  };
}
|
|
49
|
+
/**
 * Connect to the REST catalog and prepare the S3 resolver used for appends.
 *
 * @param {object} config - Reads `catalogUri`, `warehouse`, `catalogToken`,
 *   `namespace`, and `s3.{accessKeyId, secretAccessKey, region, endpoint}`.
 * @returns {Promise<{catalog: *, resolver: *, namespace: *}>} The connected
 *   catalog handle, the signed resolver, and the namespace passed through
 *   unchanged.
 */
async function connectIcebergCatalog(config) {
  // The catalog token is sent as a bearer token on every catalog request.
  const headers = { Authorization: `Bearer ${config.catalogToken}` };
  const catalog = await restCatalogConnect({
    url: config.catalogUri,
    warehouse: config.warehouse,
    requestInit: { headers }
  });

  // S3 settings are read only after the catalog connection succeeded.
  const { accessKeyId, secretAccessKey, region, endpoint } = config.s3;
  const resolver = s3SignedResolver({
    accessKeyId,
    secretAccessKey,
    region: region ?? "auto",
    endpoint,
    pathStyle: true
  });

  return {
    catalog,
    resolver,
    namespace: config.namespace
  };
}
|
|
66
|
+
/**
 * Decide whether an append failure looks like a catalog commit rate limit.
 *
 * An error qualifies when it is an object with `status === 429`, or when its
 * message (or string form, for non-Error values) contains the standalone
 * token `429`, the phrase "too many commits", or the phrase "rate limit"
 * (case-insensitive).
 *
 * @param {unknown} err - The rejection value from an append attempt.
 * @returns {boolean} `true` when the failure should be treated as retryable.
 */
function isCommitRateLimited(err) {
  if (err && typeof err === "object" && err.status === 429) return true;
  const msg = (err instanceof Error ? err.message : String(err)).toLowerCase();
  // Match 429 only as a standalone token: a bare substring test also fires on
  // unrelated digit runs such as byte counts or snapshot ids ("14290").
  return /\b429\b/.test(msg) || msg.includes("too many commits") || msg.includes("rate limit");
}
|
|
71
|
+
/** Default backoff sleep: resolves after `ms` milliseconds. */
function defaultCommitSleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Run `icebergAppend(args)`, retrying with randomized exponential backoff
 * while the failure is classified as rate-limited by `isCommitRateLimited`.
 *
 * @param {object} args - Passed through unchanged to `icebergAppend`.
 * @param {object} [options]
 * @param {number} [options.maxAttempts=6] - Total attempts, including the
 *   first. Values below 1 (or NaN) are treated as 1 so the append always runs
 *   at least once.
 * @param {number} [options.baseDelayMs=1000] - Backoff ceiling for the first retry.
 * @param {number} [options.maxDelayMs=20000] - Upper bound on any backoff ceiling.
 * @param {(ms: number) => Promise<void>} [options.sleep] - Injected sleep (for tests).
 * @param {() => number} [options.random=Math.random] - Injected jitter source in [0, 1).
 * @returns {Promise<void>} Resolves once an attempt succeeds.
 * @throws The last error, when it is not rate-limit related or attempts are exhausted.
 */
async function icebergAppendRetrying(args, options = {}) {
  const requestedAttempts = options.maxAttempts ?? 6;
  // `0`, negative or NaN would otherwise skip the loop entirely and report
  // success without ever writing anything.
  const maxAttempts = requestedAttempts >= 1 ? requestedAttempts : 1;
  const baseDelayMs = options.baseDelayMs ?? 1e3;
  const maxDelayMs = options.maxDelayMs ?? 2e4;
  const sleep = options.sleep ?? defaultCommitSleep;
  const random = options.random ?? Math.random;
  for (let attempt = 0; ; attempt++) {
    try {
      await icebergAppend(args);
      return;
    } catch (err) {
      // `>=` (not `===`) so a fractional maxAttempts still surfaces the final
      // error instead of falling out of the loop silently.
      if (!isCommitRateLimited(err) || attempt >= maxAttempts - 1) throw err;
    }
    // Full jitter: sleep a random duration below an exponentially growing, capped ceiling.
    const ceiling = Math.min(maxDelayMs, baseDelayMs * 2 ** attempt);
    await sleep(Math.floor(random() * ceiling));
  }
}
|
|
88
|
+
/**
 * Best-effort namespace creation: asks the catalog to create
 * `conn.namespace` and deliberately ignores any rejection from that request.
 *
 * @param {{catalog: *, namespace: *}} conn - Connection bundle.
 * @returns {Promise<void>}
 */
async function ensureIcebergNamespace(conn) {
  const ignoreRejection = () => {};
  const creation = restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace });
  await creation.catch(ignoreRejection);
}
|
|
91
|
+
/**
 * Create the given tables one after another, recording a per-table outcome
 * instead of aborting on the first rejected creation.
 *
 * @param {{catalog: *, namespace: *}} conn - Connection bundle.
 * @param {Iterable<string>} [tables=ICEBERG_TABLES] - Table names to create.
 * @returns {Promise<Array<{table: string, ok: boolean, error?: string}>>}
 *   One entry per table, in input order; `error` is the stringified rejection.
 */
async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
  const results = [];
  for (const table of tables) {
    const request = {
      catalog: conn.catalog,
      namespace: conn.namespace,
      table,
      schema: icebergSchemaFor(table),
      partitionSpec: icebergPartitionSpecFor(table)
    };
    // Only rejections of the create call are turned into result entries;
    // a throw while building the request still propagates to the caller.
    const outcome = await icebergCreateTable(request).then(
      () => ({ table, ok: true }),
      (reason) => ({ table, ok: false, error: String(reason) })
    );
    results.push(outcome);
  }
  return results;
}
|
|
109
|
+
/**
 * List the table names in `conn.namespace`, sorted with the default string
 * ordering. A rejected listing is reported as an empty list.
 *
 * @param {{catalog: *, namespace: *}} conn - Connection bundle.
 * @returns {Promise<string[]>}
 */
async function listIcebergTables(conn) {
  const listing = restCatalogListTables(conn.catalog, { namespace: conn.namespace });
  const toSortedNames = (entries) => entries.map((entry) => entry.name).sort();
  const onListFailure = () => [];
  return listing.then(toSortedNames, onListFailure);
}
|
|
112
|
+
/**
 * Drop tables (with purge requested) one after another, recording a per-table
 * outcome instead of aborting on the first rejected drop.
 *
 * @param {{catalog: *, namespace: *}} conn - Connection bundle.
 * @param {Iterable<string>} [tables] - Tables to drop. When null/undefined,
 *   every table currently listed in the namespace is targeted; a rejected
 *   listing is treated as "no tables".
 * @returns {Promise<Array<{table: string, ok: boolean, error?: string}>>}
 */
async function dropIcebergTables(conn, tables) {
  let targets = tables;
  if (targets == null) {
    targets = await restCatalogListTables(conn.catalog, { namespace: conn.namespace }).then(
      (entries) => entries.map((entry) => entry.name),
      () => []
    );
  }
  const results = [];
  for (const table of targets) {
    const request = {
      catalog: conn.catalog,
      namespace: conn.namespace,
      table,
      purgeRequested: true
    };
    const outcome = await icebergDropTable(request).then(
      () => ({ table, ok: true }),
      (reason) => ({ table, ok: false, error: String(reason) })
    );
    results.push(outcome);
  }
  return results;
}
|
|
11
130
|
const NOOP_RESULT = {
|
|
12
131
|
flushed: 0,
|
|
13
132
|
recovered: 0,
|
|
@@ -123,38 +242,177 @@ function createIngestAccumulator(opts) {
|
|
|
123
242
|
}
|
|
124
243
|
};
|
|
125
244
|
}
|
|
245
|
+
/** Milliseconds in one day. */
const DAY_MILLIS = 864e5;

/**
 * Convert a date value to a whole number of days since 1970-01-01 (UTC).
 *
 * - Strings starting with `YYYY-MM-DD` use that calendar date as written;
 *   any time-of-day or offset suffix is ignored.
 * - Other strings are parsed as `${value}T00:00:00Z`.
 * - `Date` instances use their UTC timestamp.
 * - Any other value is returned unchanged.
 *
 * @param {string | Date | *} value
 * @returns {number | *} Day count, or `value` itself when it is neither a string nor a Date.
 * @throws {RangeError} When a string or Date does not represent a valid date,
 *   rather than silently producing `NaN`.
 */
function toIcebergDate(value) {
  let millis;
  if (typeof value === "string") {
    // A full ISO timestamp ("2024-01-15T10:30:00Z") must not get a second
    // "T00:00:00Z" appended - that makes it unparseable.
    const datePart = /^\d{4}-\d{2}-\d{2}/.exec(value)?.[0] ?? value;
    millis = Date.parse(`${datePart}T00:00:00Z`);
  } else if (value instanceof Date) {
    millis = value.getTime();
  } else {
    return value;
  }
  if (Number.isNaN(millis)) throw new RangeError(`toIcebergDate: invalid date value '${String(value)}'`);
  return Math.floor(millis / DAY_MILLIS);
}
|
|
251
|
+
/**
 * Turn a slice's rows into records for the Iceberg append: each record is a
 * copy of the row with `date` converted by `toIcebergDate` and the slice's
 * `site_id` / `search_type` stamped on.
 *
 * @param {{ctx: {siteId?: string}, searchType: *}} slice
 * @param {Array<object>} rows
 * @returns {Array<object>} New record objects; the input rows are not mutated.
 */
function toRecords(slice, rows) {
  // A missing site id is stored as the empty string.
  const siteId = slice.ctx.siteId ?? "";
  const records = [];
  for (const row of rows) {
    records.push({
      ...row,
      date: toIcebergDate(row.date),
      site_id: siteId,
      search_type: slice.searchType
    });
  }
  return records;
}
|
|
260
|
+
/**
 * Create an append-only sink that buffers records per table in memory and
 * writes each table's buffer with one retried append when `close()` is called.
 *
 * @param {object} options
 * @param {object} options.catalog - Passed to `connectIcebergCatalog`; the
 *   connection is opened lazily on the first `close()` that has data.
 * @param {object} [options.commitRetry] - Passed to `icebergAppendRetrying`.
 * @returns {{capabilities: {appendOnly: true}, emit: Function, close: Function}}
 */
function createIcebergAppendSink(options) {
  let connection;
  const buffers = new Map();

  // Cache the connection promise so overlapping callers share one connect.
  function connect() {
    connection ??= connectIcebergCatalog(options.catalog);
    return connection;
  }

  return {
    capabilities: { appendOnly: true },

    /**
     * Buffer `rows` for `slice.table`. Nothing is written until `close()`.
     * @returns {Promise<{rowCount: number}>} Number of records buffered by this call.
     */
    async emit(slice, rows) {
      if (rows.length === 0) return { rowCount: 0 };
      const records = toRecords(slice, rows);
      const buffer = buffers.get(slice.table);
      // Index loop instead of push(...records): spreading a very large batch
      // into call arguments can exceed the engine's argument limit.
      if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
      else buffers.set(slice.table, records);
      return { rowCount: records.length };
    },

    /**
     * Flush every buffered table. Failures are reported per table rather
     * than thrown.
     * @returns {Promise<{flushed: string[], failed: Array<{table: string, error: string}>}>}
     */
    async close() {
      const flushed = [];
      const failed = [];
      if (buffers.size === 0) return {
        flushed,
        failed
      };
      // Take ownership of what is buffered *now*, before the first await.
      // Rows emitted while the catalog calls are in flight land in fresh
      // buffers and stay there for a later close(), instead of being wiped by
      // a trailing clear() without having been flushed or reported.
      const pending = [...buffers];
      buffers.clear();

      let conn;
      try {
        conn = await connect();
      } catch (err) {
        // Forget the failed promise so a later close() can reconnect.
        connection = void 0;
        const error = String(err);
        for (const [table, records] of pending) if (records.length > 0) failed.push({
          table,
          error
        });
        return {
          flushed,
          failed
        };
      }

      for (const [table, records] of pending) {
        if (records.length === 0) continue;
        await icebergAppendRetrying({
          catalog: conn.catalog,
          namespace: conn.namespace,
          table,
          resolver: conn.resolver,
          records
        }, options.commitRetry).then(() => {
          flushed.push(table);
        }, (err) => {
          failed.push({
            table,
            error: String(err)
          });
        });
      }
      return {
        flushed,
        failed
      };
    }
  };
}
|
|
324
|
+
/** Separator between the components of an in-memory partition key. */
const KEY_SEP = "\0";

/**
 * Build the map key identifying one slice: table, site id (empty string when
 * absent), search type and date, joined with `KEY_SEP`.
 *
 * @param {{table: *, ctx: {siteId?: string}, searchType: *, date: *}} slice
 * @returns {string}
 */
function partitionKey(slice) {
  const siteId = slice.ctx.siteId ?? "";
  const components = [slice.table, siteId, slice.searchType, slice.date];
  return components.join(KEY_SEP);
}
|
|
333
|
+
/**
 * Recover the table name from a partition key: everything before the first
 * `KEY_SEP`.
 *
 * @param {string} key - A key produced by `partitionKey`.
 * @returns {string}
 */
function tableOfKey(key) {
  const firstSeparator = key.indexOf(KEY_SEP);
  return key.slice(0, firstSeparator);
}
|
|
336
|
+
/**
 * Copy each row and stamp the slice's identity columns on it: `site_id`
 * (empty string when the slice has none) and `search_type`.
 *
 * @param {{ctx: {siteId?: string}, searchType: *}} slice
 * @param {Array<object>} rows
 * @returns {Array<object>} New row objects; the inputs are not mutated.
 */
function withIdentity(slice, rows) {
  const stamped = [];
  for (const row of rows) {
    stamped.push({
      ...row,
      site_id: slice.ctx.siteId ?? "",
      search_type: slice.searchType
    });
  }
  return stamped;
}
|
|
343
|
+
/**
 * Create an append-only sink that keeps every emitted row in memory, grouped
 * by `partitionKey(slice)`. It also exposes read-back helpers (`rows`,
 * `rowsFor`, `rowsForSlice`, `closed`, `reset`) for inspecting what was written.
 *
 * @returns {object} The sink plus its inspection helpers.
 */
function createInMemorySink() {
  const partitions = new Map();
  let closed = false;

  function allRows() {
    return [...partitions.values()].flat();
  }

  return {
    capabilities: { appendOnly: true },

    /**
     * Append `rows` (stamped by `withIdentity`) to the slice's partition.
     * @returns {Promise<{rowCount: number}>}
     */
    async emit(slice, rows) {
      const key = partitionKey(slice);
      const stored = withIdentity(slice, rows);
      const existing = partitions.get(key);
      if (existing) {
        // Index loop instead of push(...stored): spreading a large batch into
        // call arguments can throw a RangeError once it exceeds the engine's
        // argument limit.
        for (let i = 0; i < stored.length; i++) existing.push(stored[i]);
      } else {
        partitions.set(key, stored);
      }
      return { rowCount: stored.length };
    },

    /**
     * Mark the sink closed and report every table that received a partition.
     * @returns {Promise<{flushed: string[], failed: []}>}
     */
    async close() {
      closed = true;
      return {
        flushed: [...new Set([...partitions.keys()].map((k) => tableOfKey(k)))],
        failed: []
      };
    },

    /** All stored rows across every partition, as a fresh array. */
    get rows() {
      return allRows();
    },

    /** Whether `close()` has been called since creation or the last `reset()`. */
    get closed() {
      return closed;
    },

    /** All stored rows belonging to `table`, across sites, search types and dates. */
    rowsFor(table) {
      return [...partitions.entries()].filter(([k]) => tableOfKey(k) === table).flatMap(([, v]) => v);
    },

    /** A copy of the rows stored for exactly this slice (empty when none). */
    rowsForSlice(slice) {
      return [...partitions.get(partitionKey(slice)) ?? []];
    },

    /** Drop all stored rows and clear the closed flag. */
    reset() {
      partitions.clear();
      closed = false;
    }
  };
}
|
|
126
384
|
const TABLES_BY_SEARCH_TYPE = {
|
|
127
385
|
web: [
|
|
128
386
|
"pages",
|
|
129
|
-
"
|
|
387
|
+
"queries",
|
|
130
388
|
"countries",
|
|
131
|
-
"
|
|
132
|
-
"
|
|
389
|
+
"page_queries",
|
|
390
|
+
"dates"
|
|
133
391
|
],
|
|
134
392
|
discover: [
|
|
135
393
|
"pages",
|
|
136
394
|
"countries",
|
|
137
|
-
"
|
|
395
|
+
"dates"
|
|
138
396
|
],
|
|
139
397
|
news: [
|
|
140
398
|
"pages",
|
|
141
399
|
"countries",
|
|
142
|
-
"
|
|
400
|
+
"dates"
|
|
143
401
|
],
|
|
144
402
|
googleNews: [
|
|
145
403
|
"pages",
|
|
146
404
|
"countries",
|
|
147
|
-
"
|
|
405
|
+
"dates"
|
|
148
406
|
],
|
|
149
407
|
image: [
|
|
150
408
|
"pages",
|
|
151
409
|
"countries",
|
|
152
|
-
"
|
|
410
|
+
"dates"
|
|
153
411
|
],
|
|
154
412
|
video: [
|
|
155
413
|
"pages",
|
|
156
414
|
"countries",
|
|
157
|
-
"
|
|
415
|
+
"dates"
|
|
158
416
|
]
|
|
159
417
|
};
|
|
160
418
|
function parseEnabledSearchTypes(raw) {
|
|
@@ -181,10 +439,10 @@ function validateEnabledSearchTypes(value) {
|
|
|
181
439
|
}
|
|
182
440
|
const TABLE_TIERS = {
|
|
183
441
|
pages: "critical",
|
|
184
|
-
|
|
442
|
+
queries: "critical",
|
|
185
443
|
countries: "standard",
|
|
186
|
-
|
|
187
|
-
|
|
444
|
+
dates: "standard",
|
|
445
|
+
page_queries: "extended"
|
|
188
446
|
};
|
|
189
447
|
function getTableTier(table) {
|
|
190
448
|
return TABLE_TIERS[table] || "extended";
|
|
@@ -215,4 +473,4 @@ const MIN_SYNC_IMPRESSIONS = 1;
|
|
|
215
473
|
const MIN_COUNTRY_IMPRESSIONS = 10;
|
|
216
474
|
const MAX_SITEMAP_URLS_PER_SITE = 5e4;
|
|
217
475
|
const MAX_TRACKED_URLS_PER_SITE = 2e5;
|
|
218
|
-
export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIngestAccumulator, createNoopIngestAccumulator, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, dedupeOverlappingTiers,
|
|
476
|
+
export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, MAX_DAY_BYTES, MAX_GSC_PAGES_R2, MAX_SITEMAP_URLS_PER_SITE, MAX_TRACKED_URLS_PER_SITE, MIN_COUNTRY_IMPRESSIONS, MIN_SYNC_IMPRESSIONS, RAW_DAILY_COMPACT_THRESHOLD, ROW_LIMIT_R2, SCHEMAS, TABLES_BY_SEARCH_TYPE, TABLE_METADATA, TABLE_TIERS, TIER_PRIORITY, WEIGHT_PRIORITY, allTables, assembleDatesRow, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, connectIcebergCatalog, countRawDailies, countries, createDuckDBCodec, createDuckDBExecutor, createIcebergAppendSink, createIcebergTables, createInMemorySink, createIngestAccumulator, createNoopIngestAccumulator, createRowAccumulator, createStorageEngine, currentSchemaVersion, dates, dayPartition, dedupeOverlappingTiers, dimensionToColumn, drizzleSchema, dropIcebergTables, ensureIcebergNamespace, enumeratePartitions, fixedPolicy, formatLiteral, gcOrphansImpl, getDateWeight, getTableTier, getTablesForTier, hourPartition, hourly_pages, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, isCommitRateLimited, listIcebergTables, objectKey, page_queries, pages, parseEnabledSearchTypes, queries, rebuildDailyFromHourly, resolveParquetSQL, sitemapPolicy, splitOverlappingTiers, substituteNamedFiles, toPath, toSumPosition, transformGscRow, validateEnabledSearchTypes };
|
package/dist/ingest.d.mts
CHANGED
|
@@ -19,8 +19,8 @@ interface IngestOptions {
|
|
|
19
19
|
/**
|
|
20
20
|
* Canonical form of a query string, stored alongside `query` as
|
|
21
21
|
* `query_canonical`. Site-specific (e.g. synonym groups, stemming); if
|
|
22
|
-
* omitted, `query_canonical` is null. Applied to `
|
|
23
|
-
* `
|
|
22
|
+
* omitted, `query_canonical` is null. Applied to `queries` +
|
|
23
|
+
* `page_queries` tables only.
|
|
24
24
|
*/
|
|
25
25
|
normalizeQuery?: (query: string) => string | null | undefined;
|
|
26
26
|
}
|
|
@@ -44,6 +44,26 @@ declare function transformGscRow(table: TableName, apiRow: GscApiRow, options?:
|
|
|
44
44
|
date: string;
|
|
45
45
|
row: Row;
|
|
46
46
|
} | null;
|
|
47
|
+
/**
|
|
48
|
+
* Assemble one `dates` row for a single `date` from the two GSC queries that
|
|
49
|
+
* back the table:
|
|
50
|
+
*
|
|
51
|
+
* - `totalsRow` — the GSC `['date']` query result: the TRUE site totals
|
|
52
|
+
* (clicks/impressions/position), including anonymized impressions.
|
|
53
|
+
* - `deviceRows` — the GSC `['date','device']` query results for that date:
|
|
54
|
+
* one row per device, pivoted into the 9 `*_{device}` columns.
|
|
55
|
+
* - `queryGrainedImpressions` — total impressions summed from the
|
|
56
|
+
* `['query','date']` (or `['page','query','date']`) query for the same date,
|
|
57
|
+
* used to derive `anonymized_impressions_pct`.
|
|
58
|
+
*
|
|
59
|
+
* `anonymized_impressions_pct = 1 - query_grained_impressions /
|
|
60
|
+
* page_grained_impressions`, where the page/date totals come from `totalsRow`.
|
|
61
|
+
* Mirrors the legacy `dailyTotalsRollup` formula. Clamped to `[0, 1]`.
|
|
62
|
+
*/
|
|
63
|
+
declare function assembleDatesRow(date: string, totalsRow: GscApiRow, deviceRows: readonly GscApiRow[], queryGrainedImpressions: number): {
|
|
64
|
+
date: string;
|
|
65
|
+
row: Row;
|
|
66
|
+
};
|
|
47
67
|
interface RowAccumulator {
|
|
48
68
|
/**
|
|
49
69
|
* Push a batch of GSC API rows into the accumulator. Returns `false` if
|
|
@@ -93,4 +113,4 @@ interface RowAccumulatorOptions extends IngestOptions {
|
|
|
93
113
|
trackDateBoundary?: boolean;
|
|
94
114
|
}
|
|
95
115
|
declare function createRowAccumulator(options?: RowAccumulatorOptions): RowAccumulator;
|
|
96
|
-
export { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, TABLE_DIMS, createRowAccumulator, toPath, toSumPosition, transformGscRow };
|
|
116
|
+
export { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, TABLE_DIMS, assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow };
|
package/dist/ingest.mjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
const TABLE_DIMS = {
|
|
2
2
|
pages: ["page", "date"],
|
|
3
|
-
|
|
3
|
+
queries: ["query", "date"],
|
|
4
4
|
countries: ["country", "date"],
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
dates: ["date"],
|
|
6
|
+
page_queries: [
|
|
7
7
|
"page",
|
|
8
8
|
"query",
|
|
9
9
|
"date"
|
|
@@ -40,7 +40,7 @@ function transformGscRow(table, apiRow, options = {}) {
|
|
|
40
40
|
}
|
|
41
41
|
};
|
|
42
42
|
}
|
|
43
|
-
if (table === "
|
|
43
|
+
if (table === "queries") {
|
|
44
44
|
const query = String(keys[0] ?? "");
|
|
45
45
|
const date = String(keys[1] ?? "");
|
|
46
46
|
return {
|
|
@@ -68,19 +68,6 @@ function transformGscRow(table, apiRow, options = {}) {
|
|
|
68
68
|
}
|
|
69
69
|
};
|
|
70
70
|
}
|
|
71
|
-
if (table === "devices") {
|
|
72
|
-
const date = String(keys[1] ?? "");
|
|
73
|
-
return {
|
|
74
|
-
date,
|
|
75
|
-
row: {
|
|
76
|
-
device: String(keys[0] ?? ""),
|
|
77
|
-
date,
|
|
78
|
-
clicks,
|
|
79
|
-
impressions,
|
|
80
|
-
sum_position
|
|
81
|
-
}
|
|
82
|
-
};
|
|
83
|
-
}
|
|
84
71
|
if (table === "hourly_pages") {
|
|
85
72
|
const hour = String(keys[0] ?? "");
|
|
86
73
|
const date = hour.slice(0, 10);
|
|
@@ -109,6 +96,7 @@ function transformGscRow(table, apiRow, options = {}) {
|
|
|
109
96
|
}
|
|
110
97
|
};
|
|
111
98
|
}
|
|
99
|
+
if (table === "dates") throw new Error("`dates` rows must be built via assembleDatesRow, not transformGscRow");
|
|
112
100
|
const query = String(keys[1] ?? "");
|
|
113
101
|
const date = String(keys[2] ?? "");
|
|
114
102
|
const query_canonical = options.normalizeQuery?.(query) ?? null;
|
|
@@ -125,6 +113,43 @@ function transformGscRow(table, apiRow, options = {}) {
|
|
|
125
113
|
}
|
|
126
114
|
};
|
|
127
115
|
}
|
|
116
|
+
/** Maps upper-cased device names from device rows to the column suffix used in `dates`. */
const DEVICE_SUFFIX = {
  DESKTOP: "desktop",
  MOBILE: "mobile",
  TABLET: "tablet"
};

/**
 * Assemble one `dates` row from the totals row, the per-device rows and the
 * query-grained impression count.
 *
 * @param {string} date - Stored on the row and echoed in the result.
 * @param {object} totalsRow - Reads `clicks`, `impressions`, `position`; missing or falsy values count as 0.
 * @param {Iterable<object>} deviceRows - Each row's device is taken from `keys[1]`,
 *   falling back to `keys[0]`; rows whose device is not in `DEVICE_SUFFIX` are skipped.
 * @param {number} queryGrainedImpressions - Numerator for the anonymized share.
 * @returns {{date: string, row: object}}
 */
function assembleDatesRow(date, totalsRow, deviceRows, queryGrainedImpressions) {
  const clicks = totalsRow.clicks || 0;
  const impressions = totalsRow.impressions || 0;
  const sum_position = toSumPosition(totalsRow.position || 0, impressions);

  // Share of impressions not attributed to any query, clamped to [0, 1];
  // defined as 0 when there were no impressions at all.
  let anonymized_impressions_pct = 0;
  if (impressions > 0) {
    anonymized_impressions_pct = Math.min(1, Math.max(0, 1 - queryGrainedImpressions / impressions));
  }

  const row = {
    date,
    clicks,
    impressions,
    sum_position,
    anonymized_impressions_pct
  };
  // Pre-fill every per-device column with 0 so devices without data still have a value.
  for (const metric of ["clicks", "impressions", "sum_position"]) {
    for (const device of Object.values(DEVICE_SUFFIX)) row[`${metric}_${device}`] = 0;
  }

  for (const deviceRow of deviceRows) {
    const rawDevice = deviceRow.keys?.[1] ?? deviceRow.keys?.[0] ?? "";
    const suffix = DEVICE_SUFFIX[String(rawDevice).toUpperCase()];
    if (!suffix) continue;
    const deviceImpressions = deviceRow.impressions || 0;
    row[`clicks_${suffix}`] = deviceRow.clicks || 0;
    row[`impressions_${suffix}`] = deviceImpressions;
    row[`sum_position_${suffix}`] = toSumPosition(deviceRow.position || 0, deviceImpressions);
  }

  return {
    date,
    row
  };
}
|
|
128
153
|
const DEFAULT_MAX_ROWS = 5e5;
|
|
129
154
|
function createRowAccumulator(options = {}) {
|
|
130
155
|
const maxRows = options.maxRows ?? DEFAULT_MAX_ROWS;
|
|
@@ -200,4 +225,4 @@ function createRowAccumulator(options = {}) {
|
|
|
200
225
|
}
|
|
201
226
|
};
|
|
202
227
|
}
|
|
203
|
-
export { TABLE_DIMS, createRowAccumulator, toPath, toSumPosition, transformGscRow };
|
|
228
|
+
export { TABLE_DIMS, assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow };
|
package/dist/rollups.mjs
CHANGED
|
@@ -269,10 +269,10 @@ const dailyTotalsRollup = {
|
|
|
269
269
|
ORDER BY date
|
|
270
270
|
`
|
|
271
271
|
});
|
|
272
|
-
const
|
|
272
|
+
const queryRows = await runWindowed({
|
|
273
273
|
engine,
|
|
274
274
|
ctx,
|
|
275
|
-
table: "
|
|
275
|
+
table: "queries",
|
|
276
276
|
...searchType !== void 0 ? { searchType } : {},
|
|
277
277
|
sqlFor: (w) => `
|
|
278
278
|
SELECT
|
|
@@ -297,14 +297,14 @@ const dailyTotalsRollup = {
|
|
|
297
297
|
cur.sum_position += Number(r.sum_position);
|
|
298
298
|
pagesByDate.set(date, cur);
|
|
299
299
|
}
|
|
300
|
-
const
|
|
301
|
-
for (const r of
|
|
300
|
+
const queryImpressionsByDate = /* @__PURE__ */ new Map();
|
|
301
|
+
for (const r of queryRows) {
|
|
302
302
|
const date = String(r.date);
|
|
303
|
-
|
|
303
|
+
queryImpressionsByDate.set(date, (queryImpressionsByDate.get(date) ?? BigInt(0)) + BigInt(r.impressions));
|
|
304
304
|
}
|
|
305
305
|
return Array.from(pagesByDate.values()).sort((a, b) => a.date < b.date ? -1 : 1).map((r) => {
|
|
306
306
|
const totalImpressions = BigInt(r.impressions);
|
|
307
|
-
const queryImpressions =
|
|
307
|
+
const queryImpressions = queryImpressionsByDate.get(String(r.date)) ?? BigInt(0);
|
|
308
308
|
const anonymized = totalImpressions === BigInt(0) ? 0 : 1 - Number(queryImpressions) / Number(totalImpressions);
|
|
309
309
|
return {
|
|
310
310
|
date: r.date,
|
|
@@ -439,15 +439,15 @@ const topKeywords28dRollup = {
|
|
|
439
439
|
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
440
440
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
441
441
|
ctx,
|
|
442
|
-
table: "
|
|
442
|
+
table: "queries",
|
|
443
443
|
...searchType !== void 0 ? { searchType } : {}
|
|
444
444
|
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
445
445
|
if (partitions.length === 0) return [];
|
|
446
446
|
return (await engine.runSQL({
|
|
447
447
|
ctx,
|
|
448
|
-
table: "
|
|
448
|
+
table: "queries",
|
|
449
449
|
fileSets: { FILES: {
|
|
450
|
-
table: "
|
|
450
|
+
table: "queries",
|
|
451
451
|
partitions
|
|
452
452
|
} },
|
|
453
453
|
...searchType !== void 0 ? { searchType } : {},
|
|
@@ -502,15 +502,15 @@ const topKeywords28dParquetRollup = {
|
|
|
502
502
|
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
503
503
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
504
504
|
ctx,
|
|
505
|
-
table: "
|
|
505
|
+
table: "queries",
|
|
506
506
|
...searchType !== void 0 ? { searchType } : {}
|
|
507
507
|
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
508
508
|
if (partitions.length === 0) return [];
|
|
509
509
|
return (await engine.runSQL({
|
|
510
510
|
ctx,
|
|
511
|
-
table: "
|
|
511
|
+
table: "queries",
|
|
512
512
|
fileSets: { FILES: {
|
|
513
|
-
table: "
|
|
513
|
+
table: "queries",
|
|
514
514
|
partitions
|
|
515
515
|
} },
|
|
516
516
|
...searchType !== void 0 ? { searchType } : {},
|
package/dist/schema.d.mts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { _ as hourly_pages, a as allTables, b as
|
|
2
|
-
export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion,
|
|
1
|
+
import { _ as hourly_pages, a as allTables, b as queries, c as dimensionToColumn, d as schemaFor, f as DrizzleSchema, g as drizzleSchema, h as dates, i as TableSchema, l as inferTable, m as countries, n as ColumnType, o as currentSchemaVersion, p as TABLE_METADATA, r as SCHEMAS, s as dedupeByNaturalKey, t as ColumnDef, u as naturalKeyColumns, v as page_queries, x as search_appearance, y as pages } from "./_chunks/schema.mjs";
|
|
2
|
+
export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance };
|
package/dist/schema.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { _ as search_appearance, a as dimensionToColumn, c as schemaFor, d as
|
|
2
|
-
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion,
|
|
1
|
+
import { _ as search_appearance, a as dimensionToColumn, c as schemaFor, d as dates, f as drizzleSchema, g as queries, h as pages, i as dedupeByNaturalKey, l as TABLE_METADATA, m as page_queries, n as allTables, o as inferTable, p as hourly_pages, r as currentSchemaVersion, s as naturalKeyColumns, t as SCHEMAS, u as countries } from "./_chunks/schema.mjs";
|
|
2
|
+
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance };
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { n as LocalIcebergSinkOptions, r as Sink } from "./_chunks/sink.mjs";
|
|
2
|
+
/** S3-compatible credentials for the warehouse (POC: MinIO). */
|
|
3
|
+
interface LocalIcebergS3Config {
|
|
4
|
+
/** S3 endpoint host (POC MinIO: `localhost:9100`). */
|
|
5
|
+
endpoint: string;
|
|
6
|
+
accessKeyId: string;
|
|
7
|
+
secretAccessKey: string;
|
|
8
|
+
region?: string;
|
|
9
|
+
}
|
|
10
|
+
/** Full `LocalIcebergSink` options — extends the frozen contract options. */
|
|
11
|
+
interface LocalIcebergSinkFullOptions extends LocalIcebergSinkOptions {
|
|
12
|
+
/** S3 credentials for the warehouse. Defaults to the POC MinIO creds. */
|
|
13
|
+
s3?: LocalIcebergS3Config;
|
|
14
|
+
/** Python interpreter. Defaults to `$GSCDUMP_ICEBERG_PYTHON` then `python3`. */
|
|
15
|
+
python?: string;
|
|
16
|
+
/** Override the writer-script path. Defaults to `scripts/iceberg-writer.py`. */
|
|
17
|
+
writerScript?: string;
|
|
18
|
+
}
|
|
19
|
+
interface LocalIcebergSink extends Sink {
|
|
20
|
+
/** The catalog namespace the 5 tables live under. */
|
|
21
|
+
readonly namespace: string;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Create a `LocalIcebergSink` pointed at a local Iceberg REST catalog.
|
|
25
|
+
*
|
|
26
|
+
* Requires the POC docker stack (`poc/iceberg/docker-compose.iceberg.yml`)
|
|
27
|
+
* running and a Python env with `pyiceberg` + `pyarrow` available. Tests that
|
|
28
|
+
* use this sink must skip when the stack is unreachable.
|
|
29
|
+
*/
|
|
30
|
+
declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
|
|
31
|
+
export { type LocalIcebergS3Config, type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { i as ICEBERG_SCHEMAS } from "./_chunks/iceberg-schema.mjs";
|
|
2
|
+
import { execFile } from "node:child_process";
|
|
3
|
+
import { dirname, join } from "node:path";
|
|
4
|
+
import process from "node:process";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
/** Fallback S3 settings used when the sink options carry no `s3` block. */
const POC_S3 = {
  endpoint: "localhost:9100",
  accessKeyId: "poc",
  secretAccessKey: "pocpocpoc",
  region: "us-east-1"
};

/**
 * Pick the writer script path: the override when one is given (any truthy
 * value), otherwise `scripts/iceberg-writer.py` two directories above this
 * module's own directory.
 *
 * @param {string} [override]
 * @returns {string}
 */
function resolveWriterScript(override) {
  // NOTE(review): the default is resolved relative to this module's location
  // on disk - confirm it still points at the script in the published layout.
  const moduleDir = () => dirname(fileURLToPath(import.meta.url));
  return override || join(moduleDir(), "..", "..", "scripts", "iceberg-writer.py");
}
|
|
16
|
+
/**
 * Run the writer script as a child process, send it `job` as JSON on stdin,
 * and resolve with the JSON document it prints on stdout.
 *
 * Outcome precedence once the process ends:
 *  1. stdout parsed and carries an `error` field -> reject with that message;
 *  2. the process itself failed (spawn error, non-zero exit, ...) -> reject;
 *  3. stdout was empty or not valid JSON -> reject;
 *  4. otherwise resolve with the parsed value.
 *
 * @param {string} python - Interpreter executable.
 * @param {string} script - Path of the writer script (sole argument).
 * @param {object} job - JSON-serializable job description.
 * @returns {Promise<object>} The parsed stdout document.
 * @throws Rejects with `Error` as described above, or with the serialization
 *   error when `job` cannot be converted to JSON.
 */
function runWriter(python, script, job) {
  return new Promise((resolve, reject) => {
    // Serialize before spawning: if the job is not JSON-serializable the
    // promise rejects here and no child is left waiting on an open stdin.
    const payload = JSON.stringify(job);
    const child = execFile(python, [script], { maxBuffer: 64 * 1024 * 1024 }, (err, stdout, stderr) => {
      let parsed;
      if (stdout.trim()) try {
        parsed = JSON.parse(stdout);
      } catch {
        // Unparseable stdout is reported below via the `!parsed` branch.
      }
      if (parsed?.error) {
        reject(new Error(`LocalIcebergSink writer failed: ${parsed.error}`));
        return;
      }
      if (err) {
        reject(new Error(`LocalIcebergSink writer process failed (${err.message})${stderr ? `: ${stderr}` : ""}`));
        return;
      }
      if (!parsed) {
        reject(new Error(`LocalIcebergSink writer produced no parseable output: ${stdout || stderr}`));
        return;
      }
      resolve(parsed);
    });
    // A writer that fails to spawn or exits before draining stdin makes the
    // pipe emit 'error' (EPIPE). Without a listener that becomes an uncaught
    // exception; the execFile callback above already reports the failure.
    child.stdin?.on("error", () => {});
    child.stdin?.end(payload);
  });
}
|
|
39
|
+
/**
 * Create a sink that hands every non-empty `emit` to the external writer
 * script (one child process per call) and remembers which tables were written.
 *
 * @param {object} options - Reads `catalogUri`, `namespace`, `warehouse`, and
 *   the optional `s3`, `python`, `writerScript` overrides.
 * @returns {{namespace: *, capabilities: {appendOnly: true}, emit: Function, close: Function}}
 */
function createLocalIcebergSink(options) {
  // Resolved once at creation: credentials, interpreter and script path.
  const s3 = options.s3 ?? POC_S3;
  const python = options.python ?? process.env.GSCDUMP_ICEBERG_PYTHON ?? "python3";
  const script = resolveWriterScript(options.writerScript);
  const touched = new Set();

  // Describe one writer invocation; this is the JSON document sent on stdin.
  const describeJob = (op, slice, rows) => ({
    op,
    catalogUri: options.catalogUri,
    namespace: options.namespace,
    warehouse: options.warehouse,
    s3,
    table: slice.table,
    spec: ICEBERG_SCHEMAS[slice.table],
    siteId: slice.ctx.siteId ?? "",
    searchType: slice.searchType,
    date: slice.date,
    rows
  });

  return {
    namespace: options.namespace,
    capabilities: { appendOnly: true },

    /**
     * Write `rows` for `slice` through the writer script. A writer failure
     * rejects, and the table is then not recorded as touched.
     * @returns {Promise<{rowCount: number}>} The writer's reported count (0 when absent).
     */
    async emit(slice, rows) {
      if (rows.length === 0) return { rowCount: 0 };
      const job = describeJob("emit", slice, rows);
      const reply = await runWriter(python, script, job);
      touched.add(slice.table);
      return { rowCount: reply.rowCount ?? 0 };
    },

    /**
     * Report the tables written so far. Writes already happened in `emit`, so
     * there is nothing left to flush and `failed` is always empty.
     * @returns {Promise<{flushed: string[], failed: []}>}
     */
    async close() {
      const flushed = [...touched];
      return {
        flushed,
        failed: []
      };
    }
  };
}
|
|
76
|
+
export { createLocalIcebergSink };
|