@gscdump/engine 0.20.1 → 0.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rollups.d.mts +19 -6
- package/dist/rollups.mjs +48 -34
- package/package.json +3 -3
package/dist/rollups.d.mts
CHANGED
|
@@ -90,13 +90,14 @@ interface RollupDef {
|
|
|
90
90
|
*/
|
|
91
91
|
dataSource: DataSource;
|
|
92
92
|
/**
|
|
93
|
-
*
|
|
94
|
-
*
|
|
95
|
-
*
|
|
96
|
-
*
|
|
97
|
-
*
|
|
93
|
+
* UTC millis the trailing window anchors to — its inclusive END. Equals
|
|
94
|
+
* the newest synced/finalized data date when the runner is given
|
|
95
|
+
* `dataEndDate`, otherwise wall-clock build time. Builders derive window
|
|
96
|
+
* cutoffs from this (e.g. the trailing-28d boundary) and inline a date
|
|
97
|
+
* literal so the SQL stays portable across DuckDB builds without the ICU
|
|
98
|
+
* extension (Workers DuckDB — `CURRENT_DATE` lives in ICU).
|
|
98
99
|
*/
|
|
99
|
-
|
|
100
|
+
windowAnchorMs: number;
|
|
100
101
|
/**
|
|
101
102
|
* GSC search-type slice the runner was invoked for. Builders forward
|
|
102
103
|
* this to every `engine.runSQL` call so the aggregated facts come
|
|
@@ -127,10 +128,13 @@ declare function rollupParquetKey(ctx: TenantCtx, id: string, builtAt: number, s
|
|
|
127
128
|
interface RollupBucket {
|
|
128
129
|
list: (opts: {
|
|
129
130
|
prefix: string;
|
|
131
|
+
cursor?: string;
|
|
130
132
|
}) => Promise<{
|
|
131
133
|
objects: Array<{
|
|
132
134
|
key: string;
|
|
133
135
|
}>;
|
|
136
|
+
truncated?: boolean;
|
|
137
|
+
cursor?: string;
|
|
134
138
|
}>;
|
|
135
139
|
get: (key: string) => Promise<{
|
|
136
140
|
text: () => Promise<string>;
|
|
@@ -153,6 +157,15 @@ interface RebuildRollupsOptions {
|
|
|
153
157
|
* only tenants and explicit cross-type admin views.
|
|
154
158
|
*/
|
|
155
159
|
searchType?: SearchType;
|
|
160
|
+
/**
|
|
161
|
+
* ISO date (`YYYY-MM-DD`) of the newest synced/finalized day. Trailing-
|
|
162
|
+
* window rollups (28d/90d) anchor their window END here instead of
|
|
163
|
+
* wall-clock build time, so a "last 28 days" rollup covers the 28 days of
|
|
164
|
+
* data that actually exist — not 28 days back from whenever the job ran,
|
|
165
|
+
* which would include GSC's 2-3 day empty tail. Omit for the legacy
|
|
166
|
+
* wall-clock behaviour.
|
|
167
|
+
*/
|
|
168
|
+
dataEndDate?: string;
|
|
156
169
|
}
|
|
157
170
|
interface RebuildRollupResult {
|
|
158
171
|
id: string;
|
package/dist/rollups.mjs
CHANGED
|
@@ -15,18 +15,25 @@ function rollupParquetKey(ctx, id, builtAt, searchType) {
|
|
|
15
15
|
const ROLLUP_FILE_RE = /^(?<id>[a-z0-9_]+)__v(?<ts>\d+)\.json$/;
|
|
16
16
|
async function readLatestRollup(bucket, ctx, id, searchType) {
|
|
17
17
|
const prefix = `${rollupPrefix(ctx, searchType)}/`;
|
|
18
|
-
const listing = await bucket.list({ prefix }).catch(() => null);
|
|
19
|
-
if (!listing) return null;
|
|
20
18
|
let newest = null;
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
19
|
+
let cursor;
|
|
20
|
+
do {
|
|
21
|
+
const listing = await bucket.list({
|
|
22
|
+
prefix,
|
|
23
|
+
cursor
|
|
24
|
+
}).catch(() => null);
|
|
25
|
+
if (!listing) return null;
|
|
26
|
+
for (const obj of listing.objects) {
|
|
27
|
+
const m = ROLLUP_FILE_RE.exec(obj.key.slice(prefix.length));
|
|
28
|
+
if (!m?.groups || m.groups.id !== id) continue;
|
|
29
|
+
const ts = Number(m.groups.ts);
|
|
30
|
+
if (!newest || ts > newest.ts) newest = {
|
|
31
|
+
ts,
|
|
32
|
+
key: obj.key
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
cursor = listing.truncated ? listing.cursor : void 0;
|
|
36
|
+
} while (cursor !== void 0);
|
|
30
37
|
if (!newest) return null;
|
|
31
38
|
const obj = await bucket.get(newest.key).catch(() => null);
|
|
32
39
|
if (!obj) return null;
|
|
@@ -34,16 +41,18 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
|
|
|
34
41
|
}
|
|
35
42
|
async function rebuildRollups(opts) {
|
|
36
43
|
const now = opts.now ?? (() => Date.now());
|
|
44
|
+
const dataEndMs = opts.dataEndDate !== void 0 ? isoDateToUtcMs(opts.dataEndDate) : null;
|
|
37
45
|
const results = [];
|
|
38
46
|
for (const def of opts.defs) {
|
|
39
47
|
const builtAt = now();
|
|
48
|
+
const windowAnchorMs = dataEndMs ?? builtAt;
|
|
40
49
|
const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
|
|
41
50
|
try {
|
|
42
51
|
const payload = await def.build({
|
|
43
52
|
engine: opts.engine,
|
|
44
53
|
ctx: opts.ctx,
|
|
45
54
|
dataSource: opts.dataSource,
|
|
46
|
-
|
|
55
|
+
windowAnchorMs,
|
|
47
56
|
...defSearchType !== void 0 ? { searchType: defSearchType } : {}
|
|
48
57
|
});
|
|
49
58
|
if (def.format === "parquet") {
|
|
@@ -108,6 +117,11 @@ async function rebuildRollups(opts) {
|
|
|
108
117
|
}
|
|
109
118
|
return results;
|
|
110
119
|
}
|
|
120
|
+
/**
 * Convert a strict `YYYY-MM-DD` calendar date into UTC-midnight epoch
 * milliseconds.
 *
 * @param iso Date string; must match `YYYY-MM-DD` exactly and name a real
 *   calendar day.
 * @returns Milliseconds since the Unix epoch for 00:00:00 UTC on that day.
 * @throws {Error} When the string is not shaped `YYYY-MM-DD`, or when it is
 *   well-shaped but not a real date (e.g. `2024-02-30`, `2024-13-01`).
 */
function isoDateToUtcMs(iso) {
  const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(iso);
  if (!m) throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
  const year = Number(m[1]);
  const monthIndex = Number(m[2]) - 1;
  const day = Number(m[3]);
  const ms = Date.UTC(year, monthIndex, day);
  // Date.UTC silently normalises out-of-range components (Feb 30 -> Mar 1,
  // month 13 -> January of the next year) and maps years 0-99 onto
  // 1900-1999. Round-trip the components so a bad date is rejected rather
  // than quietly becoming a different day.
  const roundTrip = new Date(ms);
  if (
    roundTrip.getUTCFullYear() !== year
    || roundTrip.getUTCMonth() !== monthIndex
    || roundTrip.getUTCDate() !== day
  ) {
    throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
  }
  return ms;
}
|
|
111
125
|
function utcDateMinusDays(at, days) {
|
|
112
126
|
const d = new Date(at - days * MS_PER_DAY);
|
|
113
127
|
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
@@ -343,13 +357,13 @@ const weeklyTotalsRollup = {
|
|
|
343
357
|
const topPages28dRollup = {
|
|
344
358
|
id: "top_pages_28d",
|
|
345
359
|
windowDays: 28,
|
|
346
|
-
async build({ engine, ctx,
|
|
347
|
-
const cutoff = utcDateMinusDays(
|
|
360
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
361
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
348
362
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
349
363
|
ctx,
|
|
350
364
|
table: "pages",
|
|
351
365
|
...searchType !== void 0 ? { searchType } : {}
|
|
352
|
-
}), cutoff, utcDateMinusDays(
|
|
366
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
353
367
|
if (partitions.length === 0) return [];
|
|
354
368
|
return (await engine.runSQL({
|
|
355
369
|
ctx,
|
|
@@ -382,13 +396,13 @@ const topPages28dRollup = {
|
|
|
382
396
|
const topCountries28dRollup = {
|
|
383
397
|
id: "top_countries_28d",
|
|
384
398
|
windowDays: 28,
|
|
385
|
-
async build({ engine, ctx,
|
|
386
|
-
const cutoff = utcDateMinusDays(
|
|
399
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
400
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
387
401
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
388
402
|
ctx,
|
|
389
403
|
table: "countries",
|
|
390
404
|
...searchType !== void 0 ? { searchType } : {}
|
|
391
|
-
}), cutoff, utcDateMinusDays(
|
|
405
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
392
406
|
if (partitions.length === 0) return [];
|
|
393
407
|
return (await engine.runSQL({
|
|
394
408
|
ctx,
|
|
@@ -421,13 +435,13 @@ const topCountries28dRollup = {
|
|
|
421
435
|
const topKeywords28dRollup = {
|
|
422
436
|
id: "top_keywords_28d",
|
|
423
437
|
windowDays: 28,
|
|
424
|
-
async build({ engine, ctx,
|
|
425
|
-
const cutoff = utcDateMinusDays(
|
|
438
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
439
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
426
440
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
427
441
|
ctx,
|
|
428
442
|
table: "keywords",
|
|
429
443
|
...searchType !== void 0 ? { searchType } : {}
|
|
430
|
-
}), cutoff, utcDateMinusDays(
|
|
444
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
431
445
|
if (partitions.length === 0) return [];
|
|
432
446
|
return (await engine.runSQL({
|
|
433
447
|
ctx,
|
|
@@ -484,13 +498,13 @@ const topKeywords28dParquetRollup = {
|
|
|
484
498
|
}
|
|
485
499
|
],
|
|
486
500
|
parquetSortKey: ["clicks"],
|
|
487
|
-
async build({ engine, ctx,
|
|
488
|
-
const cutoff = utcDateMinusDays(
|
|
501
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
502
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
489
503
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
490
504
|
ctx,
|
|
491
505
|
table: "keywords",
|
|
492
506
|
...searchType !== void 0 ? { searchType } : {}
|
|
493
|
-
}), cutoff, utcDateMinusDays(
|
|
507
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
494
508
|
if (partitions.length === 0) return [];
|
|
495
509
|
return (await engine.runSQL({
|
|
496
510
|
ctx,
|
|
@@ -568,7 +582,7 @@ const indexingHealthRollup = {
|
|
|
568
582
|
id: "indexing_health",
|
|
569
583
|
windowDays: 90,
|
|
570
584
|
sliceOrthogonal: true,
|
|
571
|
-
async build({ engine, ctx, dataSource,
|
|
585
|
+
async build({ engine, ctx, dataSource, windowAnchorMs }) {
|
|
572
586
|
const key = inspectionParquetKey(ctx);
|
|
573
587
|
if (!await dataSource.head?.(key)) return { days: [] };
|
|
574
588
|
const sql = `
|
|
@@ -583,7 +597,7 @@ const indexingHealthRollup = {
|
|
|
583
597
|
SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
|
|
584
598
|
SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
|
|
585
599
|
FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
|
|
586
|
-
WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(
|
|
600
|
+
WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(windowAnchorMs, 90)}'
|
|
587
601
|
GROUP BY 1
|
|
588
602
|
ORDER BY 1
|
|
589
603
|
`;
|
|
@@ -612,19 +626,19 @@ const indexPercentRollup = {
|
|
|
612
626
|
id: "index_percent",
|
|
613
627
|
windowDays: 90,
|
|
614
628
|
sliceOrthogonal: true,
|
|
615
|
-
async build({ engine, ctx, dataSource,
|
|
629
|
+
async build({ engine, ctx, dataSource, windowAnchorMs, searchType }) {
|
|
616
630
|
const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
|
|
617
631
|
if (urlsKeys.length === 0) return {
|
|
618
632
|
totalSitemapUrls: 0,
|
|
619
633
|
days: []
|
|
620
634
|
};
|
|
621
|
-
const cutoff = utcDateMinusDays(
|
|
635
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 90);
|
|
622
636
|
const factSearchType = searchType ?? "web";
|
|
623
637
|
const pagesPartitions = partitionsInRange(await engine.listPartitions({
|
|
624
638
|
ctx,
|
|
625
639
|
table: "pages",
|
|
626
640
|
searchType: factSearchType
|
|
627
|
-
}), cutoff, utcDateMinusDays(
|
|
641
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
628
642
|
const numerator = await engine.runSQL({
|
|
629
643
|
ctx,
|
|
630
644
|
table: "pages",
|
|
@@ -683,10 +697,10 @@ const sitemapHealthRollup = {
|
|
|
683
697
|
id: "sitemap_health",
|
|
684
698
|
windowDays: 90,
|
|
685
699
|
sliceOrthogonal: true,
|
|
686
|
-
async build({ dataSource, ctx,
|
|
700
|
+
async build({ dataSource, ctx, windowAnchorMs }) {
|
|
687
701
|
const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
|
|
688
702
|
const records = Object.values(index.records);
|
|
689
|
-
const cutoff = utcDateMinusDays(
|
|
703
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 90);
|
|
690
704
|
const byDay = /* @__PURE__ */ new Map();
|
|
691
705
|
const feeds = [];
|
|
692
706
|
for (const r of records) {
|
|
@@ -727,10 +741,10 @@ const sitemapChanges28dRollup = {
|
|
|
727
741
|
id: "sitemap_changes_28d",
|
|
728
742
|
windowDays: 28,
|
|
729
743
|
sliceOrthogonal: true,
|
|
730
|
-
async build({ dataSource, ctx,
|
|
744
|
+
async build({ dataSource, ctx, windowAnchorMs }) {
|
|
731
745
|
const store = createSitemapStore({ dataSource });
|
|
732
|
-
const from = utcDateMinusDays(
|
|
733
|
-
const to = utcDateMinusDays(
|
|
746
|
+
const from = utcDateMinusDays(windowAnchorMs, 28);
|
|
747
|
+
const to = utcDateMinusDays(windowAnchorMs, 0);
|
|
734
748
|
const counts = /* @__PURE__ */ new Map();
|
|
735
749
|
const addedTop = [];
|
|
736
750
|
const removedTop = [];
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.20.1",
|
|
4
|
+
"version": "0.20.3",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -169,8 +169,8 @@
|
|
|
169
169
|
"dependencies": {
|
|
170
170
|
"drizzle-orm": "^0.45.2",
|
|
171
171
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"gscdump": "0.20.
|
|
173
|
-
"@gscdump/contracts": "0.20.
|
|
172
|
+
"gscdump": "0.20.3",
|
|
173
|
+
"@gscdump/contracts": "0.20.3"
|
|
174
174
|
},
|
|
175
175
|
"devDependencies": {
|
|
176
176
|
"@duckdb/duckdb-wasm": "^1.32.0",
|