@gscdump/engine 0.20.2 → 0.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rollups.d.mts +16 -6
- package/dist/rollups.mjs +30 -23
- package/package.json +3 -3
package/dist/rollups.d.mts
CHANGED
|
@@ -90,13 +90,14 @@ interface RollupDef {
|
|
|
90
90
|
*/
|
|
91
91
|
dataSource: DataSource;
|
|
92
92
|
/**
|
|
93
|
-
*
|
|
94
|
-
*
|
|
95
|
-
*
|
|
96
|
-
*
|
|
97
|
-
*
|
|
93
|
+
* UTC millis the trailing window anchors to — its inclusive END. Equals
|
|
94
|
+
* the newest synced/finalized data date when the runner is given
|
|
95
|
+
* `dataEndDate`, otherwise wall-clock build time. Builders derive window
|
|
96
|
+
* cutoffs from this (e.g. the trailing-28d boundary) and inline a date
|
|
97
|
+
* literal so the SQL stays portable across DuckDB builds without the ICU
|
|
98
|
+
* extension (Workers DuckDB — `CURRENT_DATE` lives in ICU).
|
|
98
99
|
*/
|
|
99
|
-
|
|
100
|
+
windowAnchorMs: number;
|
|
100
101
|
/**
|
|
101
102
|
* GSC search-type slice the runner was invoked for. Builders forward
|
|
102
103
|
* this to every `engine.runSQL` call so the aggregated facts come
|
|
@@ -156,6 +157,15 @@ interface RebuildRollupsOptions {
|
|
|
156
157
|
* only tenants and explicit cross-type admin views.
|
|
157
158
|
*/
|
|
158
159
|
searchType?: SearchType;
|
|
160
|
+
/**
|
|
161
|
+
* ISO date (`YYYY-MM-DD`) of the newest synced/finalized day. Trailing-
|
|
162
|
+
* window rollups (28d/90d) anchor their window END here instead of
|
|
163
|
+
* wall-clock build time, so a "last 28 days" rollup covers the 28 days of
|
|
164
|
+
* data that actually exist — not 28 days back from whenever the job ran,
|
|
165
|
+
* which would include GSC's 2-3 day empty tail. Omit for the legacy
|
|
166
|
+
* wall-clock behaviour.
|
|
167
|
+
*/
|
|
168
|
+
dataEndDate?: string;
|
|
159
169
|
}
|
|
160
170
|
interface RebuildRollupResult {
|
|
161
171
|
id: string;
|
package/dist/rollups.mjs
CHANGED
|
@@ -41,16 +41,18 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
|
|
|
41
41
|
}
|
|
42
42
|
async function rebuildRollups(opts) {
|
|
43
43
|
const now = opts.now ?? (() => Date.now());
|
|
44
|
+
const dataEndMs = opts.dataEndDate !== void 0 ? isoDateToUtcMs(opts.dataEndDate) : null;
|
|
44
45
|
const results = [];
|
|
45
46
|
for (const def of opts.defs) {
|
|
46
47
|
const builtAt = now();
|
|
48
|
+
const windowAnchorMs = dataEndMs ?? builtAt;
|
|
47
49
|
const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
|
|
48
50
|
try {
|
|
49
51
|
const payload = await def.build({
|
|
50
52
|
engine: opts.engine,
|
|
51
53
|
ctx: opts.ctx,
|
|
52
54
|
dataSource: opts.dataSource,
|
|
53
|
-
|
|
55
|
+
windowAnchorMs,
|
|
54
56
|
...defSearchType !== void 0 ? { searchType: defSearchType } : {}
|
|
55
57
|
});
|
|
56
58
|
if (def.format === "parquet") {
|
|
@@ -115,6 +117,11 @@ async function rebuildRollups(opts) {
|
|
|
115
117
|
}
|
|
116
118
|
return results;
|
|
117
119
|
}
|
|
120
|
+
function isoDateToUtcMs(iso) {
|
|
121
|
+
const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(iso);
|
|
122
|
+
if (!m) throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
|
|
123
|
+
return Date.UTC(Number(m[1]), Number(m[2]) - 1, Number(m[3]));
|
|
124
|
+
}
|
|
118
125
|
function utcDateMinusDays(at, days) {
|
|
119
126
|
const d = new Date(at - days * MS_PER_DAY);
|
|
120
127
|
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
@@ -350,13 +357,13 @@ const weeklyTotalsRollup = {
|
|
|
350
357
|
const topPages28dRollup = {
|
|
351
358
|
id: "top_pages_28d",
|
|
352
359
|
windowDays: 28,
|
|
353
|
-
async build({ engine, ctx,
|
|
354
|
-
const cutoff = utcDateMinusDays(
|
|
360
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
361
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
355
362
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
356
363
|
ctx,
|
|
357
364
|
table: "pages",
|
|
358
365
|
...searchType !== void 0 ? { searchType } : {}
|
|
359
|
-
}), cutoff, utcDateMinusDays(
|
|
366
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
360
367
|
if (partitions.length === 0) return [];
|
|
361
368
|
return (await engine.runSQL({
|
|
362
369
|
ctx,
|
|
@@ -389,13 +396,13 @@ const topPages28dRollup = {
|
|
|
389
396
|
const topCountries28dRollup = {
|
|
390
397
|
id: "top_countries_28d",
|
|
391
398
|
windowDays: 28,
|
|
392
|
-
async build({ engine, ctx,
|
|
393
|
-
const cutoff = utcDateMinusDays(
|
|
399
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
400
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
394
401
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
395
402
|
ctx,
|
|
396
403
|
table: "countries",
|
|
397
404
|
...searchType !== void 0 ? { searchType } : {}
|
|
398
|
-
}), cutoff, utcDateMinusDays(
|
|
405
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
399
406
|
if (partitions.length === 0) return [];
|
|
400
407
|
return (await engine.runSQL({
|
|
401
408
|
ctx,
|
|
@@ -428,13 +435,13 @@ const topCountries28dRollup = {
|
|
|
428
435
|
const topKeywords28dRollup = {
|
|
429
436
|
id: "top_keywords_28d",
|
|
430
437
|
windowDays: 28,
|
|
431
|
-
async build({ engine, ctx,
|
|
432
|
-
const cutoff = utcDateMinusDays(
|
|
438
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
439
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
433
440
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
434
441
|
ctx,
|
|
435
442
|
table: "keywords",
|
|
436
443
|
...searchType !== void 0 ? { searchType } : {}
|
|
437
|
-
}), cutoff, utcDateMinusDays(
|
|
444
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
438
445
|
if (partitions.length === 0) return [];
|
|
439
446
|
return (await engine.runSQL({
|
|
440
447
|
ctx,
|
|
@@ -491,13 +498,13 @@ const topKeywords28dParquetRollup = {
|
|
|
491
498
|
}
|
|
492
499
|
],
|
|
493
500
|
parquetSortKey: ["clicks"],
|
|
494
|
-
async build({ engine, ctx,
|
|
495
|
-
const cutoff = utcDateMinusDays(
|
|
501
|
+
async build({ engine, ctx, windowAnchorMs, searchType }) {
|
|
502
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 28);
|
|
496
503
|
const partitions = partitionsInRange(await engine.listPartitions({
|
|
497
504
|
ctx,
|
|
498
505
|
table: "keywords",
|
|
499
506
|
...searchType !== void 0 ? { searchType } : {}
|
|
500
|
-
}), cutoff, utcDateMinusDays(
|
|
507
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
501
508
|
if (partitions.length === 0) return [];
|
|
502
509
|
return (await engine.runSQL({
|
|
503
510
|
ctx,
|
|
@@ -575,7 +582,7 @@ const indexingHealthRollup = {
|
|
|
575
582
|
id: "indexing_health",
|
|
576
583
|
windowDays: 90,
|
|
577
584
|
sliceOrthogonal: true,
|
|
578
|
-
async build({ engine, ctx, dataSource,
|
|
585
|
+
async build({ engine, ctx, dataSource, windowAnchorMs }) {
|
|
579
586
|
const key = inspectionParquetKey(ctx);
|
|
580
587
|
if (!await dataSource.head?.(key)) return { days: [] };
|
|
581
588
|
const sql = `
|
|
@@ -590,7 +597,7 @@ const indexingHealthRollup = {
|
|
|
590
597
|
SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
|
|
591
598
|
SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
|
|
592
599
|
FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
|
|
593
|
-
WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(
|
|
600
|
+
WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(windowAnchorMs, 90)}'
|
|
594
601
|
GROUP BY 1
|
|
595
602
|
ORDER BY 1
|
|
596
603
|
`;
|
|
@@ -619,19 +626,19 @@ const indexPercentRollup = {
|
|
|
619
626
|
id: "index_percent",
|
|
620
627
|
windowDays: 90,
|
|
621
628
|
sliceOrthogonal: true,
|
|
622
|
-
async build({ engine, ctx, dataSource,
|
|
629
|
+
async build({ engine, ctx, dataSource, windowAnchorMs, searchType }) {
|
|
623
630
|
const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
|
|
624
631
|
if (urlsKeys.length === 0) return {
|
|
625
632
|
totalSitemapUrls: 0,
|
|
626
633
|
days: []
|
|
627
634
|
};
|
|
628
|
-
const cutoff = utcDateMinusDays(
|
|
635
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 90);
|
|
629
636
|
const factSearchType = searchType ?? "web";
|
|
630
637
|
const pagesPartitions = partitionsInRange(await engine.listPartitions({
|
|
631
638
|
ctx,
|
|
632
639
|
table: "pages",
|
|
633
640
|
searchType: factSearchType
|
|
634
|
-
}), cutoff, utcDateMinusDays(
|
|
641
|
+
}), cutoff, utcDateMinusDays(windowAnchorMs, 0));
|
|
635
642
|
const numerator = await engine.runSQL({
|
|
636
643
|
ctx,
|
|
637
644
|
table: "pages",
|
|
@@ -690,10 +697,10 @@ const sitemapHealthRollup = {
|
|
|
690
697
|
id: "sitemap_health",
|
|
691
698
|
windowDays: 90,
|
|
692
699
|
sliceOrthogonal: true,
|
|
693
|
-
async build({ dataSource, ctx,
|
|
700
|
+
async build({ dataSource, ctx, windowAnchorMs }) {
|
|
694
701
|
const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
|
|
695
702
|
const records = Object.values(index.records);
|
|
696
|
-
const cutoff = utcDateMinusDays(
|
|
703
|
+
const cutoff = utcDateMinusDays(windowAnchorMs, 90);
|
|
697
704
|
const byDay = /* @__PURE__ */ new Map();
|
|
698
705
|
const feeds = [];
|
|
699
706
|
for (const r of records) {
|
|
@@ -734,10 +741,10 @@ const sitemapChanges28dRollup = {
|
|
|
734
741
|
id: "sitemap_changes_28d",
|
|
735
742
|
windowDays: 28,
|
|
736
743
|
sliceOrthogonal: true,
|
|
737
|
-
async build({ dataSource, ctx,
|
|
744
|
+
async build({ dataSource, ctx, windowAnchorMs }) {
|
|
738
745
|
const store = createSitemapStore({ dataSource });
|
|
739
|
-
const from = utcDateMinusDays(
|
|
740
|
-
const to = utcDateMinusDays(
|
|
746
|
+
const from = utcDateMinusDays(windowAnchorMs, 28);
|
|
747
|
+
const to = utcDateMinusDays(windowAnchorMs, 0);
|
|
741
748
|
const counts = /* @__PURE__ */ new Map();
|
|
742
749
|
const addedTop = [];
|
|
743
750
|
const removedTop = [];
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.20.2",
|
|
4
|
+
"version": "0.20.3",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -169,8 +169,8 @@
|
|
|
169
169
|
"dependencies": {
|
|
170
170
|
"drizzle-orm": "^0.45.2",
|
|
171
171
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"
|
|
173
|
-
"gscdump": "0.20.2"
|
|
172
|
+
"gscdump": "0.20.3",
|
|
173
|
+
"@gscdump/contracts": "0.20.3"
|
|
174
174
|
},
|
|
175
175
|
"devDependencies": {
|
|
176
176
|
"@duckdb/duckdb-wasm": "^1.32.0",
|