@gscdump/engine 0.20.2 → 0.20.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -90,13 +90,14 @@ interface RollupDef {
90
90
  */
91
91
  dataSource: DataSource;
92
92
  /**
93
- * Wall-clock millis when the runner started this rollup. Use for
94
- * derived window cutoffs (e.g. trailing-28d boundary) so the SQL can
95
- * inline a date literal and stay portable across DuckDB builds that
96
- * don't bundle the ICU extension (Workers DuckDB, for one CURRENT_DATE
97
- * lives in ICU).
93
+ * UTC millis the trailing window anchors to — its inclusive END. Equals
94
+ * the newest synced/finalized data date (at 00:00 UTC) when the runner is given
95
+ * `dataEndDate`, otherwise wall-clock build time. Builders derive window
96
+ * cutoffs from this (e.g. the trailing-28d boundary) and inline a date
97
+ * literal so the SQL stays portable across DuckDB builds without the ICU
98
+ * extension (Workers DuckDB — `CURRENT_DATE` lives in ICU).
98
99
  */
99
- builtAt: number;
100
+ windowAnchorMs: number;
100
101
  /**
101
102
  * GSC search-type slice the runner was invoked for. Builders forward
102
103
  * this to every `engine.runSQL` call so the aggregated facts come
@@ -156,6 +157,15 @@ interface RebuildRollupsOptions {
156
157
  * only tenants and explicit cross-type admin views.
157
158
  */
158
159
  searchType?: SearchType;
160
+ /**
161
+ * ISO date (`YYYY-MM-DD`) of the newest synced/finalized day. Trailing-
162
+ * window rollups (28d/90d) anchor their window END here instead of
163
+ * wall-clock build time, so a "last 28 days" rollup covers the 28 days of
164
+ * data that actually exist — not 28 days back from whenever the job ran,
165
+ * which would include GSC's 2-3 day empty tail. Omit for the legacy
166
+ * wall-clock behaviour.
167
+ */
168
+ dataEndDate?: string;
159
169
  }
160
170
  interface RebuildRollupResult {
161
171
  id: string;
package/dist/rollups.mjs CHANGED
@@ -41,16 +41,18 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
41
41
  }
42
42
  async function rebuildRollups(opts) {
43
43
  const now = opts.now ?? (() => Date.now());
44
+ const dataEndMs = opts.dataEndDate !== void 0 ? isoDateToUtcMs(opts.dataEndDate) : null;
44
45
  const results = [];
45
46
  for (const def of opts.defs) {
46
47
  const builtAt = now();
48
+ const windowAnchorMs = dataEndMs ?? builtAt;
47
49
  const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
48
50
  try {
49
51
  const payload = await def.build({
50
52
  engine: opts.engine,
51
53
  ctx: opts.ctx,
52
54
  dataSource: opts.dataSource,
53
- builtAt,
55
+ windowAnchorMs,
54
56
  ...defSearchType !== void 0 ? { searchType: defSearchType } : {}
55
57
  });
56
58
  if (def.format === "parquet") {
@@ -115,6 +117,11 @@ async function rebuildRollups(opts) {
115
117
  }
116
118
  return results;
117
119
  }
120
/**
 * Convert a strict `YYYY-MM-DD` string into the UTC epoch-millisecond
 * timestamp of that day's midnight.
 *
 * @param iso Calendar date in `YYYY-MM-DD` form.
 * @returns Milliseconds since the Unix epoch for 00:00:00.000 UTC on that date.
 * @throws {Error} If `iso` is not shaped like `YYYY-MM-DD`, or if it names a
 *   day that does not exist (e.g. `2024-02-30`, `2024-13-01`).
 */
function isoDateToUtcMs(iso) {
  const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(iso);
  if (!m) throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
  const year = Number(m[1]);
  const monthIndex = Number(m[2]) - 1;
  const day = Number(m[3]);
  const ms = Date.UTC(year, monthIndex, day);
  // Date.UTC normalises out-of-range fields instead of failing (Feb 30 becomes
  // Mar 1, month 13 rolls into the next year, years 0-99 are remapped to
  // 19xx). Round-trip the components so a bad date is rejected rather than
  // silently anchoring the window to a different day.
  const roundTrip = new Date(ms);
  if (
    roundTrip.getUTCFullYear() !== year
    || roundTrip.getUTCMonth() !== monthIndex
    || roundTrip.getUTCDate() !== day
  ) {
    throw new Error(`dataEndDate is not a valid calendar date, got: ${iso}`);
  }
  return ms;
}
118
125
  function utcDateMinusDays(at, days) {
119
126
  const d = new Date(at - days * MS_PER_DAY);
120
127
  return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
@@ -350,13 +357,13 @@ const weeklyTotalsRollup = {
350
357
  const topPages28dRollup = {
351
358
  id: "top_pages_28d",
352
359
  windowDays: 28,
353
- async build({ engine, ctx, builtAt, searchType }) {
354
- const cutoff = utcDateMinusDays(builtAt, 28);
360
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
361
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
355
362
  const partitions = partitionsInRange(await engine.listPartitions({
356
363
  ctx,
357
364
  table: "pages",
358
365
  ...searchType !== void 0 ? { searchType } : {}
359
- }), cutoff, utcDateMinusDays(builtAt, 0));
366
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
360
367
  if (partitions.length === 0) return [];
361
368
  return (await engine.runSQL({
362
369
  ctx,
@@ -389,13 +396,13 @@ const topPages28dRollup = {
389
396
  const topCountries28dRollup = {
390
397
  id: "top_countries_28d",
391
398
  windowDays: 28,
392
- async build({ engine, ctx, builtAt, searchType }) {
393
- const cutoff = utcDateMinusDays(builtAt, 28);
399
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
400
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
394
401
  const partitions = partitionsInRange(await engine.listPartitions({
395
402
  ctx,
396
403
  table: "countries",
397
404
  ...searchType !== void 0 ? { searchType } : {}
398
- }), cutoff, utcDateMinusDays(builtAt, 0));
405
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
399
406
  if (partitions.length === 0) return [];
400
407
  return (await engine.runSQL({
401
408
  ctx,
@@ -428,13 +435,13 @@ const topCountries28dRollup = {
428
435
  const topKeywords28dRollup = {
429
436
  id: "top_keywords_28d",
430
437
  windowDays: 28,
431
- async build({ engine, ctx, builtAt, searchType }) {
432
- const cutoff = utcDateMinusDays(builtAt, 28);
438
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
439
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
433
440
  const partitions = partitionsInRange(await engine.listPartitions({
434
441
  ctx,
435
442
  table: "keywords",
436
443
  ...searchType !== void 0 ? { searchType } : {}
437
- }), cutoff, utcDateMinusDays(builtAt, 0));
444
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
438
445
  if (partitions.length === 0) return [];
439
446
  return (await engine.runSQL({
440
447
  ctx,
@@ -491,13 +498,13 @@ const topKeywords28dParquetRollup = {
491
498
  }
492
499
  ],
493
500
  parquetSortKey: ["clicks"],
494
- async build({ engine, ctx, builtAt, searchType }) {
495
- const cutoff = utcDateMinusDays(builtAt, 28);
501
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
502
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
496
503
  const partitions = partitionsInRange(await engine.listPartitions({
497
504
  ctx,
498
505
  table: "keywords",
499
506
  ...searchType !== void 0 ? { searchType } : {}
500
- }), cutoff, utcDateMinusDays(builtAt, 0));
507
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
501
508
  if (partitions.length === 0) return [];
502
509
  return (await engine.runSQL({
503
510
  ctx,
@@ -575,7 +582,7 @@ const indexingHealthRollup = {
575
582
  id: "indexing_health",
576
583
  windowDays: 90,
577
584
  sliceOrthogonal: true,
578
- async build({ engine, ctx, dataSource, builtAt }) {
585
+ async build({ engine, ctx, dataSource, windowAnchorMs }) {
579
586
  const key = inspectionParquetKey(ctx);
580
587
  if (!await dataSource.head?.(key)) return { days: [] };
581
588
  const sql = `
@@ -590,7 +597,7 @@ const indexingHealthRollup = {
590
597
  SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
591
598
  SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
592
599
  FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
593
- WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
600
+ WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(windowAnchorMs, 90)}'
594
601
  GROUP BY 1
595
602
  ORDER BY 1
596
603
  `;
@@ -619,19 +626,19 @@ const indexPercentRollup = {
619
626
  id: "index_percent",
620
627
  windowDays: 90,
621
628
  sliceOrthogonal: true,
622
- async build({ engine, ctx, dataSource, builtAt, searchType }) {
629
+ async build({ engine, ctx, dataSource, windowAnchorMs, searchType }) {
623
630
  const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
624
631
  if (urlsKeys.length === 0) return {
625
632
  totalSitemapUrls: 0,
626
633
  days: []
627
634
  };
628
- const cutoff = utcDateMinusDays(builtAt, 90);
635
+ const cutoff = utcDateMinusDays(windowAnchorMs, 90);
629
636
  const factSearchType = searchType ?? "web";
630
637
  const pagesPartitions = partitionsInRange(await engine.listPartitions({
631
638
  ctx,
632
639
  table: "pages",
633
640
  searchType: factSearchType
634
- }), cutoff, utcDateMinusDays(builtAt, 0));
641
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
635
642
  const numerator = await engine.runSQL({
636
643
  ctx,
637
644
  table: "pages",
@@ -690,10 +697,10 @@ const sitemapHealthRollup = {
690
697
  id: "sitemap_health",
691
698
  windowDays: 90,
692
699
  sliceOrthogonal: true,
693
- async build({ dataSource, ctx, builtAt }) {
700
+ async build({ dataSource, ctx, windowAnchorMs }) {
694
701
  const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
695
702
  const records = Object.values(index.records);
696
- const cutoff = utcDateMinusDays(builtAt, 90);
703
+ const cutoff = utcDateMinusDays(windowAnchorMs, 90);
697
704
  const byDay = /* @__PURE__ */ new Map();
698
705
  const feeds = [];
699
706
  for (const r of records) {
@@ -734,10 +741,10 @@ const sitemapChanges28dRollup = {
734
741
  id: "sitemap_changes_28d",
735
742
  windowDays: 28,
736
743
  sliceOrthogonal: true,
737
- async build({ dataSource, ctx, builtAt }) {
744
+ async build({ dataSource, ctx, windowAnchorMs }) {
738
745
  const store = createSitemapStore({ dataSource });
739
- const from = utcDateMinusDays(builtAt, 28);
740
- const to = utcDateMinusDays(builtAt, 0);
746
+ const from = utcDateMinusDays(windowAnchorMs, 28);
747
+ const to = utcDateMinusDays(windowAnchorMs, 0);
741
748
  const counts = /* @__PURE__ */ new Map();
742
749
  const addedTop = [];
743
750
  const removedTop = [];
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.20.2",
4
+ "version": "0.20.3",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "@gscdump/contracts": "0.20.2",
173
- "gscdump": "0.20.2"
172
+ "gscdump": "0.20.3",
173
+ "@gscdump/contracts": "0.20.3"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",