@gscdump/engine 0.20.1 → 0.20.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -90,13 +90,14 @@ interface RollupDef {
90
90
  */
91
91
  dataSource: DataSource;
92
92
  /**
93
- * Wall-clock millis when the runner started this rollup. Use for
94
- * derived window cutoffs (e.g. trailing-28d boundary) so the SQL can
95
- * inline a date literal and stay portable across DuckDB builds that
96
- * don't bundle the ICU extension (Workers DuckDB, for one CURRENT_DATE
97
- * lives in ICU).
93
+ * UTC millis the trailing window anchors to — its inclusive END. Equals
94
+ * the newest synced/finalized data date when the runner is given
95
+ * `dataEndDate`, otherwise wall-clock build time. Builders derive window
96
+ * cutoffs from this (e.g. the trailing-28d boundary) and inline a date
97
+ * literal so the SQL stays portable across DuckDB builds without the ICU
98
+ * extension (Workers DuckDB — `CURRENT_DATE` lives in ICU).
98
99
  */
99
- builtAt: number;
100
+ windowAnchorMs: number;
100
101
  /**
101
102
  * GSC search-type slice the runner was invoked for. Builders forward
102
103
  * this to every `engine.runSQL` call so the aggregated facts come
@@ -127,10 +128,13 @@ declare function rollupParquetKey(ctx: TenantCtx, id: string, builtAt: number, s
127
128
  interface RollupBucket {
128
129
  list: (opts: {
129
130
  prefix: string;
131
+ cursor?: string;
130
132
  }) => Promise<{
131
133
  objects: Array<{
132
134
  key: string;
133
135
  }>;
136
+ truncated?: boolean;
137
+ cursor?: string;
134
138
  }>;
135
139
  get: (key: string) => Promise<{
136
140
  text: () => Promise<string>;
@@ -153,6 +157,15 @@ interface RebuildRollupsOptions {
153
157
  * only tenants and explicit cross-type admin views.
154
158
  */
155
159
  searchType?: SearchType;
160
+ /**
161
+ * ISO date (`YYYY-MM-DD`) of the newest synced/finalized day. Trailing-
162
+ * window rollups (28d/90d) anchor their window END here instead of
163
+ * wall-clock build time, so a "last 28 days" rollup covers the 28 days of
164
+ * data that actually exist — not 28 days back from whenever the job ran,
165
+ * which would include GSC's 2-3 day empty tail. Omit for the legacy
166
+ * wall-clock behaviour.
167
+ */
168
+ dataEndDate?: string;
156
169
  }
157
170
  interface RebuildRollupResult {
158
171
  id: string;
package/dist/rollups.mjs CHANGED
@@ -15,18 +15,25 @@ function rollupParquetKey(ctx, id, builtAt, searchType) {
15
15
  const ROLLUP_FILE_RE = /^(?<id>[a-z0-9_]+)__v(?<ts>\d+)\.json$/;
16
16
  async function readLatestRollup(bucket, ctx, id, searchType) {
17
17
  const prefix = `${rollupPrefix(ctx, searchType)}/`;
18
- const listing = await bucket.list({ prefix }).catch(() => null);
19
- if (!listing) return null;
20
18
  let newest = null;
21
- for (const obj of listing.objects) {
22
- const m = ROLLUP_FILE_RE.exec(obj.key.slice(prefix.length));
23
- if (!m?.groups || m.groups.id !== id) continue;
24
- const ts = Number(m.groups.ts);
25
- if (!newest || ts > newest.ts) newest = {
26
- ts,
27
- key: obj.key
28
- };
29
- }
19
+ let cursor;
20
+ do {
21
+ const listing = await bucket.list({
22
+ prefix,
23
+ cursor
24
+ }).catch(() => null);
25
+ if (!listing) return null;
26
+ for (const obj of listing.objects) {
27
+ const m = ROLLUP_FILE_RE.exec(obj.key.slice(prefix.length));
28
+ if (!m?.groups || m.groups.id !== id) continue;
29
+ const ts = Number(m.groups.ts);
30
+ if (!newest || ts > newest.ts) newest = {
31
+ ts,
32
+ key: obj.key
33
+ };
34
+ }
35
+ cursor = listing.truncated ? listing.cursor : void 0;
36
+ } while (cursor !== void 0);
30
37
  if (!newest) return null;
31
38
  const obj = await bucket.get(newest.key).catch(() => null);
32
39
  if (!obj) return null;
@@ -34,16 +41,18 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
34
41
  }
35
42
  async function rebuildRollups(opts) {
36
43
  const now = opts.now ?? (() => Date.now());
44
+ const dataEndMs = opts.dataEndDate !== void 0 ? isoDateToUtcMs(opts.dataEndDate) : null;
37
45
  const results = [];
38
46
  for (const def of opts.defs) {
39
47
  const builtAt = now();
48
+ const windowAnchorMs = dataEndMs ?? builtAt;
40
49
  const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
41
50
  try {
42
51
  const payload = await def.build({
43
52
  engine: opts.engine,
44
53
  ctx: opts.ctx,
45
54
  dataSource: opts.dataSource,
46
- builtAt,
55
+ windowAnchorMs,
47
56
  ...defSearchType !== void 0 ? { searchType: defSearchType } : {}
48
57
  });
49
58
  if (def.format === "parquet") {
@@ -108,6 +117,11 @@ async function rebuildRollups(opts) {
108
117
  }
109
118
  return results;
110
119
  }
120
/**
 * Convert an ISO calendar date (`YYYY-MM-DD`) to the UTC epoch-millisecond
 * timestamp of that day's midnight.
 *
 * The input must both match the `YYYY-MM-DD` shape and name a real calendar
 * day. `Date.UTC` silently normalises out-of-range components (for example
 * `2024-02-31` becomes 2 March 2024, and two-digit years such as `0050` are
 * remapped to 1950), so the parsed components are checked against the
 * resulting date and any mismatch is rejected instead of shifting the value.
 *
 * @param {string} iso Calendar date in `YYYY-MM-DD` form.
 * @returns {number} Milliseconds since the Unix epoch for 00:00:00 UTC on that date.
 * @throws {Error} If `iso` is not in `YYYY-MM-DD` form or is not a real calendar date.
 */
function isoDateToUtcMs(iso) {
  const m = /^(\d{4})-(\d{2})-(\d{2})$/.exec(iso);
  if (!m) throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got: ${iso}`);
  const year = Number(m[1]);
  const monthIndex = Number(m[2]) - 1;
  const day = Number(m[3]);
  const ms = Date.UTC(year, monthIndex, day);
  // Round-trip check: if Date.UTC had to roll any component over, the input
  // was not a real calendar date.
  const roundTrip = new Date(ms);
  if (roundTrip.getUTCFullYear() !== year || roundTrip.getUTCMonth() !== monthIndex || roundTrip.getUTCDate() !== day) {
    throw new Error(`dataEndDate must be ISO YYYY-MM-DD, got invalid calendar date: ${iso}`);
  }
  return ms;
}
111
125
  function utcDateMinusDays(at, days) {
112
126
  const d = new Date(at - days * MS_PER_DAY);
113
127
  return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
@@ -343,13 +357,13 @@ const weeklyTotalsRollup = {
343
357
  const topPages28dRollup = {
344
358
  id: "top_pages_28d",
345
359
  windowDays: 28,
346
- async build({ engine, ctx, builtAt, searchType }) {
347
- const cutoff = utcDateMinusDays(builtAt, 28);
360
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
361
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
348
362
  const partitions = partitionsInRange(await engine.listPartitions({
349
363
  ctx,
350
364
  table: "pages",
351
365
  ...searchType !== void 0 ? { searchType } : {}
352
- }), cutoff, utcDateMinusDays(builtAt, 0));
366
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
353
367
  if (partitions.length === 0) return [];
354
368
  return (await engine.runSQL({
355
369
  ctx,
@@ -382,13 +396,13 @@ const topPages28dRollup = {
382
396
  const topCountries28dRollup = {
383
397
  id: "top_countries_28d",
384
398
  windowDays: 28,
385
- async build({ engine, ctx, builtAt, searchType }) {
386
- const cutoff = utcDateMinusDays(builtAt, 28);
399
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
400
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
387
401
  const partitions = partitionsInRange(await engine.listPartitions({
388
402
  ctx,
389
403
  table: "countries",
390
404
  ...searchType !== void 0 ? { searchType } : {}
391
- }), cutoff, utcDateMinusDays(builtAt, 0));
405
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
392
406
  if (partitions.length === 0) return [];
393
407
  return (await engine.runSQL({
394
408
  ctx,
@@ -421,13 +435,13 @@ const topCountries28dRollup = {
421
435
  const topKeywords28dRollup = {
422
436
  id: "top_keywords_28d",
423
437
  windowDays: 28,
424
- async build({ engine, ctx, builtAt, searchType }) {
425
- const cutoff = utcDateMinusDays(builtAt, 28);
438
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
439
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
426
440
  const partitions = partitionsInRange(await engine.listPartitions({
427
441
  ctx,
428
442
  table: "keywords",
429
443
  ...searchType !== void 0 ? { searchType } : {}
430
- }), cutoff, utcDateMinusDays(builtAt, 0));
444
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
431
445
  if (partitions.length === 0) return [];
432
446
  return (await engine.runSQL({
433
447
  ctx,
@@ -484,13 +498,13 @@ const topKeywords28dParquetRollup = {
484
498
  }
485
499
  ],
486
500
  parquetSortKey: ["clicks"],
487
- async build({ engine, ctx, builtAt, searchType }) {
488
- const cutoff = utcDateMinusDays(builtAt, 28);
501
+ async build({ engine, ctx, windowAnchorMs, searchType }) {
502
+ const cutoff = utcDateMinusDays(windowAnchorMs, 28);
489
503
  const partitions = partitionsInRange(await engine.listPartitions({
490
504
  ctx,
491
505
  table: "keywords",
492
506
  ...searchType !== void 0 ? { searchType } : {}
493
- }), cutoff, utcDateMinusDays(builtAt, 0));
507
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
494
508
  if (partitions.length === 0) return [];
495
509
  return (await engine.runSQL({
496
510
  ctx,
@@ -568,7 +582,7 @@ const indexingHealthRollup = {
568
582
  id: "indexing_health",
569
583
  windowDays: 90,
570
584
  sliceOrthogonal: true,
571
- async build({ engine, ctx, dataSource, builtAt }) {
585
+ async build({ engine, ctx, dataSource, windowAnchorMs }) {
572
586
  const key = inspectionParquetKey(ctx);
573
587
  if (!await dataSource.head?.(key)) return { days: [] };
574
588
  const sql = `
@@ -583,7 +597,7 @@ const indexingHealthRollup = {
583
597
  SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
584
598
  SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
585
599
  FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
586
- WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
600
+ WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(windowAnchorMs, 90)}'
587
601
  GROUP BY 1
588
602
  ORDER BY 1
589
603
  `;
@@ -612,19 +626,19 @@ const indexPercentRollup = {
612
626
  id: "index_percent",
613
627
  windowDays: 90,
614
628
  sliceOrthogonal: true,
615
- async build({ engine, ctx, dataSource, builtAt, searchType }) {
629
+ async build({ engine, ctx, dataSource, windowAnchorMs, searchType }) {
616
630
  const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
617
631
  if (urlsKeys.length === 0) return {
618
632
  totalSitemapUrls: 0,
619
633
  days: []
620
634
  };
621
- const cutoff = utcDateMinusDays(builtAt, 90);
635
+ const cutoff = utcDateMinusDays(windowAnchorMs, 90);
622
636
  const factSearchType = searchType ?? "web";
623
637
  const pagesPartitions = partitionsInRange(await engine.listPartitions({
624
638
  ctx,
625
639
  table: "pages",
626
640
  searchType: factSearchType
627
- }), cutoff, utcDateMinusDays(builtAt, 0));
641
+ }), cutoff, utcDateMinusDays(windowAnchorMs, 0));
628
642
  const numerator = await engine.runSQL({
629
643
  ctx,
630
644
  table: "pages",
@@ -683,10 +697,10 @@ const sitemapHealthRollup = {
683
697
  id: "sitemap_health",
684
698
  windowDays: 90,
685
699
  sliceOrthogonal: true,
686
- async build({ dataSource, ctx, builtAt }) {
700
+ async build({ dataSource, ctx, windowAnchorMs }) {
687
701
  const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
688
702
  const records = Object.values(index.records);
689
- const cutoff = utcDateMinusDays(builtAt, 90);
703
+ const cutoff = utcDateMinusDays(windowAnchorMs, 90);
690
704
  const byDay = /* @__PURE__ */ new Map();
691
705
  const feeds = [];
692
706
  for (const r of records) {
@@ -727,10 +741,10 @@ const sitemapChanges28dRollup = {
727
741
  id: "sitemap_changes_28d",
728
742
  windowDays: 28,
729
743
  sliceOrthogonal: true,
730
- async build({ dataSource, ctx, builtAt }) {
744
+ async build({ dataSource, ctx, windowAnchorMs }) {
731
745
  const store = createSitemapStore({ dataSource });
732
- const from = utcDateMinusDays(builtAt, 28);
733
- const to = utcDateMinusDays(builtAt, 0);
746
+ const from = utcDateMinusDays(windowAnchorMs, 28);
747
+ const to = utcDateMinusDays(windowAnchorMs, 0);
734
748
  const counts = /* @__PURE__ */ new Map();
735
749
  const addedTop = [];
736
750
  const removedTop = [];
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.20.1",
4
+ "version": "0.20.3",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.20.1",
173
- "@gscdump/contracts": "0.20.1"
172
+ "gscdump": "0.20.3",
173
+ "@gscdump/contracts": "0.20.3"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",