@gscdump/engine 0.19.3 → 0.19.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,13 +17,23 @@ function assertSatisfies(analyzer, source) {
17
17
  if (missing.length > 0) throw new AnalyzerCapabilityError(analyzer.id, missing);
18
18
  }
19
19
  async function runAnalyzerFromSource(source, params, registry) {
20
- const analyzer = registry.resolveAnalyzer(params.type, sourceHas(source, "executeSql"));
20
+ let analyzer = registry.resolveAnalyzer(params.type, sourceHas(source, "executeSql"));
21
21
  if (!analyzer) throw new AnalyzerCapabilityError(params.type, ["executeSql"]);
22
22
  assertSatisfies(analyzer, source);
23
- const plan = analyzer.build(params, {
23
+ const buildCtx = {
24
24
  adapter: source.adapter,
25
25
  siteId: source.siteId
26
- });
26
+ };
27
+ let plan;
28
+ try {
29
+ plan = analyzer.build(params, buildCtx);
30
+ } catch (err) {
31
+ const rowsVariant = err?.name === "UnresolvableDatasetError" ? registry.getAnalyzerVariants(params.type)?.rows : void 0;
32
+ if (!rowsVariant) throw err;
33
+ assertSatisfies(rowsVariant, source);
34
+ analyzer = rowsVariant;
35
+ plan = rowsVariant.build(params, buildCtx);
36
+ }
27
37
  if (plan.kind === "rows") return runRowsPlanAgainstSource(source, analyzer, plan, params);
28
38
  return runSqlPlanAgainstSource(source, analyzer, plan, params);
29
39
  }
@@ -145,7 +145,7 @@ function inferTable(dimensions) {
145
145
  if (dims.has("country")) return "countries";
146
146
  if (dims.has("device")) return "devices";
147
147
  if (dims.has("searchAppearance")) return "search_appearance";
148
- return "devices";
148
+ return "pages";
149
149
  }
150
150
  function naturalKeyColumns(table) {
151
151
  return TABLE_METADATA[table].sortKey;
@@ -1,5 +1,6 @@
1
1
  import { C as Row, M as TableName, i as DataSource, m as ParquetCodec, t as CodecCtx } from "../_chunks/storage.mjs";
2
2
  import { t as ColumnDef } from "../_chunks/schema.mjs";
3
+ import { ParquetQueryFilter } from "hyparquet";
3
4
  declare function encodeRowsToParquet(table: TableName, rows: readonly Row[]): Uint8Array;
4
5
  interface EncodeFlexOptions {
5
6
  /** Columns defining the output schema + order. */
@@ -17,7 +18,18 @@ interface EncodeFlexOptions {
17
18
  * merges cleanly with fact-table reads.
18
19
  */
19
20
  declare function encodeRowsToParquetFlex(rows: readonly Row[], opts: EncodeFlexOptions): Uint8Array;
20
- declare function decodeParquetToRows(bytes: Uint8Array): Promise<Row[]>;
21
+ interface DecodeParquetOptions {
22
+ /**
23
+ * Row filter pushed down into the parquet reader. hyparquet evaluates this
24
+ * per row group — pruning groups whose column statistics can't match and
25
+ * materialising only matching rows — so a filtered decode of a large file
26
+ * holds at most one row group plus the matches in memory, never the whole
27
+ * file. Use this whenever the caller needs a sub-slice of a big parquet
28
+ * (e.g. one feedpath out of a site-wide sitemap-urls index).
29
+ */
30
+ filter?: ParquetQueryFilter;
31
+ }
32
+ declare function decodeParquetToRows(bytes: Uint8Array, opts?: DecodeParquetOptions): Promise<Row[]>;
21
33
  interface HyparquetCodecOptions {
22
34
  /**
23
35
  * Override `readRows`. Useful when reads should be delegated to a faster
@@ -27,4 +39,4 @@ interface HyparquetCodecOptions {
27
39
  readRows?: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row[]>;
28
40
  }
29
41
  declare function createHyparquetCodec(options?: HyparquetCodecOptions): ParquetCodec;
30
- export { EncodeFlexOptions, HyparquetCodecOptions, createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };
42
+ export { DecodeParquetOptions, EncodeFlexOptions, HyparquetCodecOptions, createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };
@@ -103,9 +103,12 @@ function asyncBufferFromBytes(bytes) {
103
103
  }
104
104
  };
105
105
  }
106
- async function decodeParquetToRows(bytes) {
106
+ async function decodeParquetToRows(bytes, opts = {}) {
107
107
  if (bytes.byteLength === 0) return [];
108
- return await parquetReadObjects({ file: asyncBufferFromBytes(bytes) });
108
+ return await parquetReadObjects({
109
+ file: asyncBufferFromBytes(bytes),
110
+ ...opts.filter ? { filter: opts.filter } : {}
111
+ });
109
112
  }
110
113
  function createHyparquetCodec(options = {}) {
111
114
  return {
package/dist/entities.mjs CHANGED
@@ -418,7 +418,7 @@ function createSitemapStore(opts) {
418
418
  const fpHash = hash(feedpath);
419
419
  const includeRemoved = opts?.includeRemoved ?? false;
420
420
  const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
421
- const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
421
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes, { filter: { feedpath_hash: { $eq: fpHash } } }) : [];
422
422
  const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
423
423
  const live = /* @__PURE__ */ new Map();
424
424
  const removedMap = /* @__PURE__ */ new Map();
@@ -27,6 +27,21 @@ interface RollupEngine {
27
27
  }) => Promise<{
28
28
  rows: import('@gscdump/engine/contracts').Row[];
29
29
  }>;
30
+ /**
31
+ * Read the live manifest for a (tenant, table[, searchType]) cohort —
32
+ * cheap, no parquet decode. Builders use this to chunk a full-history scan
33
+ * into byte-bounded windows so a single `runSQL` call never has to ship
34
+ * more than ~14MB of decoded rows across the Workers service-binding RPC
35
+ * (32MiB hard cap).
36
+ */
37
+ listPartitions: (opts: {
38
+ ctx: TenantCtx;
39
+ table: import('@gscdump/engine/contracts').TableName;
40
+ searchType?: SearchType;
41
+ }) => Promise<Array<{
42
+ partition: string;
43
+ bytes: number;
44
+ }>>;
30
45
  }
31
46
  /**
32
47
  * One rollup definition. Build runs SQL over the tenant's facts and/or reads
@@ -150,8 +165,64 @@ interface RebuildRollupResult {
150
165
  /** Parquet payload byte size when `format === 'parquet'`. */
151
166
  parquetBytes?: number;
152
167
  builtAt: number;
168
+ /**
169
+ * Set when this def's build/encode/write failed. The runner records the
170
+ * failure and continues with the remaining defs so one bad rollup never
171
+ * aborts the rest. Successful defs have no `error`.
172
+ */
173
+ error?: string;
153
174
  }
154
175
  declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRollupResult[]>;
176
+ /**
177
+ * Target decoded-bytes budget per window. Sits well under the 28MiB executor
178
+ * guard so headroom remains for SQL + result rows.
179
+ */
180
+ declare const WINDOW_BYTE_BUDGET: number;
181
+ /**
182
+ * UTC day-aligned [startMs, endMs] span a partition covers. Returns null for
183
+ * `hourly/` partitions and anything unrecognised — those are excluded from
184
+ * windowed planning.
185
+ */
186
+ declare function partitionDaySpan(partition: string): {
187
+ startMs: number;
188
+ endMs: number;
189
+ } | null;
190
+ /**
191
+ * Plan byte-bounded windows over a partition set. Each window names the
192
+ * partitions whose span intersects it; a coarse tier file can land in two
193
+ * windows, so every windowed SQL MUST also date-filter to the window bounds.
194
+ */
195
+ declare function planRollupWindows(parts: Array<{
196
+ partition: string;
197
+ bytes: number;
198
+ }>, clampRange?: {
199
+ start: string;
200
+ end: string;
201
+ }): Array<{
202
+ start: string;
203
+ end: string;
204
+ partitions: string[];
205
+ }>;
206
+ /** Partition strings whose span intersects the inclusive [start, end] date range. */
207
+ declare function partitionsInRange(parts: Array<{
208
+ partition: string;
209
+ bytes: number;
210
+ }>, start: string, end: string): string[];
211
+ /**
212
+ * Run a full-history aggregation in byte-bounded windows and concat the rows.
213
+ * Each window's SQL MUST date-filter to `[w.start, w.end]` (see `sqlFor`) so a
214
+ * tier file spanning a window boundary doesn't double-count calendar dates.
215
+ */
216
+ declare function runWindowed(opts: {
217
+ engine: RollupEngine;
218
+ ctx: TenantCtx;
219
+ table: import('@gscdump/engine/contracts').TableName;
220
+ searchType?: SearchType;
221
+ sqlFor: (w: {
222
+ start: string;
223
+ end: string;
224
+ }) => string;
225
+ }): Promise<Row$1[]>;
155
226
  /**
156
227
  * Daily totals across the full history. One row per (date, table) with
157
228
  * clicks + impressions + position. Powers sparklines and headline totals.
@@ -267,4 +338,4 @@ declare function rebuildDailyFromHourly(opts: RebuildDailyFromHourlyOptions): Pr
267
338
  rowsWritten: number;
268
339
  }>;
269
340
  declare const DEFAULT_ROLLUPS: readonly RollupDef[];
270
- export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
341
+ export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/dist/rollups.mjs CHANGED
@@ -34,69 +34,76 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
34
34
  async function rebuildRollups(opts) {
35
35
  const now = opts.now ?? (() => Date.now());
36
36
  const results = [];
37
- const searchType = opts.searchType;
38
- if (searchType !== void 0) {
39
- for (const def of opts.defs) if (def.sliceOrthogonal === true) throw new Error(`rollup def '${def.id}' is slice-orthogonal; do not pass searchType`);
40
- }
41
37
  for (const def of opts.defs) {
42
38
  const builtAt = now();
43
- const payload = await def.build({
44
- engine: opts.engine,
45
- ctx: opts.ctx,
46
- dataSource: opts.dataSource,
47
- builtAt,
48
- ...searchType !== void 0 ? { searchType } : {}
49
- });
50
- if (def.format === "parquet") {
51
- if (!def.parquetColumns || def.parquetColumns.length === 0) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
52
- const rows = payload ?? [];
53
- const parquetBytes = encodeRowsToParquetFlex(rows, {
54
- columns: def.parquetColumns,
55
- sortKey: def.parquetSortKey
39
+ const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
40
+ try {
41
+ const payload = await def.build({
42
+ engine: opts.engine,
43
+ ctx: opts.ctx,
44
+ dataSource: opts.dataSource,
45
+ builtAt,
46
+ ...defSearchType !== void 0 ? { searchType: defSearchType } : {}
56
47
  });
57
- const parquetKey = rollupParquetKey(opts.ctx, def.id, builtAt, searchType);
58
- await opts.dataSource.write(parquetKey, parquetBytes);
59
- const pointer = {
60
- parquetKey,
61
- rowCount: rows.length
62
- };
48
+ if (def.format === "parquet") {
49
+ if (!def.parquetColumns || def.parquetColumns.length === 0) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
50
+ const rows = payload ?? [];
51
+ const parquetBytes = encodeRowsToParquetFlex(rows, {
52
+ columns: def.parquetColumns,
53
+ sortKey: def.parquetSortKey
54
+ });
55
+ const parquetKey = rollupParquetKey(opts.ctx, def.id, builtAt, defSearchType);
56
+ await opts.dataSource.write(parquetKey, parquetBytes);
57
+ const pointer = {
58
+ parquetKey,
59
+ rowCount: rows.length
60
+ };
61
+ const envelope = {
62
+ version: 1,
63
+ id: def.id,
64
+ builtAt,
65
+ windowDays: def.windowDays,
66
+ payload: pointer
67
+ };
68
+ const envelopeBytes = new TextEncoder().encode(JSON.stringify(envelope));
69
+ const key = rollupKey(opts.ctx, def.id, builtAt, defSearchType);
70
+ await opts.dataSource.write(key, envelopeBytes);
71
+ results.push({
72
+ id: def.id,
73
+ objectKey: key,
74
+ parquetKey,
75
+ bytes: envelopeBytes.byteLength,
76
+ parquetBytes: parquetBytes.byteLength,
77
+ builtAt
78
+ });
79
+ continue;
80
+ }
63
81
  const envelope = {
64
82
  version: 1,
65
83
  id: def.id,
66
84
  builtAt,
67
85
  windowDays: def.windowDays,
68
- payload: pointer
86
+ payload
69
87
  };
70
- const envelopeBytes = new TextEncoder().encode(JSON.stringify(envelope));
71
- const key = rollupKey(opts.ctx, def.id, builtAt, searchType);
72
- await opts.dataSource.write(key, envelopeBytes);
88
+ const json = JSON.stringify(envelope);
89
+ const bytes = new TextEncoder().encode(json);
90
+ const key = rollupKey(opts.ctx, def.id, builtAt, defSearchType);
91
+ await opts.dataSource.write(key, bytes);
73
92
  results.push({
74
93
  id: def.id,
75
94
  objectKey: key,
76
- parquetKey,
77
- bytes: envelopeBytes.byteLength,
78
- parquetBytes: parquetBytes.byteLength,
95
+ bytes: bytes.byteLength,
79
96
  builtAt
80
97
  });
81
- continue;
98
+ } catch (err) {
99
+ results.push({
100
+ id: def.id,
101
+ objectKey: "",
102
+ bytes: 0,
103
+ builtAt,
104
+ error: err instanceof Error ? err.stack || err.message : String(err)
105
+ });
82
106
  }
83
- const envelope = {
84
- version: 1,
85
- id: def.id,
86
- builtAt,
87
- windowDays: def.windowDays,
88
- payload
89
- };
90
- const json = JSON.stringify(envelope);
91
- const bytes = new TextEncoder().encode(json);
92
- const key = rollupKey(opts.ctx, def.id, builtAt, searchType);
93
- await opts.dataSource.write(key, bytes);
94
- results.push({
95
- id: def.id,
96
- objectKey: key,
97
- bytes: bytes.byteLength,
98
- builtAt
99
- });
100
107
  }
101
108
  return results;
102
109
  }
@@ -104,42 +111,183 @@ function utcDateMinusDays(at, days) {
104
111
  const d = new Date(at - days * MS_PER_DAY);
105
112
  return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
106
113
  }
114
+ const WINDOW_BYTE_BUDGET = 14 * 1024 * 1024;
115
+ const DAY_RE = /^daily\/(\d{4})-(\d{2})-(\d{2})$/;
116
+ const WEEK_RE = /^weekly\/(\d{4})-(\d{2})-(\d{2})$/;
117
+ const MONTH_RE = /^monthly\/(\d{4})-(\d{2})$/;
118
+ const QUARTER_RE = /^quarterly\/(\d{4})-Q([1-4])$/;
119
+ function isoDate(ms) {
120
+ const d = new Date(ms);
121
+ return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
122
+ }
123
+ function partitionDaySpan(partition) {
124
+ const day = DAY_RE.exec(partition);
125
+ if (day) {
126
+ const ms = Date.UTC(Number(day[1]), Number(day[2]) - 1, Number(day[3]));
127
+ return {
128
+ startMs: ms,
129
+ endMs: ms
130
+ };
131
+ }
132
+ const week = WEEK_RE.exec(partition);
133
+ if (week) {
134
+ const ms = Date.UTC(Number(week[1]), Number(week[2]) - 1, Number(week[3]));
135
+ return {
136
+ startMs: ms,
137
+ endMs: ms + 6 * MS_PER_DAY
138
+ };
139
+ }
140
+ const month = MONTH_RE.exec(partition);
141
+ if (month) {
142
+ const y = Number(month[1]);
143
+ const m = Number(month[2]) - 1;
144
+ return {
145
+ startMs: Date.UTC(y, m, 1),
146
+ endMs: Date.UTC(y, m + 1, 1) - MS_PER_DAY
147
+ };
148
+ }
149
+ const quarter = QUARTER_RE.exec(partition);
150
+ if (quarter) {
151
+ const y = Number(quarter[1]);
152
+ const startMonth = (Number(quarter[2]) - 1) * 3;
153
+ return {
154
+ startMs: Date.UTC(y, startMonth, 1),
155
+ endMs: Date.UTC(y, startMonth + 3, 1) - MS_PER_DAY
156
+ };
157
+ }
158
+ return null;
159
+ }
160
+ function clamp(n, lo, hi) {
161
+ return Math.max(lo, Math.min(hi, n));
162
+ }
163
+ function planRollupWindows(parts, clampRange) {
164
+ const clampStartMs = clampRange ? Date.parse(`${clampRange.start}T00:00:00Z`) : void 0;
165
+ const clampEndMs = clampRange ? Date.parse(`${clampRange.end}T00:00:00Z`) : void 0;
166
+ const spans = [];
167
+ for (const p of parts) {
168
+ const span = partitionDaySpan(p.partition);
169
+ if (!span) continue;
170
+ if (clampStartMs !== void 0 && clampEndMs !== void 0) {
171
+ if (span.endMs < clampStartMs || span.startMs > clampEndMs) continue;
172
+ }
173
+ spans.push({
174
+ partition: p.partition,
175
+ bytes: p.bytes,
176
+ startMs: span.startMs,
177
+ endMs: span.endMs
178
+ });
179
+ }
180
+ if (spans.length === 0) return [];
181
+ let rangeStartMs = Math.min(...spans.map((s) => s.startMs));
182
+ let rangeEndMs = Math.max(...spans.map((s) => s.endMs));
183
+ if (clampStartMs !== void 0) rangeStartMs = Math.max(rangeStartMs, clampStartMs);
184
+ if (clampEndMs !== void 0) rangeEndMs = Math.min(rangeEndMs, clampEndMs);
185
+ const totalBytes = spans.reduce((a, s) => a + s.bytes, 0);
186
+ const spanDays = Math.floor((rangeEndMs - rangeStartMs) / MS_PER_DAY) + 1;
187
+ const bytesPerDay = Math.max(1, totalBytes / spanDays);
188
+ const windowDays = clamp(Math.floor(WINDOW_BYTE_BUDGET / bytesPerDay), 7, 400);
189
+ const windows = [];
190
+ let cursorMs = rangeStartMs;
191
+ while (cursorMs <= rangeEndMs) {
192
+ const windowEndMs = Math.min(cursorMs + (windowDays - 1) * MS_PER_DAY, rangeEndMs);
193
+ const partitions = spans.filter((s) => s.endMs >= cursorMs && s.startMs <= windowEndMs).map((s) => s.partition);
194
+ if (partitions.length > 0) windows.push({
195
+ start: isoDate(cursorMs),
196
+ end: isoDate(windowEndMs),
197
+ partitions
198
+ });
199
+ cursorMs = windowEndMs + MS_PER_DAY;
200
+ }
201
+ return windows;
202
+ }
203
+ function partitionsInRange(parts, start, end) {
204
+ const startMs = Date.parse(`${start}T00:00:00Z`);
205
+ const endMs = Date.parse(`${end}T00:00:00Z`);
206
+ const out = [];
207
+ for (const p of parts) {
208
+ const span = partitionDaySpan(p.partition);
209
+ if (!span) continue;
210
+ if (span.endMs >= startMs && span.startMs <= endMs) out.push(p.partition);
211
+ }
212
+ return out;
213
+ }
214
+ async function runWindowed(opts) {
215
+ const windows = planRollupWindows(await opts.engine.listPartitions({
216
+ ctx: opts.ctx,
217
+ table: opts.table,
218
+ ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
219
+ }));
220
+ const rows = [];
221
+ for (const w of windows) {
222
+ const result = await opts.engine.runSQL({
223
+ ctx: opts.ctx,
224
+ table: opts.table,
225
+ fileSets: { FILES: {
226
+ table: opts.table,
227
+ partitions: w.partitions
228
+ } },
229
+ sql: opts.sqlFor(w),
230
+ ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
231
+ });
232
+ rows.push(...result.rows);
233
+ }
234
+ return rows;
235
+ }
107
236
  const dailyTotalsRollup = {
108
237
  id: "daily_totals",
109
238
  windowDays: null,
110
239
  async build({ engine, ctx, searchType }) {
111
- const pages = await engine.runSQL({
240
+ const pageRows = await runWindowed({
241
+ engine,
112
242
  ctx,
113
243
  table: "pages",
114
- fileSets: { FILES: { table: "pages" } },
115
- sql: `
244
+ ...searchType !== void 0 ? { searchType } : {},
245
+ sqlFor: (w) => `
116
246
  SELECT
117
247
  date,
118
248
  SUM(clicks)::BIGINT AS clicks,
119
249
  SUM(impressions)::BIGINT AS impressions,
120
250
  SUM(sum_position)::DOUBLE AS sum_position
121
251
  FROM read_parquet({{FILES}}, union_by_name = true)
252
+ WHERE date >= '${w.start}' AND date <= '${w.end}'
122
253
  GROUP BY date
123
254
  ORDER BY date
124
- `,
125
- ...searchType !== void 0 ? { searchType } : {}
255
+ `
126
256
  });
127
- const keywords = await engine.runSQL({
257
+ const keywordRows = await runWindowed({
258
+ engine,
128
259
  ctx,
129
260
  table: "keywords",
130
- fileSets: { FILES: { table: "keywords" } },
131
- sql: `
261
+ ...searchType !== void 0 ? { searchType } : {},
262
+ sqlFor: (w) => `
132
263
  SELECT
133
264
  date,
134
265
  SUM(impressions)::BIGINT AS impressions
135
266
  FROM read_parquet({{FILES}}, union_by_name = true)
267
+ WHERE date >= '${w.start}' AND date <= '${w.end}'
136
268
  GROUP BY date
137
- `,
138
- ...searchType !== void 0 ? { searchType } : {}
269
+ `
139
270
  });
271
+ const pagesByDate = /* @__PURE__ */ new Map();
272
+ for (const r of pageRows) {
273
+ const date = String(r.date);
274
+ const cur = pagesByDate.get(date) ?? {
275
+ date,
276
+ clicks: BigInt(0),
277
+ impressions: BigInt(0),
278
+ sum_position: 0
279
+ };
280
+ cur.clicks += BigInt(r.clicks);
281
+ cur.impressions += BigInt(r.impressions);
282
+ cur.sum_position += Number(r.sum_position);
283
+ pagesByDate.set(date, cur);
284
+ }
140
285
  const keywordImpressionsByDate = /* @__PURE__ */ new Map();
141
- for (const r of keywords.rows) keywordImpressionsByDate.set(String(r.date), BigInt(r.impressions));
142
- return pages.rows.map((r) => {
286
+ for (const r of keywordRows) {
287
+ const date = String(r.date);
288
+ keywordImpressionsByDate.set(date, (keywordImpressionsByDate.get(date) ?? BigInt(0)) + BigInt(r.impressions));
289
+ }
290
+ return Array.from(pagesByDate.values()).sort((a, b) => a.date < b.date ? -1 : 1).map((r) => {
143
291
  const totalImpressions = BigInt(r.impressions);
144
292
  const queryImpressions = keywordImpressionsByDate.get(String(r.date)) ?? BigInt(0);
145
293
  const anonymized = totalImpressions === BigInt(0) ? 0 : 1 - Number(queryImpressions) / Number(totalImpressions);
@@ -157,27 +305,38 @@ const weeklyTotalsRollup = {
157
305
  id: "weekly_totals",
158
306
  windowDays: null,
159
307
  async build({ engine, ctx, searchType }) {
160
- return (await engine.runSQL({
308
+ const rows = await runWindowed({
309
+ engine,
161
310
  ctx,
162
311
  table: "pages",
163
- fileSets: { FILES: { table: "pages" } },
164
312
  ...searchType !== void 0 ? { searchType } : {},
165
- sql: `
313
+ sqlFor: (w) => `
166
314
  SELECT
167
315
  strftime(date_trunc('week', date::DATE), '%Y-%m-%d') AS week,
168
316
  SUM(clicks)::BIGINT AS clicks,
169
317
  SUM(impressions)::BIGINT AS impressions,
170
318
  SUM(sum_position)::DOUBLE AS sum_position
171
319
  FROM read_parquet({{FILES}}, union_by_name = true)
320
+ WHERE date >= '${w.start}' AND date <= '${w.end}'
172
321
  GROUP BY 1
173
322
  ORDER BY 1
174
323
  `
175
- })).rows.map((r) => ({
176
- week: r.week,
177
- clicks: Number(r.clicks),
178
- impressions: Number(r.impressions),
179
- sum_position: Number(r.sum_position)
180
- }));
324
+ });
325
+ const byWeek = /* @__PURE__ */ new Map();
326
+ for (const r of rows) {
327
+ const week = String(r.week);
328
+ const cur = byWeek.get(week) ?? {
329
+ week,
330
+ clicks: 0,
331
+ impressions: 0,
332
+ sum_position: 0
333
+ };
334
+ cur.clicks += Number(r.clicks);
335
+ cur.impressions += Number(r.impressions);
336
+ cur.sum_position += Number(r.sum_position);
337
+ byWeek.set(week, cur);
338
+ }
339
+ return Array.from(byWeek.values()).sort((a, b) => a.week < b.week ? -1 : 1);
181
340
  }
182
341
  };
183
342
  const topPages28dRollup = {
@@ -185,10 +344,19 @@ const topPages28dRollup = {
185
344
  windowDays: 28,
186
345
  async build({ engine, ctx, builtAt, searchType }) {
187
346
  const cutoff = utcDateMinusDays(builtAt, 28);
347
+ const partitions = partitionsInRange(await engine.listPartitions({
348
+ ctx,
349
+ table: "pages",
350
+ ...searchType !== void 0 ? { searchType } : {}
351
+ }), cutoff, utcDateMinusDays(builtAt, 0));
352
+ if (partitions.length === 0) return [];
188
353
  return (await engine.runSQL({
189
354
  ctx,
190
355
  table: "pages",
191
- fileSets: { FILES: { table: "pages" } },
356
+ fileSets: { FILES: {
357
+ table: "pages",
358
+ partitions
359
+ } },
192
360
  ...searchType !== void 0 ? { searchType } : {},
193
361
  sql: `
194
362
  SELECT
@@ -215,10 +383,19 @@ const topCountries28dRollup = {
215
383
  windowDays: 28,
216
384
  async build({ engine, ctx, builtAt, searchType }) {
217
385
  const cutoff = utcDateMinusDays(builtAt, 28);
386
+ const partitions = partitionsInRange(await engine.listPartitions({
387
+ ctx,
388
+ table: "countries",
389
+ ...searchType !== void 0 ? { searchType } : {}
390
+ }), cutoff, utcDateMinusDays(builtAt, 0));
391
+ if (partitions.length === 0) return [];
218
392
  return (await engine.runSQL({
219
393
  ctx,
220
394
  table: "countries",
221
- fileSets: { FILES: { table: "countries" } },
395
+ fileSets: { FILES: {
396
+ table: "countries",
397
+ partitions
398
+ } },
222
399
  ...searchType !== void 0 ? { searchType } : {},
223
400
  sql: `
224
401
  SELECT
@@ -245,10 +422,19 @@ const topKeywords28dRollup = {
245
422
  windowDays: 28,
246
423
  async build({ engine, ctx, builtAt, searchType }) {
247
424
  const cutoff = utcDateMinusDays(builtAt, 28);
425
+ const partitions = partitionsInRange(await engine.listPartitions({
426
+ ctx,
427
+ table: "keywords",
428
+ ...searchType !== void 0 ? { searchType } : {}
429
+ }), cutoff, utcDateMinusDays(builtAt, 0));
430
+ if (partitions.length === 0) return [];
248
431
  return (await engine.runSQL({
249
432
  ctx,
250
433
  table: "keywords",
251
- fileSets: { FILES: { table: "keywords" } },
434
+ fileSets: { FILES: {
435
+ table: "keywords",
436
+ partitions
437
+ } },
252
438
  ...searchType !== void 0 ? { searchType } : {},
253
439
  sql: `
254
440
  SELECT
@@ -299,10 +485,19 @@ const topKeywords28dParquetRollup = {
299
485
  parquetSortKey: ["clicks"],
300
486
  async build({ engine, ctx, builtAt, searchType }) {
301
487
  const cutoff = utcDateMinusDays(builtAt, 28);
488
+ const partitions = partitionsInRange(await engine.listPartitions({
489
+ ctx,
490
+ table: "keywords",
491
+ ...searchType !== void 0 ? { searchType } : {}
492
+ }), cutoff, utcDateMinusDays(builtAt, 0));
493
+ if (partitions.length === 0) return [];
302
494
  return (await engine.runSQL({
303
495
  ctx,
304
496
  table: "keywords",
305
- fileSets: { FILES: { table: "keywords" } },
497
+ fileSets: { FILES: {
498
+ table: "keywords",
499
+ partitions
500
+ } },
306
501
  ...searchType !== void 0 ? { searchType } : {},
307
502
  sql: `
308
503
  SELECT
@@ -423,11 +618,19 @@ const indexPercentRollup = {
423
618
  days: []
424
619
  };
425
620
  const cutoff = utcDateMinusDays(builtAt, 90);
621
+ const pagesPartitions = partitionsInRange(await engine.listPartitions({
622
+ ctx,
623
+ table: "pages",
624
+ ...searchType !== void 0 ? { searchType } : {}
625
+ }), cutoff, utcDateMinusDays(builtAt, 0));
426
626
  const numerator = await engine.runSQL({
427
627
  ctx,
428
628
  table: "pages",
429
629
  fileSets: {
430
- PAGES: { table: "pages" },
630
+ PAGES: {
631
+ table: "pages",
632
+ partitions: pagesPartitions
633
+ },
431
634
  URLS: {
432
635
  table: "pages",
433
636
  keys: [urlsKey]
@@ -626,4 +829,4 @@ const DEFAULT_ROLLUPS = [
626
829
  sitemapHealthRollup,
627
830
  sitemapChanges28dRollup
628
831
  ];
629
- export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
832
+ export { DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.19.3",
4
+ "version": "0.19.6",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "@gscdump/contracts": "0.19.3",
173
- "gscdump": "0.19.3"
172
+ "gscdump": "0.19.6",
173
+ "@gscdump/contracts": "0.19.6"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",