@gscdump/engine 0.31.3 → 0.31.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { DataSource, FileSetRef, Row as Row$1 } from "./_chunks/storage.mjs";
1
+ import { DataSource, FileSetRef, Row as Row$1, TableName as TableName$1 } from "./_chunks/storage.mjs";
2
2
  import { ColumnDef as ColumnDef$1 } from "./_chunks/schema.mjs";
3
3
  import { EngineError } from "./_chunks/errors.mjs";
4
4
  import { SearchType } from "gscdump/query";
@@ -200,6 +200,26 @@ declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRol
200
200
  * production and raise if headroom allows.
201
201
  */
202
202
  declare const WINDOW_BYTE_BUDGET: number;
203
+ /**
204
+ * Per-page OUTPUT row cap for key-paginated rollups (`runWindowed({ paginate })`
205
+ * and `runPagedQuery`). `planRollupWindows` bounds the *input* parquet bytes a
206
+ * window scans, which is a fine proxy for output size on fact aggregations whose
207
+ * grain matches the input (one output row per input date). It is NOT a proxy for
208
+ * aggregations that COLLAPSE to a smaller-cardinality grain whose row count is
209
+ * driven by a high-cardinality GROUP key — `(query_canonical × date)` and
210
+ * `(query_canonical)` — where output rows scale with distinct canonicals, not
211
+ * input bytes. For those, each `runSQL` result (shipped as an Arrow IPC stream
212
+ * over the Workers service-binding RPC; 28MiB guard in `@gscdump/cloudflare`,
213
+ * duckdb-worker `assertResultBudget` at 24MiB / 100k rows) must be bounded by
214
+ * paging the OUTPUT, independent of how the input is windowed.
215
+ *
216
+ * Narrow rows — `(canonical, date, 3 metrics)` — page at 50k (≈16MiB at the
217
+ * worker's `cols×64` heuristic, well under both guards). WIDE rows carry a
218
+ * `GROUP_CONCAT` variants string (up to ~10 variants × ~60 chars) the heuristic
219
+ * under-counts, so they page smaller to keep the real IPC payload bounded.
220
+ */
221
+ declare const ROLLUP_PAGE_ROWS = 50000;
222
+ declare const ROLLUP_PAGE_ROWS_WIDE = 20000;
203
223
  /**
204
224
  * UTC day-aligned [startMs, endMs] span a partition covers. Returns null for
205
225
  * `hourly/` partitions and anything unrecognised — those are excluded from
@@ -234,11 +254,18 @@ declare function partitionsInRange(parts: Array<{
234
254
  * Run a full-history aggregation in byte-bounded windows and concat the rows.
235
255
  * Each window's SQL MUST date-filter to `[w.start, w.end]` (see `sqlFor`) so a
236
256
  * tier file spanning a window boundary doesn't double-count calendar dates.
257
+ *
258
+ * `paginate` additionally pages each window's OUTPUT (see `runPagedQuery`) so a
259
+ * window whose GROUP cardinality is high — `(query_canonical × date)` on a large
260
+ * site — can't ship an oversized result even though its input bytes fit a window.
261
+ * Date-windowing bounds the per-query scan; output paging bounds the IPC payload.
262
+ * The two are orthogonal and compose. When `paginate` is set, `sqlFor` MUST emit
263
+ * no trailing `ORDER BY`/`LIMIT` and `paginate.orderBy` MUST be a total order.
237
264
  */
238
265
  declare function runWindowed(opts: {
239
266
  engine: RollupEngine;
240
267
  ctx: TenantCtx;
241
- table: import('@gscdump/engine/contracts').TableName;
268
+ table: TableName$1;
242
269
  searchType?: SearchType;
243
270
  sqlFor: (w: {
244
271
  start: string;
@@ -249,7 +276,11 @@ declare function runWindowed(opts: {
249
276
  * windowed `FILES`). Use to JOIN a non-windowed sidecar (e.g. the query
250
277
  * dimension parquet via `{ QUERY_DIM: { keys: [...] } }`) inside `sqlFor`.
251
278
  */
252
- extraFileSets?: Record<string, FileSetRef>;
279
+ extraFileSets?: Record<string, FileSetRef>; /** Page each window's output by a total-order key. See `runPagedQuery`. */
280
+ paginate?: {
281
+ orderBy: string;
282
+ pageRows: number;
283
+ };
253
284
  }): Promise<Row$1[]>;
254
285
  /**
255
286
  * Daily totals across the full history. One row per (date, table) with
@@ -415,4 +446,4 @@ declare const DEFAULT_ROLLUPS: readonly RollupDef[];
415
446
  * (CLI: `gscdump rollups --with-canonical`).
416
447
  */
417
448
  declare const CANONICAL_ROLLUPS: readonly RollupDef[];
418
- export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
449
+ export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, ROLLUP_PAGE_ROWS, ROLLUP_PAGE_ROWS_WIDE, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/dist/rollups.mjs CHANGED
@@ -127,6 +127,8 @@ function utcDateMinusDays(at, days) {
127
127
  return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
128
128
  }
129
129
  const WINDOW_BYTE_BUDGET = 10 * 1024 * 1024;
130
+ const ROLLUP_PAGE_ROWS = 5e4;
131
+ const ROLLUP_PAGE_ROWS_WIDE = 2e4;
130
132
  const DAY_RE = /^daily\/(\d{4})-(\d{2})-(\d{2})$/;
131
133
  const WEEK_RE = /^weekly\/(\d{4})-(\d{2})-(\d{2})$/;
132
134
  const MONTH_RE = /^monthly\/(\d{4})-(\d{2})$/;
@@ -226,6 +228,21 @@ function partitionsInRange(parts, start, end) {
226
228
  }
227
229
  return out;
228
230
  }
231
+ async function runPagedQuery(opts) {
232
+ const out = [];
233
+ for (let offset = 0;; offset += opts.pageRows) {
234
+ const result = await opts.engine.runSQL({
235
+ ctx: opts.ctx,
236
+ table: opts.table,
237
+ fileSets: opts.fileSets,
238
+ sql: `${opts.coreSql}\nORDER BY ${opts.orderBy}\nLIMIT ${opts.pageRows} OFFSET ${offset}`,
239
+ ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
240
+ });
241
+ out.push(...result.rows);
242
+ if (result.rows.length < opts.pageRows) break;
243
+ }
244
+ return out;
245
+ }
229
246
  async function runWindowed(opts) {
230
247
  const windows = planRollupWindows(await opts.engine.listPartitions({
231
248
  ctx: opts.ctx,
@@ -234,16 +251,30 @@ async function runWindowed(opts) {
234
251
  }));
235
252
  const rows = [];
236
253
  for (const w of windows) {
254
+ const fileSets = {
255
+ FILES: {
256
+ table: opts.table,
257
+ partitions: w.partitions
258
+ },
259
+ ...opts.extraFileSets
260
+ };
261
+ if (opts.paginate) {
262
+ rows.push(...await runPagedQuery({
263
+ engine: opts.engine,
264
+ ctx: opts.ctx,
265
+ table: opts.table,
266
+ ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {},
267
+ fileSets,
268
+ coreSql: opts.sqlFor(w),
269
+ orderBy: opts.paginate.orderBy,
270
+ pageRows: opts.paginate.pageRows
271
+ }));
272
+ continue;
273
+ }
237
274
  const result = await opts.engine.runSQL({
238
275
  ctx: opts.ctx,
239
276
  table: opts.table,
240
- fileSets: {
241
- FILES: {
242
- table: opts.table,
243
- partitions: w.partitions
244
- },
245
- ...opts.extraFileSets
246
- },
277
+ fileSets,
247
278
  sql: opts.sqlFor(w),
248
279
  ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
249
280
  });
@@ -572,15 +603,18 @@ const queryCanonicalVariantsRollup = {
572
603
  });
573
604
  if (parts.length === 0) return [];
574
605
  const partitions = parts.map((p) => p.partition);
575
- return (await engine.runSQL({
606
+ return (await runPagedQuery({
607
+ engine,
576
608
  ctx,
577
609
  table: "queries",
610
+ ...searchType !== void 0 ? { searchType } : {},
578
611
  fileSets: { FILES: {
579
612
  table: "queries",
580
613
  partitions
581
614
  } },
582
- ...searchType !== void 0 ? { searchType } : {},
583
- sql: `
615
+ orderBy: "joinKey",
616
+ pageRows: ROLLUP_PAGE_ROWS_WIDE,
617
+ coreSql: `
584
618
  WITH per_variant AS (
585
619
  SELECT
586
620
  COALESCE(NULLIF(query_canonical, ''), query) AS joinKey,
@@ -601,7 +635,7 @@ const queryCanonicalVariantsRollup = {
601
635
  FROM per_variant
602
636
  GROUP BY joinKey
603
637
  `
604
- })).rows.map((r) => ({
638
+ })).map((r) => ({
605
639
  joinKey: String(r.joinKey),
606
640
  variantCount: BigInt(r.variantCount),
607
641
  canonicalName: r.canonicalName == null ? null : String(r.canonicalName),
@@ -654,6 +688,10 @@ const queryCanonicalDailyRollup = {
654
688
  table: "queries",
655
689
  keys: [dimStore.parquetKey(ctx)]
656
690
  } } } : {},
691
+ paginate: {
692
+ orderBy: "date, query_canonical",
693
+ pageRows: ROLLUP_PAGE_ROWS
694
+ },
657
695
  sqlFor: useDim ? (w) => `
658
696
  SELECT
659
697
  ${canonExpr} AS query_canonical,
@@ -997,4 +1035,4 @@ const DEFAULT_ROLLUPS = [
997
1035
  sitemapChanges28dRollup
998
1036
  ];
999
1037
  const CANONICAL_ROLLUPS = [queryCanonicalVariantsRollup, queryCanonicalDailyRollup];
1000
- export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
1038
+ export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ROLLUP_PAGE_ROWS, ROLLUP_PAGE_ROWS_WIDE, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.31.3",
4
+ "version": "0.31.4",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
191
191
  "hyparquet": "^1.26.1",
192
192
  "hyparquet-writer": "^0.16.1",
193
193
  "proper-lockfile": "^4.1.2",
194
- "gscdump": "0.31.3",
195
- "@gscdump/contracts": "0.31.3"
194
+ "gscdump": "0.31.4",
195
+ "@gscdump/contracts": "0.31.4"
196
196
  },
197
197
  "devDependencies": {
198
198
  "@duckdb/duckdb-wasm": "^1.32.0",