@gscdump/engine 0.31.3 → 0.31.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rollups.d.mts +35 -4
- package/dist/rollups.mjs +50 -12
- package/package.json +3 -3
package/dist/rollups.d.mts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { DataSource, FileSetRef, Row as Row$1 } from "./_chunks/storage.mjs";
|
|
1
|
+
import { DataSource, FileSetRef, Row as Row$1, TableName as TableName$1 } from "./_chunks/storage.mjs";
|
|
2
2
|
import { ColumnDef as ColumnDef$1 } from "./_chunks/schema.mjs";
|
|
3
3
|
import { EngineError } from "./_chunks/errors.mjs";
|
|
4
4
|
import { SearchType } from "gscdump/query";
|
|
@@ -200,6 +200,26 @@ declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRol
|
|
|
200
200
|
* production and raise if headroom allows.
|
|
201
201
|
*/
|
|
202
202
|
declare const WINDOW_BYTE_BUDGET: number;
|
|
203
|
+
/**
|
|
204
|
+
* Per-page OUTPUT row cap for key-paginated rollups (`runWindowed({ paginate })`
|
|
205
|
+
* and `runPagedQuery`). `planRollupWindows` bounds the *input* parquet bytes a
|
|
206
|
+
* window scans, which is a fine proxy for output size on fact aggregations whose
|
|
207
|
+
* grain matches the input (one output row per input date). It is NOT a proxy for
|
|
208
|
+
* aggregations that COLLAPSE to a smaller-cardinality grain whose row count is
|
|
209
|
+
* driven by a high-cardinality GROUP key — `(query_canonical × date)` and
|
|
210
|
+
* `(query_canonical)` — where output rows scale with distinct canonicals, not
|
|
211
|
+
* input bytes. For those, each `runSQL` result (shipped as an Arrow IPC stream
|
|
212
|
+
* over the Workers service-binding RPC; 28MiB guard in `@gscdump/cloudflare`,
|
|
213
|
+
* duckdb-worker `assertResultBudget` at 24MiB / 100k rows) must be bounded by
|
|
214
|
+
* paging the OUTPUT, independent of how the input is windowed.
|
|
215
|
+
*
|
|
216
|
+
* Narrow rows — `(canonical, date, 3 metrics)` — page at 50k (≈16MiB at the
|
|
217
|
+
* worker's `cols×64` heuristic, well under both guards). WIDE rows carry a
|
|
218
|
+
* `GROUP_CONCAT` variants string (up to ~10 variants × ~60 chars) the heuristic
|
|
219
|
+
* under-counts, so they page smaller to keep the real IPC payload bounded.
|
|
220
|
+
*/
|
|
221
|
+
declare const ROLLUP_PAGE_ROWS = 50000;
|
|
222
|
+
declare const ROLLUP_PAGE_ROWS_WIDE = 20000;
|
|
203
223
|
/**
|
|
204
224
|
* UTC day-aligned [startMs, endMs] span a partition covers. Returns null for
|
|
205
225
|
* `hourly/` partitions and anything unrecognised — those are excluded from
|
|
@@ -234,11 +254,18 @@ declare function partitionsInRange(parts: Array<{
|
|
|
234
254
|
* Run a full-history aggregation in byte-bounded windows and concat the rows.
|
|
235
255
|
* Each window's SQL MUST date-filter to `[w.start, w.end]` (see `sqlFor`) so a
|
|
236
256
|
* tier file spanning a window boundary doesn't double-count calendar dates.
|
|
257
|
+
*
|
|
258
|
+
* `paginate` additionally pages each window's OUTPUT (see `runPagedQuery`) so a
|
|
259
|
+
* window whose GROUP cardinality is high — `(query_canonical × date)` on a large
|
|
260
|
+
* site — can't ship an oversized result even though its input bytes fit a window.
|
|
261
|
+
* Date-windowing bounds the per-query scan; output paging bounds the IPC payload.
|
|
262
|
+
* The two are orthogonal and compose. When `paginate` is set, `sqlFor` MUST emit
|
|
263
|
+
* no trailing `ORDER BY`/`LIMIT` and `paginate.orderBy` MUST be a total order.
|
|
237
264
|
*/
|
|
238
265
|
declare function runWindowed(opts: {
|
|
239
266
|
engine: RollupEngine;
|
|
240
267
|
ctx: TenantCtx;
|
|
241
|
-
table:
|
|
268
|
+
table: TableName$1;
|
|
242
269
|
searchType?: SearchType;
|
|
243
270
|
sqlFor: (w: {
|
|
244
271
|
start: string;
|
|
@@ -249,7 +276,11 @@ declare function runWindowed(opts: {
|
|
|
249
276
|
* windowed `FILES`). Use to JOIN a non-windowed sidecar (e.g. the query
|
|
250
277
|
* dimension parquet via `{ QUERY_DIM: { keys: [...] } }`) inside `sqlFor`.
|
|
251
278
|
*/
|
|
252
|
-
extraFileSets?: Record<string, FileSetRef>;
|
|
279
|
+
extraFileSets?: Record<string, FileSetRef>; /** Page each window's output by a total-order key. See `runPagedQuery`. */
|
|
280
|
+
paginate?: {
|
|
281
|
+
orderBy: string;
|
|
282
|
+
pageRows: number;
|
|
283
|
+
};
|
|
253
284
|
}): Promise<Row$1[]>;
|
|
254
285
|
/**
|
|
255
286
|
* Daily totals across the full history. One row per (date, table) with
|
|
@@ -415,4 +446,4 @@ declare const DEFAULT_ROLLUPS: readonly RollupDef[];
|
|
|
415
446
|
* (CLI: `gscdump rollups --with-canonical`).
|
|
416
447
|
*/
|
|
417
448
|
declare const CANONICAL_ROLLUPS: readonly RollupDef[];
|
|
418
|
-
export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
449
|
+
export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, ROLLUP_PAGE_ROWS, ROLLUP_PAGE_ROWS_WIDE, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
package/dist/rollups.mjs
CHANGED
|
@@ -127,6 +127,8 @@ function utcDateMinusDays(at, days) {
|
|
|
127
127
|
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
128
128
|
}
|
|
129
129
|
const WINDOW_BYTE_BUDGET = 10 * 1024 * 1024;
|
|
130
|
+
const ROLLUP_PAGE_ROWS = 5e4;
|
|
131
|
+
const ROLLUP_PAGE_ROWS_WIDE = 2e4;
|
|
130
132
|
const DAY_RE = /^daily\/(\d{4})-(\d{2})-(\d{2})$/;
|
|
131
133
|
const WEEK_RE = /^weekly\/(\d{4})-(\d{2})-(\d{2})$/;
|
|
132
134
|
const MONTH_RE = /^monthly\/(\d{4})-(\d{2})$/;
|
|
@@ -226,6 +228,21 @@ function partitionsInRange(parts, start, end) {
|
|
|
226
228
|
}
|
|
227
229
|
return out;
|
|
228
230
|
}
|
|
231
|
+
async function runPagedQuery(opts) {
|
|
232
|
+
const out = [];
|
|
233
|
+
for (let offset = 0;; offset += opts.pageRows) {
|
|
234
|
+
const result = await opts.engine.runSQL({
|
|
235
|
+
ctx: opts.ctx,
|
|
236
|
+
table: opts.table,
|
|
237
|
+
fileSets: opts.fileSets,
|
|
238
|
+
sql: `${opts.coreSql}\nORDER BY ${opts.orderBy}\nLIMIT ${opts.pageRows} OFFSET ${offset}`,
|
|
239
|
+
...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
|
|
240
|
+
});
|
|
241
|
+
out.push(...result.rows);
|
|
242
|
+
if (result.rows.length < opts.pageRows) break;
|
|
243
|
+
}
|
|
244
|
+
return out;
|
|
245
|
+
}
|
|
229
246
|
async function runWindowed(opts) {
|
|
230
247
|
const windows = planRollupWindows(await opts.engine.listPartitions({
|
|
231
248
|
ctx: opts.ctx,
|
|
@@ -234,16 +251,30 @@ async function runWindowed(opts) {
|
|
|
234
251
|
}));
|
|
235
252
|
const rows = [];
|
|
236
253
|
for (const w of windows) {
|
|
254
|
+
const fileSets = {
|
|
255
|
+
FILES: {
|
|
256
|
+
table: opts.table,
|
|
257
|
+
partitions: w.partitions
|
|
258
|
+
},
|
|
259
|
+
...opts.extraFileSets
|
|
260
|
+
};
|
|
261
|
+
if (opts.paginate) {
|
|
262
|
+
rows.push(...await runPagedQuery({
|
|
263
|
+
engine: opts.engine,
|
|
264
|
+
ctx: opts.ctx,
|
|
265
|
+
table: opts.table,
|
|
266
|
+
...opts.searchType !== void 0 ? { searchType: opts.searchType } : {},
|
|
267
|
+
fileSets,
|
|
268
|
+
coreSql: opts.sqlFor(w),
|
|
269
|
+
orderBy: opts.paginate.orderBy,
|
|
270
|
+
pageRows: opts.paginate.pageRows
|
|
271
|
+
}));
|
|
272
|
+
continue;
|
|
273
|
+
}
|
|
237
274
|
const result = await opts.engine.runSQL({
|
|
238
275
|
ctx: opts.ctx,
|
|
239
276
|
table: opts.table,
|
|
240
|
-
fileSets: {
|
|
241
|
-
FILES: {
|
|
242
|
-
table: opts.table,
|
|
243
|
-
partitions: w.partitions
|
|
244
|
-
},
|
|
245
|
-
...opts.extraFileSets
|
|
246
|
-
},
|
|
277
|
+
fileSets,
|
|
247
278
|
sql: opts.sqlFor(w),
|
|
248
279
|
...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
|
|
249
280
|
});
|
|
@@ -572,15 +603,18 @@ const queryCanonicalVariantsRollup = {
|
|
|
572
603
|
});
|
|
573
604
|
if (parts.length === 0) return [];
|
|
574
605
|
const partitions = parts.map((p) => p.partition);
|
|
575
|
-
return (await engine.runSQL({
|
|
606
|
+
return (await runPagedQuery({
|
|
607
|
+
engine,
|
|
576
608
|
ctx,
|
|
577
609
|
table: "queries",
|
|
610
|
+
...searchType !== void 0 ? { searchType } : {},
|
|
578
611
|
fileSets: { FILES: {
|
|
579
612
|
table: "queries",
|
|
580
613
|
partitions
|
|
581
614
|
} },
|
|
582
|
-
|
|
583
|
-
sql: `
|
|
615
|
+
orderBy: "joinKey",
|
|
616
|
+
pageRows: ROLLUP_PAGE_ROWS_WIDE,
|
|
617
|
+
coreSql: `
|
|
584
618
|
WITH per_variant AS (
|
|
585
619
|
SELECT
|
|
586
620
|
COALESCE(NULLIF(query_canonical, ''), query) AS joinKey,
|
|
@@ -601,7 +635,7 @@ const queryCanonicalVariantsRollup = {
|
|
|
601
635
|
FROM per_variant
|
|
602
636
|
GROUP BY joinKey
|
|
603
637
|
`
|
|
604
|
-
})).rows.map((r) => ({
|
|
638
|
+
})).map((r) => ({
|
|
605
639
|
joinKey: String(r.joinKey),
|
|
606
640
|
variantCount: BigInt(r.variantCount),
|
|
607
641
|
canonicalName: r.canonicalName == null ? null : String(r.canonicalName),
|
|
@@ -654,6 +688,10 @@ const queryCanonicalDailyRollup = {
|
|
|
654
688
|
table: "queries",
|
|
655
689
|
keys: [dimStore.parquetKey(ctx)]
|
|
656
690
|
} } } : {},
|
|
691
|
+
paginate: {
|
|
692
|
+
orderBy: "date, query_canonical",
|
|
693
|
+
pageRows: ROLLUP_PAGE_ROWS
|
|
694
|
+
},
|
|
657
695
|
sqlFor: useDim ? (w) => `
|
|
658
696
|
SELECT
|
|
659
697
|
${canonExpr} AS query_canonical,
|
|
@@ -997,4 +1035,4 @@ const DEFAULT_ROLLUPS = [
|
|
|
997
1035
|
sitemapChanges28dRollup
|
|
998
1036
|
];
|
|
999
1037
|
const CANONICAL_ROLLUPS = [queryCanonicalVariantsRollup, queryCanonicalDailyRollup];
|
|
1000
|
-
export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
1038
|
+
export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ROLLUP_PAGE_ROWS, ROLLUP_PAGE_ROWS_WIDE, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.31.3",
|
|
4
|
+
"version": "0.31.4",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -191,8 +191,8 @@
|
|
|
191
191
|
"hyparquet": "^1.26.1",
|
|
192
192
|
"hyparquet-writer": "^0.16.1",
|
|
193
193
|
"proper-lockfile": "^4.1.2",
|
|
194
|
-
"gscdump": "0.31.3",
|
|
195
|
-
"@gscdump/contracts": "0.31.3"
|
|
194
|
+
"gscdump": "0.31.4",
|
|
195
|
+
"@gscdump/contracts": "0.31.4"
|
|
196
196
|
},
|
|
197
197
|
"devDependencies": {
|
|
198
198
|
"@duckdb/duckdb-wasm": "^1.32.0",
|