@gscdump/engine 0.19.3 → 0.19.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/dispatch.mjs +13 -3
- package/dist/_chunks/schema.mjs +1 -1
- package/dist/adapters/hyparquet.d.mts +14 -2
- package/dist/adapters/hyparquet.mjs +5 -2
- package/dist/entities.mjs +1 -1
- package/dist/rollups.d.mts +72 -1
- package/dist/rollups.mjs +278 -75
- package/package.json +3 -3
package/dist/_chunks/dispatch.mjs
CHANGED
|
@@ -17,13 +17,23 @@ function assertSatisfies(analyzer, source) {
|
|
|
17
17
|
if (missing.length > 0) throw new AnalyzerCapabilityError(analyzer.id, missing);
|
|
18
18
|
}
|
|
19
19
|
async function runAnalyzerFromSource(source, params, registry) {
|
|
20
|
-
|
|
20
|
+
let analyzer = registry.resolveAnalyzer(params.type, sourceHas(source, "executeSql"));
|
|
21
21
|
if (!analyzer) throw new AnalyzerCapabilityError(params.type, ["executeSql"]);
|
|
22
22
|
assertSatisfies(analyzer, source);
|
|
23
|
-
const
|
|
23
|
+
const buildCtx = {
|
|
24
24
|
adapter: source.adapter,
|
|
25
25
|
siteId: source.siteId
|
|
26
|
-
}
|
|
26
|
+
};
|
|
27
|
+
let plan;
|
|
28
|
+
try {
|
|
29
|
+
plan = analyzer.build(params, buildCtx);
|
|
30
|
+
} catch (err) {
|
|
31
|
+
const rowsVariant = err?.name === "UnresolvableDatasetError" ? registry.getAnalyzerVariants(params.type)?.rows : void 0;
|
|
32
|
+
if (!rowsVariant) throw err;
|
|
33
|
+
assertSatisfies(rowsVariant, source);
|
|
34
|
+
analyzer = rowsVariant;
|
|
35
|
+
plan = rowsVariant.build(params, buildCtx);
|
|
36
|
+
}
|
|
27
37
|
if (plan.kind === "rows") return runRowsPlanAgainstSource(source, analyzer, plan, params);
|
|
28
38
|
return runSqlPlanAgainstSource(source, analyzer, plan, params);
|
|
29
39
|
}
|
package/dist/_chunks/schema.mjs
CHANGED
|
@@ -145,7 +145,7 @@ function inferTable(dimensions) {
|
|
|
145
145
|
if (dims.has("country")) return "countries";
|
|
146
146
|
if (dims.has("device")) return "devices";
|
|
147
147
|
if (dims.has("searchAppearance")) return "search_appearance";
|
|
148
|
-
return "
|
|
148
|
+
return "pages";
|
|
149
149
|
}
|
|
150
150
|
function naturalKeyColumns(table) {
|
|
151
151
|
return TABLE_METADATA[table].sortKey;
|
|
package/dist/adapters/hyparquet.d.mts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { C as Row, M as TableName, i as DataSource, m as ParquetCodec, t as CodecCtx } from "../_chunks/storage.mjs";
|
|
2
2
|
import { t as ColumnDef } from "../_chunks/schema.mjs";
|
|
3
|
+
import { ParquetQueryFilter } from "hyparquet";
|
|
3
4
|
declare function encodeRowsToParquet(table: TableName, rows: readonly Row[]): Uint8Array;
|
|
4
5
|
interface EncodeFlexOptions {
|
|
5
6
|
/** Columns defining the output schema + order. */
|
|
@@ -17,7 +18,18 @@ interface EncodeFlexOptions {
|
|
|
17
18
|
* merges cleanly with fact-table reads.
|
|
18
19
|
*/
|
|
19
20
|
declare function encodeRowsToParquetFlex(rows: readonly Row[], opts: EncodeFlexOptions): Uint8Array;
|
|
20
|
-
|
|
21
|
+
interface DecodeParquetOptions {
|
|
22
|
+
/**
|
|
23
|
+
* Row filter pushed down into the parquet reader. hyparquet evaluates this
|
|
24
|
+
* per row group — pruning groups whose column statistics can't match and
|
|
25
|
+
* materialising only matching rows — so a filtered decode of a large file
|
|
26
|
+
* holds at most one row group plus the matches in memory, never the whole
|
|
27
|
+
* file. Use this whenever the caller needs a sub-slice of a big parquet
|
|
28
|
+
* (e.g. one feedpath out of a site-wide sitemap-urls index).
|
|
29
|
+
*/
|
|
30
|
+
filter?: ParquetQueryFilter;
|
|
31
|
+
}
|
|
32
|
+
declare function decodeParquetToRows(bytes: Uint8Array, opts?: DecodeParquetOptions): Promise<Row[]>;
|
|
21
33
|
interface HyparquetCodecOptions {
|
|
22
34
|
/**
|
|
23
35
|
* Override `readRows`. Useful when reads should be delegated to a faster
|
|
@@ -27,4 +39,4 @@ interface HyparquetCodecOptions {
|
|
|
27
39
|
readRows?: (ctx: CodecCtx, key: string, dataSource: DataSource) => Promise<Row[]>;
|
|
28
40
|
}
|
|
29
41
|
declare function createHyparquetCodec(options?: HyparquetCodecOptions): ParquetCodec;
|
|
30
|
-
export { EncodeFlexOptions, HyparquetCodecOptions, createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };
|
|
42
|
+
export { DecodeParquetOptions, EncodeFlexOptions, HyparquetCodecOptions, createHyparquetCodec, decodeParquetToRows, encodeRowsToParquet, encodeRowsToParquetFlex };
|
|
package/dist/adapters/hyparquet.mjs
CHANGED
|
@@ -103,9 +103,12 @@ function asyncBufferFromBytes(bytes) {
|
|
|
103
103
|
}
|
|
104
104
|
};
|
|
105
105
|
}
|
|
106
|
-
async function decodeParquetToRows(bytes) {
|
|
106
|
+
async function decodeParquetToRows(bytes, opts = {}) {
|
|
107
107
|
if (bytes.byteLength === 0) return [];
|
|
108
|
-
return await parquetReadObjects({
|
|
108
|
+
return await parquetReadObjects({
|
|
109
|
+
file: asyncBufferFromBytes(bytes),
|
|
110
|
+
...opts.filter ? { filter: opts.filter } : {}
|
|
111
|
+
});
|
|
109
112
|
}
|
|
110
113
|
function createHyparquetCodec(options = {}) {
|
|
111
114
|
return {
|
package/dist/entities.mjs
CHANGED
|
@@ -418,7 +418,7 @@ function createSitemapStore(opts) {
|
|
|
418
418
|
const fpHash = hash(feedpath);
|
|
419
419
|
const includeRemoved = opts?.includeRemoved ?? false;
|
|
420
420
|
const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
|
|
421
|
-
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
421
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes, { filter: { feedpath_hash: { $eq: fpHash } } }) : [];
|
|
422
422
|
const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
|
|
423
423
|
const live = /* @__PURE__ */ new Map();
|
|
424
424
|
const removedMap = /* @__PURE__ */ new Map();
|
package/dist/rollups.d.mts
CHANGED
|
@@ -27,6 +27,21 @@ interface RollupEngine {
|
|
|
27
27
|
}) => Promise<{
|
|
28
28
|
rows: import('@gscdump/engine/contracts').Row[];
|
|
29
29
|
}>;
|
|
30
|
+
/**
|
|
31
|
+
* Read the live manifest for a (tenant, table[, searchType]) cohort —
|
|
32
|
+
* cheap, no parquet decode. Builders use this to chunk a full-history scan
|
|
33
|
+
* into byte-bounded windows so a single `runSQL` call never has to ship
|
|
34
|
+
* more than ~14MB of decoded rows across the Workers service-binding RPC
|
|
35
|
+
* (32MiB hard cap).
|
|
36
|
+
*/
|
|
37
|
+
listPartitions: (opts: {
|
|
38
|
+
ctx: TenantCtx;
|
|
39
|
+
table: import('@gscdump/engine/contracts').TableName;
|
|
40
|
+
searchType?: SearchType;
|
|
41
|
+
}) => Promise<Array<{
|
|
42
|
+
partition: string;
|
|
43
|
+
bytes: number;
|
|
44
|
+
}>>;
|
|
30
45
|
}
|
|
31
46
|
/**
|
|
32
47
|
* One rollup definition. Build runs SQL over the tenant's facts and/or reads
|
|
@@ -150,8 +165,64 @@ interface RebuildRollupResult {
|
|
|
150
165
|
/** Parquet payload byte size when `format === 'parquet'`. */
|
|
151
166
|
parquetBytes?: number;
|
|
152
167
|
builtAt: number;
|
|
168
|
+
/**
|
|
169
|
+
* Set when this def's build/encode/write failed. The runner records the
|
|
170
|
+
* failure and continues with the remaining defs so one bad rollup never
|
|
171
|
+
* aborts the rest. Successful defs have no `error`.
|
|
172
|
+
*/
|
|
173
|
+
error?: string;
|
|
153
174
|
}
|
|
154
175
|
declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRollupResult[]>;
|
|
176
|
+
/**
|
|
177
|
+
* Target decoded-bytes budget per window. Sits well under the 28MiB executor
|
|
178
|
+
* guard so headroom remains for SQL + result rows.
|
|
179
|
+
*/
|
|
180
|
+
declare const WINDOW_BYTE_BUDGET: number;
|
|
181
|
+
/**
|
|
182
|
+
* UTC day-aligned [startMs, endMs] span a partition covers. Returns null for
|
|
183
|
+
* `hourly/` partitions and anything unrecognised — those are excluded from
|
|
184
|
+
* windowed planning.
|
|
185
|
+
*/
|
|
186
|
+
declare function partitionDaySpan(partition: string): {
|
|
187
|
+
startMs: number;
|
|
188
|
+
endMs: number;
|
|
189
|
+
} | null;
|
|
190
|
+
/**
|
|
191
|
+
* Plan byte-bounded windows over a partition set. Each window names the
|
|
192
|
+
* partitions whose span intersects it; a coarse tier file can land in two
|
|
193
|
+
* windows, so every windowed SQL MUST also date-filter to the window bounds.
|
|
194
|
+
*/
|
|
195
|
+
declare function planRollupWindows(parts: Array<{
|
|
196
|
+
partition: string;
|
|
197
|
+
bytes: number;
|
|
198
|
+
}>, clampRange?: {
|
|
199
|
+
start: string;
|
|
200
|
+
end: string;
|
|
201
|
+
}): Array<{
|
|
202
|
+
start: string;
|
|
203
|
+
end: string;
|
|
204
|
+
partitions: string[];
|
|
205
|
+
}>;
|
|
206
|
+
/** Partition strings whose span intersects the inclusive [start, end] date range. */
|
|
207
|
+
declare function partitionsInRange(parts: Array<{
|
|
208
|
+
partition: string;
|
|
209
|
+
bytes: number;
|
|
210
|
+
}>, start: string, end: string): string[];
|
|
211
|
+
/**
|
|
212
|
+
* Run a full-history aggregation in byte-bounded windows and concat the rows.
|
|
213
|
+
* Each window's SQL MUST date-filter to `[w.start, w.end]` (see `sqlFor`) so a
|
|
214
|
+
* tier file spanning a window boundary doesn't double-count calendar dates.
|
|
215
|
+
*/
|
|
216
|
+
declare function runWindowed(opts: {
|
|
217
|
+
engine: RollupEngine;
|
|
218
|
+
ctx: TenantCtx;
|
|
219
|
+
table: import('@gscdump/engine/contracts').TableName;
|
|
220
|
+
searchType?: SearchType;
|
|
221
|
+
sqlFor: (w: {
|
|
222
|
+
start: string;
|
|
223
|
+
end: string;
|
|
224
|
+
}) => string;
|
|
225
|
+
}): Promise<Row$1[]>;
|
|
155
226
|
/**
|
|
156
227
|
* Daily totals across the full history. One row per (date, table) with
|
|
157
228
|
* clicks + impressions + position. Powers sparklines and headline totals.
|
|
@@ -267,4 +338,4 @@ declare function rebuildDailyFromHourly(opts: RebuildDailyFromHourlyOptions): Pr
|
|
|
267
338
|
rowsWritten: number;
|
|
268
339
|
}>;
|
|
269
340
|
declare const DEFAULT_ROLLUPS: readonly RollupDef[];
|
|
270
|
-
export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
341
|
+
export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
package/dist/rollups.mjs
CHANGED
|
@@ -34,69 +34,76 @@ async function readLatestRollup(bucket, ctx, id, searchType) {
|
|
|
34
34
|
async function rebuildRollups(opts) {
|
|
35
35
|
const now = opts.now ?? (() => Date.now());
|
|
36
36
|
const results = [];
|
|
37
|
-
const searchType = opts.searchType;
|
|
38
|
-
if (searchType !== void 0) {
|
|
39
|
-
for (const def of opts.defs) if (def.sliceOrthogonal === true) throw new Error(`rollup def '${def.id}' is slice-orthogonal; do not pass searchType`);
|
|
40
|
-
}
|
|
41
37
|
for (const def of opts.defs) {
|
|
42
38
|
const builtAt = now();
|
|
43
|
-
const
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
if (!def.parquetColumns || def.parquetColumns.length === 0) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
|
|
52
|
-
const rows = payload ?? [];
|
|
53
|
-
const parquetBytes = encodeRowsToParquetFlex(rows, {
|
|
54
|
-
columns: def.parquetColumns,
|
|
55
|
-
sortKey: def.parquetSortKey
|
|
39
|
+
const defSearchType = def.sliceOrthogonal === true ? void 0 : opts.searchType;
|
|
40
|
+
try {
|
|
41
|
+
const payload = await def.build({
|
|
42
|
+
engine: opts.engine,
|
|
43
|
+
ctx: opts.ctx,
|
|
44
|
+
dataSource: opts.dataSource,
|
|
45
|
+
builtAt,
|
|
46
|
+
...defSearchType !== void 0 ? { searchType: defSearchType } : {}
|
|
56
47
|
});
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
48
|
+
if (def.format === "parquet") {
|
|
49
|
+
if (!def.parquetColumns || def.parquetColumns.length === 0) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
|
|
50
|
+
const rows = payload ?? [];
|
|
51
|
+
const parquetBytes = encodeRowsToParquetFlex(rows, {
|
|
52
|
+
columns: def.parquetColumns,
|
|
53
|
+
sortKey: def.parquetSortKey
|
|
54
|
+
});
|
|
55
|
+
const parquetKey = rollupParquetKey(opts.ctx, def.id, builtAt, defSearchType);
|
|
56
|
+
await opts.dataSource.write(parquetKey, parquetBytes);
|
|
57
|
+
const pointer = {
|
|
58
|
+
parquetKey,
|
|
59
|
+
rowCount: rows.length
|
|
60
|
+
};
|
|
61
|
+
const envelope = {
|
|
62
|
+
version: 1,
|
|
63
|
+
id: def.id,
|
|
64
|
+
builtAt,
|
|
65
|
+
windowDays: def.windowDays,
|
|
66
|
+
payload: pointer
|
|
67
|
+
};
|
|
68
|
+
const envelopeBytes = new TextEncoder().encode(JSON.stringify(envelope));
|
|
69
|
+
const key = rollupKey(opts.ctx, def.id, builtAt, defSearchType);
|
|
70
|
+
await opts.dataSource.write(key, envelopeBytes);
|
|
71
|
+
results.push({
|
|
72
|
+
id: def.id,
|
|
73
|
+
objectKey: key,
|
|
74
|
+
parquetKey,
|
|
75
|
+
bytes: envelopeBytes.byteLength,
|
|
76
|
+
parquetBytes: parquetBytes.byteLength,
|
|
77
|
+
builtAt
|
|
78
|
+
});
|
|
79
|
+
continue;
|
|
80
|
+
}
|
|
63
81
|
const envelope = {
|
|
64
82
|
version: 1,
|
|
65
83
|
id: def.id,
|
|
66
84
|
builtAt,
|
|
67
85
|
windowDays: def.windowDays,
|
|
68
|
-
payload
|
|
86
|
+
payload
|
|
69
87
|
};
|
|
70
|
-
const
|
|
71
|
-
const
|
|
72
|
-
|
|
88
|
+
const json = JSON.stringify(envelope);
|
|
89
|
+
const bytes = new TextEncoder().encode(json);
|
|
90
|
+
const key = rollupKey(opts.ctx, def.id, builtAt, defSearchType);
|
|
91
|
+
await opts.dataSource.write(key, bytes);
|
|
73
92
|
results.push({
|
|
74
93
|
id: def.id,
|
|
75
94
|
objectKey: key,
|
|
76
|
-
|
|
77
|
-
bytes: envelopeBytes.byteLength,
|
|
78
|
-
parquetBytes: parquetBytes.byteLength,
|
|
95
|
+
bytes: bytes.byteLength,
|
|
79
96
|
builtAt
|
|
80
97
|
});
|
|
81
|
-
|
|
98
|
+
} catch (err) {
|
|
99
|
+
results.push({
|
|
100
|
+
id: def.id,
|
|
101
|
+
objectKey: "",
|
|
102
|
+
bytes: 0,
|
|
103
|
+
builtAt,
|
|
104
|
+
error: err instanceof Error ? err.stack || err.message : String(err)
|
|
105
|
+
});
|
|
82
106
|
}
|
|
83
|
-
const envelope = {
|
|
84
|
-
version: 1,
|
|
85
|
-
id: def.id,
|
|
86
|
-
builtAt,
|
|
87
|
-
windowDays: def.windowDays,
|
|
88
|
-
payload
|
|
89
|
-
};
|
|
90
|
-
const json = JSON.stringify(envelope);
|
|
91
|
-
const bytes = new TextEncoder().encode(json);
|
|
92
|
-
const key = rollupKey(opts.ctx, def.id, builtAt, searchType);
|
|
93
|
-
await opts.dataSource.write(key, bytes);
|
|
94
|
-
results.push({
|
|
95
|
-
id: def.id,
|
|
96
|
-
objectKey: key,
|
|
97
|
-
bytes: bytes.byteLength,
|
|
98
|
-
builtAt
|
|
99
|
-
});
|
|
100
107
|
}
|
|
101
108
|
return results;
|
|
102
109
|
}
|
|
@@ -104,42 +111,183 @@ function utcDateMinusDays(at, days) {
|
|
|
104
111
|
const d = new Date(at - days * MS_PER_DAY);
|
|
105
112
|
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
106
113
|
}
|
|
114
|
+
const WINDOW_BYTE_BUDGET = 14 * 1024 * 1024;
|
|
115
|
+
const DAY_RE = /^daily\/(\d{4})-(\d{2})-(\d{2})$/;
|
|
116
|
+
const WEEK_RE = /^weekly\/(\d{4})-(\d{2})-(\d{2})$/;
|
|
117
|
+
const MONTH_RE = /^monthly\/(\d{4})-(\d{2})$/;
|
|
118
|
+
const QUARTER_RE = /^quarterly\/(\d{4})-Q([1-4])$/;
|
|
119
|
+
function isoDate(ms) {
|
|
120
|
+
const d = new Date(ms);
|
|
121
|
+
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
122
|
+
}
|
|
123
|
+
function partitionDaySpan(partition) {
|
|
124
|
+
const day = DAY_RE.exec(partition);
|
|
125
|
+
if (day) {
|
|
126
|
+
const ms = Date.UTC(Number(day[1]), Number(day[2]) - 1, Number(day[3]));
|
|
127
|
+
return {
|
|
128
|
+
startMs: ms,
|
|
129
|
+
endMs: ms
|
|
130
|
+
};
|
|
131
|
+
}
|
|
132
|
+
const week = WEEK_RE.exec(partition);
|
|
133
|
+
if (week) {
|
|
134
|
+
const ms = Date.UTC(Number(week[1]), Number(week[2]) - 1, Number(week[3]));
|
|
135
|
+
return {
|
|
136
|
+
startMs: ms,
|
|
137
|
+
endMs: ms + 6 * MS_PER_DAY
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
const month = MONTH_RE.exec(partition);
|
|
141
|
+
if (month) {
|
|
142
|
+
const y = Number(month[1]);
|
|
143
|
+
const m = Number(month[2]) - 1;
|
|
144
|
+
return {
|
|
145
|
+
startMs: Date.UTC(y, m, 1),
|
|
146
|
+
endMs: Date.UTC(y, m + 1, 1) - MS_PER_DAY
|
|
147
|
+
};
|
|
148
|
+
}
|
|
149
|
+
const quarter = QUARTER_RE.exec(partition);
|
|
150
|
+
if (quarter) {
|
|
151
|
+
const y = Number(quarter[1]);
|
|
152
|
+
const startMonth = (Number(quarter[2]) - 1) * 3;
|
|
153
|
+
return {
|
|
154
|
+
startMs: Date.UTC(y, startMonth, 1),
|
|
155
|
+
endMs: Date.UTC(y, startMonth + 3, 1) - MS_PER_DAY
|
|
156
|
+
};
|
|
157
|
+
}
|
|
158
|
+
return null;
|
|
159
|
+
}
|
|
160
|
+
function clamp(n, lo, hi) {
|
|
161
|
+
return Math.max(lo, Math.min(hi, n));
|
|
162
|
+
}
|
|
163
|
+
function planRollupWindows(parts, clampRange) {
|
|
164
|
+
const clampStartMs = clampRange ? Date.parse(`${clampRange.start}T00:00:00Z`) : void 0;
|
|
165
|
+
const clampEndMs = clampRange ? Date.parse(`${clampRange.end}T00:00:00Z`) : void 0;
|
|
166
|
+
const spans = [];
|
|
167
|
+
for (const p of parts) {
|
|
168
|
+
const span = partitionDaySpan(p.partition);
|
|
169
|
+
if (!span) continue;
|
|
170
|
+
if (clampStartMs !== void 0 && clampEndMs !== void 0) {
|
|
171
|
+
if (span.endMs < clampStartMs || span.startMs > clampEndMs) continue;
|
|
172
|
+
}
|
|
173
|
+
spans.push({
|
|
174
|
+
partition: p.partition,
|
|
175
|
+
bytes: p.bytes,
|
|
176
|
+
startMs: span.startMs,
|
|
177
|
+
endMs: span.endMs
|
|
178
|
+
});
|
|
179
|
+
}
|
|
180
|
+
if (spans.length === 0) return [];
|
|
181
|
+
let rangeStartMs = Math.min(...spans.map((s) => s.startMs));
|
|
182
|
+
let rangeEndMs = Math.max(...spans.map((s) => s.endMs));
|
|
183
|
+
if (clampStartMs !== void 0) rangeStartMs = Math.max(rangeStartMs, clampStartMs);
|
|
184
|
+
if (clampEndMs !== void 0) rangeEndMs = Math.min(rangeEndMs, clampEndMs);
|
|
185
|
+
const totalBytes = spans.reduce((a, s) => a + s.bytes, 0);
|
|
186
|
+
const spanDays = Math.floor((rangeEndMs - rangeStartMs) / MS_PER_DAY) + 1;
|
|
187
|
+
const bytesPerDay = Math.max(1, totalBytes / spanDays);
|
|
188
|
+
const windowDays = clamp(Math.floor(WINDOW_BYTE_BUDGET / bytesPerDay), 7, 400);
|
|
189
|
+
const windows = [];
|
|
190
|
+
let cursorMs = rangeStartMs;
|
|
191
|
+
while (cursorMs <= rangeEndMs) {
|
|
192
|
+
const windowEndMs = Math.min(cursorMs + (windowDays - 1) * MS_PER_DAY, rangeEndMs);
|
|
193
|
+
const partitions = spans.filter((s) => s.endMs >= cursorMs && s.startMs <= windowEndMs).map((s) => s.partition);
|
|
194
|
+
if (partitions.length > 0) windows.push({
|
|
195
|
+
start: isoDate(cursorMs),
|
|
196
|
+
end: isoDate(windowEndMs),
|
|
197
|
+
partitions
|
|
198
|
+
});
|
|
199
|
+
cursorMs = windowEndMs + MS_PER_DAY;
|
|
200
|
+
}
|
|
201
|
+
return windows;
|
|
202
|
+
}
|
|
203
|
+
function partitionsInRange(parts, start, end) {
|
|
204
|
+
const startMs = Date.parse(`${start}T00:00:00Z`);
|
|
205
|
+
const endMs = Date.parse(`${end}T00:00:00Z`);
|
|
206
|
+
const out = [];
|
|
207
|
+
for (const p of parts) {
|
|
208
|
+
const span = partitionDaySpan(p.partition);
|
|
209
|
+
if (!span) continue;
|
|
210
|
+
if (span.endMs >= startMs && span.startMs <= endMs) out.push(p.partition);
|
|
211
|
+
}
|
|
212
|
+
return out;
|
|
213
|
+
}
|
|
214
|
+
async function runWindowed(opts) {
|
|
215
|
+
const windows = planRollupWindows(await opts.engine.listPartitions({
|
|
216
|
+
ctx: opts.ctx,
|
|
217
|
+
table: opts.table,
|
|
218
|
+
...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
|
|
219
|
+
}));
|
|
220
|
+
const rows = [];
|
|
221
|
+
for (const w of windows) {
|
|
222
|
+
const result = await opts.engine.runSQL({
|
|
223
|
+
ctx: opts.ctx,
|
|
224
|
+
table: opts.table,
|
|
225
|
+
fileSets: { FILES: {
|
|
226
|
+
table: opts.table,
|
|
227
|
+
partitions: w.partitions
|
|
228
|
+
} },
|
|
229
|
+
sql: opts.sqlFor(w),
|
|
230
|
+
...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
|
|
231
|
+
});
|
|
232
|
+
rows.push(...result.rows);
|
|
233
|
+
}
|
|
234
|
+
return rows;
|
|
235
|
+
}
|
|
107
236
|
const dailyTotalsRollup = {
|
|
108
237
|
id: "daily_totals",
|
|
109
238
|
windowDays: null,
|
|
110
239
|
async build({ engine, ctx, searchType }) {
|
|
111
|
-
const
|
|
240
|
+
const pageRows = await runWindowed({
|
|
241
|
+
engine,
|
|
112
242
|
ctx,
|
|
113
243
|
table: "pages",
|
|
114
|
-
|
|
115
|
-
|
|
244
|
+
...searchType !== void 0 ? { searchType } : {},
|
|
245
|
+
sqlFor: (w) => `
|
|
116
246
|
SELECT
|
|
117
247
|
date,
|
|
118
248
|
SUM(clicks)::BIGINT AS clicks,
|
|
119
249
|
SUM(impressions)::BIGINT AS impressions,
|
|
120
250
|
SUM(sum_position)::DOUBLE AS sum_position
|
|
121
251
|
FROM read_parquet({{FILES}}, union_by_name = true)
|
|
252
|
+
WHERE date >= '${w.start}' AND date <= '${w.end}'
|
|
122
253
|
GROUP BY date
|
|
123
254
|
ORDER BY date
|
|
124
|
-
|
|
125
|
-
...searchType !== void 0 ? { searchType } : {}
|
|
255
|
+
`
|
|
126
256
|
});
|
|
127
|
-
const
|
|
257
|
+
const keywordRows = await runWindowed({
|
|
258
|
+
engine,
|
|
128
259
|
ctx,
|
|
129
260
|
table: "keywords",
|
|
130
|
-
|
|
131
|
-
|
|
261
|
+
...searchType !== void 0 ? { searchType } : {},
|
|
262
|
+
sqlFor: (w) => `
|
|
132
263
|
SELECT
|
|
133
264
|
date,
|
|
134
265
|
SUM(impressions)::BIGINT AS impressions
|
|
135
266
|
FROM read_parquet({{FILES}}, union_by_name = true)
|
|
267
|
+
WHERE date >= '${w.start}' AND date <= '${w.end}'
|
|
136
268
|
GROUP BY date
|
|
137
|
-
|
|
138
|
-
...searchType !== void 0 ? { searchType } : {}
|
|
269
|
+
`
|
|
139
270
|
});
|
|
271
|
+
const pagesByDate = /* @__PURE__ */ new Map();
|
|
272
|
+
for (const r of pageRows) {
|
|
273
|
+
const date = String(r.date);
|
|
274
|
+
const cur = pagesByDate.get(date) ?? {
|
|
275
|
+
date,
|
|
276
|
+
clicks: BigInt(0),
|
|
277
|
+
impressions: BigInt(0),
|
|
278
|
+
sum_position: 0
|
|
279
|
+
};
|
|
280
|
+
cur.clicks += BigInt(r.clicks);
|
|
281
|
+
cur.impressions += BigInt(r.impressions);
|
|
282
|
+
cur.sum_position += Number(r.sum_position);
|
|
283
|
+
pagesByDate.set(date, cur);
|
|
284
|
+
}
|
|
140
285
|
const keywordImpressionsByDate = /* @__PURE__ */ new Map();
|
|
141
|
-
for (const r of
|
|
142
|
-
|
|
286
|
+
for (const r of keywordRows) {
|
|
287
|
+
const date = String(r.date);
|
|
288
|
+
keywordImpressionsByDate.set(date, (keywordImpressionsByDate.get(date) ?? BigInt(0)) + BigInt(r.impressions));
|
|
289
|
+
}
|
|
290
|
+
return Array.from(pagesByDate.values()).sort((a, b) => a.date < b.date ? -1 : 1).map((r) => {
|
|
143
291
|
const totalImpressions = BigInt(r.impressions);
|
|
144
292
|
const queryImpressions = keywordImpressionsByDate.get(String(r.date)) ?? BigInt(0);
|
|
145
293
|
const anonymized = totalImpressions === BigInt(0) ? 0 : 1 - Number(queryImpressions) / Number(totalImpressions);
|
|
@@ -157,27 +305,38 @@ const weeklyTotalsRollup = {
|
|
|
157
305
|
id: "weekly_totals",
|
|
158
306
|
windowDays: null,
|
|
159
307
|
async build({ engine, ctx, searchType }) {
|
|
160
|
-
|
|
308
|
+
const rows = await runWindowed({
|
|
309
|
+
engine,
|
|
161
310
|
ctx,
|
|
162
311
|
table: "pages",
|
|
163
|
-
fileSets: { FILES: { table: "pages" } },
|
|
164
312
|
...searchType !== void 0 ? { searchType } : {},
|
|
165
|
-
|
|
313
|
+
sqlFor: (w) => `
|
|
166
314
|
SELECT
|
|
167
315
|
strftime(date_trunc('week', date::DATE), '%Y-%m-%d') AS week,
|
|
168
316
|
SUM(clicks)::BIGINT AS clicks,
|
|
169
317
|
SUM(impressions)::BIGINT AS impressions,
|
|
170
318
|
SUM(sum_position)::DOUBLE AS sum_position
|
|
171
319
|
FROM read_parquet({{FILES}}, union_by_name = true)
|
|
320
|
+
WHERE date >= '${w.start}' AND date <= '${w.end}'
|
|
172
321
|
GROUP BY 1
|
|
173
322
|
ORDER BY 1
|
|
174
323
|
`
|
|
175
|
-
})
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
324
|
+
});
|
|
325
|
+
const byWeek = /* @__PURE__ */ new Map();
|
|
326
|
+
for (const r of rows) {
|
|
327
|
+
const week = String(r.week);
|
|
328
|
+
const cur = byWeek.get(week) ?? {
|
|
329
|
+
week,
|
|
330
|
+
clicks: 0,
|
|
331
|
+
impressions: 0,
|
|
332
|
+
sum_position: 0
|
|
333
|
+
};
|
|
334
|
+
cur.clicks += Number(r.clicks);
|
|
335
|
+
cur.impressions += Number(r.impressions);
|
|
336
|
+
cur.sum_position += Number(r.sum_position);
|
|
337
|
+
byWeek.set(week, cur);
|
|
338
|
+
}
|
|
339
|
+
return Array.from(byWeek.values()).sort((a, b) => a.week < b.week ? -1 : 1);
|
|
181
340
|
}
|
|
182
341
|
};
|
|
183
342
|
const topPages28dRollup = {
|
|
@@ -185,10 +344,19 @@ const topPages28dRollup = {
|
|
|
185
344
|
windowDays: 28,
|
|
186
345
|
async build({ engine, ctx, builtAt, searchType }) {
|
|
187
346
|
const cutoff = utcDateMinusDays(builtAt, 28);
|
|
347
|
+
const partitions = partitionsInRange(await engine.listPartitions({
|
|
348
|
+
ctx,
|
|
349
|
+
table: "pages",
|
|
350
|
+
...searchType !== void 0 ? { searchType } : {}
|
|
351
|
+
}), cutoff, utcDateMinusDays(builtAt, 0));
|
|
352
|
+
if (partitions.length === 0) return [];
|
|
188
353
|
return (await engine.runSQL({
|
|
189
354
|
ctx,
|
|
190
355
|
table: "pages",
|
|
191
|
-
fileSets: { FILES: {
|
|
356
|
+
fileSets: { FILES: {
|
|
357
|
+
table: "pages",
|
|
358
|
+
partitions
|
|
359
|
+
} },
|
|
192
360
|
...searchType !== void 0 ? { searchType } : {},
|
|
193
361
|
sql: `
|
|
194
362
|
SELECT
|
|
@@ -215,10 +383,19 @@ const topCountries28dRollup = {
|
|
|
215
383
|
windowDays: 28,
|
|
216
384
|
async build({ engine, ctx, builtAt, searchType }) {
|
|
217
385
|
const cutoff = utcDateMinusDays(builtAt, 28);
|
|
386
|
+
const partitions = partitionsInRange(await engine.listPartitions({
|
|
387
|
+
ctx,
|
|
388
|
+
table: "countries",
|
|
389
|
+
...searchType !== void 0 ? { searchType } : {}
|
|
390
|
+
}), cutoff, utcDateMinusDays(builtAt, 0));
|
|
391
|
+
if (partitions.length === 0) return [];
|
|
218
392
|
return (await engine.runSQL({
|
|
219
393
|
ctx,
|
|
220
394
|
table: "countries",
|
|
221
|
-
fileSets: { FILES: {
|
|
395
|
+
fileSets: { FILES: {
|
|
396
|
+
table: "countries",
|
|
397
|
+
partitions
|
|
398
|
+
} },
|
|
222
399
|
...searchType !== void 0 ? { searchType } : {},
|
|
223
400
|
sql: `
|
|
224
401
|
SELECT
|
|
@@ -245,10 +422,19 @@ const topKeywords28dRollup = {
|
|
|
245
422
|
windowDays: 28,
|
|
246
423
|
async build({ engine, ctx, builtAt, searchType }) {
|
|
247
424
|
const cutoff = utcDateMinusDays(builtAt, 28);
|
|
425
|
+
const partitions = partitionsInRange(await engine.listPartitions({
|
|
426
|
+
ctx,
|
|
427
|
+
table: "keywords",
|
|
428
|
+
...searchType !== void 0 ? { searchType } : {}
|
|
429
|
+
}), cutoff, utcDateMinusDays(builtAt, 0));
|
|
430
|
+
if (partitions.length === 0) return [];
|
|
248
431
|
return (await engine.runSQL({
|
|
249
432
|
ctx,
|
|
250
433
|
table: "keywords",
|
|
251
|
-
fileSets: { FILES: {
|
|
434
|
+
fileSets: { FILES: {
|
|
435
|
+
table: "keywords",
|
|
436
|
+
partitions
|
|
437
|
+
} },
|
|
252
438
|
...searchType !== void 0 ? { searchType } : {},
|
|
253
439
|
sql: `
|
|
254
440
|
SELECT
|
|
@@ -299,10 +485,19 @@ const topKeywords28dParquetRollup = {
|
|
|
299
485
|
parquetSortKey: ["clicks"],
|
|
300
486
|
async build({ engine, ctx, builtAt, searchType }) {
|
|
301
487
|
const cutoff = utcDateMinusDays(builtAt, 28);
|
|
488
|
+
const partitions = partitionsInRange(await engine.listPartitions({
|
|
489
|
+
ctx,
|
|
490
|
+
table: "keywords",
|
|
491
|
+
...searchType !== void 0 ? { searchType } : {}
|
|
492
|
+
}), cutoff, utcDateMinusDays(builtAt, 0));
|
|
493
|
+
if (partitions.length === 0) return [];
|
|
302
494
|
return (await engine.runSQL({
|
|
303
495
|
ctx,
|
|
304
496
|
table: "keywords",
|
|
305
|
-
fileSets: { FILES: {
|
|
497
|
+
fileSets: { FILES: {
|
|
498
|
+
table: "keywords",
|
|
499
|
+
partitions
|
|
500
|
+
} },
|
|
306
501
|
...searchType !== void 0 ? { searchType } : {},
|
|
307
502
|
sql: `
|
|
308
503
|
SELECT
|
|
@@ -423,11 +618,19 @@ const indexPercentRollup = {
|
|
|
423
618
|
days: []
|
|
424
619
|
};
|
|
425
620
|
const cutoff = utcDateMinusDays(builtAt, 90);
|
|
621
|
+
const pagesPartitions = partitionsInRange(await engine.listPartitions({
|
|
622
|
+
ctx,
|
|
623
|
+
table: "pages",
|
|
624
|
+
...searchType !== void 0 ? { searchType } : {}
|
|
625
|
+
}), cutoff, utcDateMinusDays(builtAt, 0));
|
|
426
626
|
const numerator = await engine.runSQL({
|
|
427
627
|
ctx,
|
|
428
628
|
table: "pages",
|
|
429
629
|
fileSets: {
|
|
430
|
-
PAGES: {
|
|
630
|
+
PAGES: {
|
|
631
|
+
table: "pages",
|
|
632
|
+
partitions: pagesPartitions
|
|
633
|
+
},
|
|
431
634
|
URLS: {
|
|
432
635
|
table: "pages",
|
|
433
636
|
keys: [urlsKey]
|
|
@@ -626,4 +829,4 @@ const DEFAULT_ROLLUPS = [
|
|
|
626
829
|
sitemapHealthRollup,
|
|
627
830
|
sitemapChanges28dRollup
|
|
628
831
|
];
|
|
629
|
-
export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
832
|
+
export { DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.19.3",
|
|
4
|
+
"version": "0.19.6",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -169,8 +169,8 @@
|
|
|
169
169
|
"dependencies": {
|
|
170
170
|
"drizzle-orm": "^0.45.2",
|
|
171
171
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"@gscdump/contracts": "0.19.3",
|
|
173
|
-
"gscdump": "0.19.3"
|
|
172
|
+
"gscdump": "0.19.6",
|
|
173
|
+
"@gscdump/contracts": "0.19.6"
|
|
174
174
|
},
|
|
175
175
|
"devDependencies": {
|
|
176
176
|
"@duckdb/duckdb-wasm": "^1.32.0",
|