@gscdump/engine 0.25.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/compaction.mjs +247 -0
- package/dist/_chunks/engine.mjs +19 -1
- package/dist/_chunks/parquet-plan.mjs +3 -244
- package/dist/_chunks/resolver.mjs +1 -1
- package/dist/_chunks/storage.d.mts +66 -1
- package/dist/compaction-public.d.mts +15 -0
- package/dist/compaction-public.mjs +5 -0
- package/dist/index.mjs +2 -1
- package/dist/planner.mjs +2 -1
- package/package.json +8 -3
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { dayPartition, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, weekPartition } from "./layout.mjs";
|
|
2
|
+
import { currentSchemaVersion } from "./schema.mjs";
|
|
3
|
+
import { MS_PER_DAY } from "gscdump";
|
|
4
|
+
// Default stage thresholds in days, keyed by the stage's input tier. They seed
// `cutoffDays` on each STAGES entry and are the fallback for caller overrides.
const DEFAULT_THRESHOLDS = { raw: 7, d7: 30, d30: 90 };

// Partition-key shapes, one per tier. Every pattern is anchored at both ends
// and captures the date portion that follows the tier prefix.
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
// Two captures here: the year, then the quarter digit (1-4).
const QUARTERLY_PARTITION_RE = /^quarterly\/(\d{4})-Q([1-4])$/;
|
|
13
|
+
/**
 * Count the manifest entries that are raw *daily* files.
 *
 * An entry counts when its partition starts with "daily/" and its tier is
 * either "raw" or absent (null/undefined).
 *
 * The partition check applies to both tier cases. The raw→d7 stage lists
 * `tier: "raw"` entries but skips `hourly/` partitions and only buckets
 * `daily/` ones, so a raw entry in any other partition shape can never be
 * consumed by daily→weekly compaction and must not count towards its gate.
 *
 * @param entries Array of objects with `partition` (string) and optional `tier`.
 * @returns Number of raw daily entries.
 */
function countRawDailies(entries) {
	return entries.filter((e) => (e.tier === "raw" || e.tier == null) && e.partition.startsWith("daily/")).length;
}
|
|
16
|
+
// Floor, in days, applied to every stage's cutoff: runStage uses
// max(stage.cutoffDays, PENDING_WINDOW_DAYS) when computing how old a bucket
// must be before it is compacted.
const PENDING_WINDOW_DAYS = 4;

// The compaction pipeline, in execution order. For each stage:
//   inputTier / outputTier - manifest tier read from / written to
//   cutoffDays             - default age threshold (replaced per run)
//   bucketKey(entry)       - output bucket for an input entry, or undefined
//                            when the entry's partition has the wrong shape
//   bucketLatestMs(bucket) - timestamp compared against the cutoff
//   outputPartition(bucket)- partition key the merged file is written under
const STAGES = [
	{
		// daily files -> one weekly file per Monday-keyed week
		inputTier: "raw",
		outputTier: "d7",
		cutoffDays: DEFAULT_THRESHOLDS.raw,
		bucketKey: (e) => {
			const hit = e.partition.match(DAILY_PARTITION_RE);
			return hit ? mondayOfWeek(hit[1]) : void 0;
		},
		// Midnight UTC of the Monday plus six days.
		bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
		outputPartition: weekPartition
	},
	{
		// weekly files -> one monthly file, keyed by the YYYY-MM of the week's date
		inputTier: "d7",
		outputTier: "d30",
		cutoffDays: DEFAULT_THRESHOLDS.d7,
		bucketKey: (e) => {
			const hit = e.partition.match(WEEKLY_PARTITION_RE);
			return hit ? hit[1].slice(0, 7) : void 0;
		},
		bucketLatestMs: monthEndMs,
		outputPartition: monthPartition
	},
	{
		// monthly files -> one quarterly file
		inputTier: "d30",
		outputTier: "d90",
		cutoffDays: DEFAULT_THRESHOLDS.d30,
		bucketKey: (e) => {
			const hit = e.partition.match(MONTHLY_PARTITION_RE);
			return hit ? quarterOfMonth(hit[1]) : void 0;
		},
		bucketLatestMs: quarterEndMs,
		outputPartition: quarterPartition
	}
];
|
|
55
|
+
/**
 * Run every compaction stage in STAGES, in order, for one table.
 *
 * Each stage runs with `cutoffDays` taken from the effective thresholds, which
 * are DEFAULT_THRESHOLDS with any usable caller override applied. Thresholds
 * are keyed by the stage's input tier (`raw`, `d7`, `d30`).
 *
 * An override that is present but `undefined`, `null` or `NaN` is ignored and
 * the default is used instead. Copying such a value through would make the
 * stage cutoff `NaN`; every "is this bucket still too recent?" comparison
 * against `NaN` is false, so recent buckets would be compacted too.
 *
 * @param deps Forwarded unchanged to runStage.
 * @param ctx Forwarded unchanged to runStage.
 * @param now Reference timestamp forwarded to runStage.
 * @param overrides Optional per-tier thresholds in days (`raw`, `d7`, `d30`).
 * @returns Promise that resolves once the last stage has finished.
 */
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
	const thresholds = { ...DEFAULT_THRESHOLDS };
	for (const tier of Object.keys(DEFAULT_THRESHOLDS)) {
		const requested = overrides?.[tier];
		if (requested == null || Number.isNaN(requested)) continue;
		thresholds[tier] = requested;
	}
	// Stages run one after another: a later stage reads the tier the previous one writes.
	for (const stage of STAGES) {
		await runStage(deps, ctx, {
			...stage,
			cutoffDays: thresholds[stage.inputTier]
		}, now);
	}
}
|
|
66
|
+
/**
 * Execute one compaction stage: merge eligible input-tier files into one
 * output-tier file per (search type, bucket).
 *
 * Live entries of `stage.inputTier` are grouped by search type and bucket.
 * A bucket is eligible only when `stage.bucketLatestMs(bucket)` is strictly
 * older than `now` minus max(stage.cutoffDays, PENDING_WINDOW_DAYS) days.
 * `hourly/` partitions and entries the stage cannot bucket are ignored.
 * Each group is merged under a lock on its target partition, and the new
 * entry is registered together with the input entries it was built from.
 *
 * @param deps Provides `manifestStore`, `codec` and `dataSource`.
 * @param ctx Provides `userId`, `siteId` and `table`.
 * @param stage One STAGES entry with `cutoffDays` already resolved.
 * @param now Reference timestamp; also used as the new entry's `createdAt`.
 */
async function runStage(deps, ctx, stage, now) {
	const minAgeDays = Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS);
	const cutoff = now - minAgeDays * MS_PER_DAY;
	// Fresh scope object per call, extended with the caller's extra field.
	const scoped = (extra) => ({
		userId: ctx.userId,
		siteId: ctx.siteId,
		table: ctx.table,
		...extra
	});
	const candidates = await deps.manifestStore.listLive(scoped({ tier: stage.inputTier }));

	// Group key is "<searchType>\0<bucket>".
	const grouped = /* @__PURE__ */ new Map();
	for (const candidate of candidates) {
		if (candidate.partition.startsWith("hourly/")) continue;
		const bucket = stage.bucketKey(candidate);
		if (!bucket || stage.bucketLatestMs(bucket) >= cutoff) continue;
		const groupKey = `${inferSearchType(candidate)}\0${bucket}`;
		const members = grouped.get(groupKey);
		if (members) members.push(candidate);
		else grouped.set(groupKey, [candidate]);
	}

	for (const [groupKey, members] of grouped) {
		const [searchType, bucket] = groupKey.split("\0");
		const targetPartition = stage.outputPartition(bucket);
		// A lone entry that already sits in the target partition needs no rewrite.
		const alreadyInPlace = members.length === 1 && members[0].partition === targetPartition;
		if (alreadyInPlace) continue;
		await deps.manifestStore.withLock(scoped({ partition: targetPartition }), async () => {
			const outputKey = objectKey(ctx, ctx.table, targetPartition, now, searchType);
			const sourceKeys = members.map((member) => member.objectKey);
			const merged = await deps.codec.compactRows({ table: ctx.table }, sourceKeys, outputKey, deps.dataSource);
			const compacted = {
				userId: ctx.userId,
				siteId: ctx.siteId,
				table: ctx.table,
				partition: targetPartition,
				objectKey: outputKey,
				rowCount: merged.rowCount,
				bytes: merged.bytes,
				createdAt: now,
				schemaVersion: currentSchemaVersion(ctx.table),
				tier: stage.outputTier
			};
			// "web" entries carry no searchType field.
			if (searchType !== "web") compacted.searchType = searchType;
			await deps.manifestStore.registerVersion(compacted, members);
		});
	}
}
|
|
113
|
+
/**
 * List every partition key that can hold data for an inclusive date range.
 *
 * The range is walked one UTC day at a time. Each day's daily partition is
 * emitted, followed by that day's weekly, monthly and quarterly partitions
 * the first time each is encountered.
 *
 * @param startDate First day, as "YYYY-MM-DD".
 * @param endDate Last day (inclusive), as "YYYY-MM-DD".
 * @returns Partition keys in walk order; empty when endDate precedes startDate.
 */
function enumeratePartitions(startDate, endDate) {
	const toUtcMidnight = (isoDate) => {
		const [year, month, day] = isoDate.split("-").map(Number);
		return Date.UTC(year, month - 1, day);
	};
	const pad2 = (value) => String(value).padStart(2, "0");

	const firstMs = toUtcMidnight(startDate);
	const lastMs = toUtcMidnight(endDate);
	const partitions = [];
	if (lastMs < firstMs) return partitions;

	const weeksSeen = /* @__PURE__ */ new Set();
	const monthsSeen = /* @__PURE__ */ new Set();
	const quartersSeen = /* @__PURE__ */ new Set();
	const stepMs = 864e5;

	let cursor = firstMs;
	while (cursor <= lastMs) {
		const date = new Date(cursor);
		const isoMonth = `${date.getUTCFullYear()}-${pad2(date.getUTCMonth() + 1)}`;
		const isoDay = `${isoMonth}-${pad2(date.getUTCDate())}`;
		partitions.push(dayPartition(isoDay));

		const monday = mondayOfWeek(isoDay);
		if (!weeksSeen.has(monday)) {
			weeksSeen.add(monday);
			partitions.push(weekPartition(monday));
		}
		if (!monthsSeen.has(isoMonth)) {
			monthsSeen.add(isoMonth);
			partitions.push(monthPartition(isoMonth));
		}
		const quarter = quarterOfMonth(isoMonth);
		if (!quartersSeen.has(quarter)) {
			quartersSeen.add(quarter);
			partitions.push(quarterPartition(quarter));
		}
		cursor += stepMs;
	}
	return partitions;
}
|
|
147
|
+
/**
 * Translate a partition key into its tier rank and covered day range.
 *
 * `rank` is 0 for daily, 1 for weekly, 2 for monthly and 3 for quarterly.
 * `startMs` and `endMs` are UTC-midnight timestamps of the first and last
 * covered day (equal for a daily partition).
 *
 * @param partition Partition key such as "daily/2026-01-05" or "quarterly/2026-Q1".
 * @returns `{ rank, startMs, endMs }`, or undefined for any other key shape.
 */
function partitionSpan(partition) {
	const daily = partition.match(DAILY_PARTITION_RE);
	if (daily) {
		const dayMs = Date.parse(`${daily[1]}T00:00:00Z`);
		return { rank: 0, startMs: dayMs, endMs: dayMs };
	}

	const weekly = partition.match(WEEKLY_PARTITION_RE);
	if (weekly) {
		const weekStartMs = Date.parse(`${weekly[1]}T00:00:00Z`);
		return { rank: 1, startMs: weekStartMs, endMs: weekStartMs + 6 * MS_PER_DAY };
	}

	const monthly = partition.match(MONTHLY_PARTITION_RE);
	if (monthly) {
		const [year, month] = monthly[1].split("-").map(Number);
		// Day 0 of the following month resolves to the last day of this one.
		return { rank: 2, startMs: Date.UTC(year, month - 1, 1), endMs: Date.UTC(year, month, 0) };
	}

	const quarterly = partition.match(QUARTERLY_PARTITION_RE);
	if (quarterly) {
		const year = Number(quarterly[1]);
		const quarter = Number(quarterly[2]);
		const firstMonthIndex = (quarter - 1) * 3;
		return { rank: 3, startMs: Date.UTC(year, firstMonthIndex, 1), endMs: Date.UTC(year, quarter * 3, 0) };
	}

	return void 0;
}
|
|
186
|
+
/**
 * Partition manifest entries into those worth reading (`kept`) and those
 * whose covered days are all claimed by entries processed earlier (`subsumed`).
 *
 * Entries with an unrecognised partition shape are always kept. The rest are
 * processed lowest rank first (daily before weekly before monthly before
 * quarterly) and, within a rank, highest `createdAt` first. Claimed days are
 * tracked separately per search type. When `queryRange` is given, each
 * entry's days are clamped to that window; an entry with no day inside the
 * window is reported as subsumed.
 *
 * @param entries Manifest entries (`partition`, `createdAt`, search-type info).
 * @param queryRange Optional `{ start, end }` of "YYYY-MM-DD" days, inclusive.
 * @returns `{ kept, subsumed }`.
 */
function splitOverlappingTiers(entries, queryRange) {
	const midnightUtc = (isoDay) => Date.parse(`${isoDay}T00:00:00Z`);
	const rangeStartMs = queryRange ? midnightUtc(queryRange.start) : void 0;
	const rangeEndMs = queryRange ? midnightUtc(queryRange.end) : void 0;

	const kept = [];
	const subsumed = [];
	const ranked = [];

	for (const entry of entries) {
		const span = partitionSpan(entry.partition);
		if (!span) {
			kept.push(entry);
			continue;
		}
		const days = [];
		let dayMs = span.startMs;
		while (dayMs <= span.endMs) {
			const outsideWindow = rangeStartMs !== void 0 && (dayMs < rangeStartMs || dayMs > rangeEndMs);
			if (!outsideWindow) days.push(dayMs);
			dayMs += MS_PER_DAY;
		}
		if (queryRange && days.length === 0) subsumed.push(entry);
		else ranked.push({ entry, rank: span.rank, days });
	}

	// Lowest rank first; ties broken by higher createdAt.
	ranked.sort((left, right) => left.rank - right.rank || right.entry.createdAt - left.entry.createdAt);

	const claimedBySearchType = /* @__PURE__ */ new Map();
	for (const { entry, days } of ranked) {
		const searchType = inferSearchType(entry);
		if (!claimedBySearchType.has(searchType)) claimedBySearchType.set(searchType, /* @__PURE__ */ new Set());
		const claimed = claimedBySearchType.get(searchType);
		const addsCoverage = days.some((day) => !claimed.has(day));
		if (addsCoverage) {
			kept.push(entry);
			for (const day of days) claimed.add(day);
		} else {
			subsumed.push(entry);
		}
	}

	return { kept, subsumed };
}
|
|
234
|
+
/**
 * Return only the entries worth reading: the `kept` half of
 * splitOverlappingTiers, called with the same arguments.
 *
 * @param entries Manifest entries to filter.
 * @param queryRange Optional `{ start, end }` window, passed through unchanged.
 * @returns The kept entries.
 */
function dedupeOverlappingTiers(entries, queryRange) {
	const { kept } = splitOverlappingTiers(entries, queryRange);
	return kept;
}
|
|
237
|
+
/**
 * Last millisecond (UTC) of a calendar month.
 *
 * @param month Month as "YYYY-MM".
 * @returns Epoch milliseconds of 23:59:59.999 UTC on the month's last day.
 */
function monthEndMs(month) {
	const parts = month.split("-");
	const year = Number(parts[0]);
	const monthNumber = Number(parts[1]);
	// A 1-based month number used as Date.UTC's 0-based month index names the
	// following month; one millisecond before its first instant is this month's end.
	return Date.UTC(year, monthNumber, 1) - 1;
}
|
|
241
|
+
/**
 * Last millisecond (UTC) of a calendar quarter.
 *
 * @param quarter Quarter as "YYYY-Qn", e.g. "2026-Q1".
 * @returns Epoch milliseconds of 23:59:59.999 UTC on the quarter's last day.
 */
function quarterEndMs(quarter) {
	const pieces = quarter.split("-Q");
	const year = Number(pieces[0]);
	const quarterNumber = Number(pieces[1]);
	// quarterNumber * 3 is the 0-based index of the month after the quarter;
	// one millisecond before that month begins is the quarter's end.
	return Date.UTC(year, quarterNumber * 3, 1) - 1;
}
|
|
247
|
+
export { compactTieredImpl, countRawDailies, dedupeOverlappingTiers, enumeratePartitions, splitOverlappingTiers };
|
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
|
|
2
2
|
import { SCHEMAS, currentSchemaVersion, dedupeByNaturalKey } from "./schema.mjs";
|
|
3
|
-
import { compactTieredImpl,
|
|
3
|
+
import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
|
|
4
|
+
import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
|
|
4
5
|
import { sqlEscape } from "../sql-bind.mjs";
|
|
5
6
|
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
6
7
|
import { normalizeUrl } from "gscdump";
|
|
@@ -485,6 +486,22 @@ function createStorageEngine(opts) {
|
|
|
485
486
|
codec
|
|
486
487
|
}, ctx, (ctx.now ?? defaultNow)(), thresholds);
|
|
487
488
|
}
|
|
489
|
+
/**
 * Retire every live manifest entry of a table that splitOverlappingTiers
 * reports as subsumed.
 *
 * All live entries for the table are listed (no tier filter) and split with
 * no query range. The subsumed set, if any, is handed to
 * `manifestStore.registerVersions` with an empty list of new entries.
 *
 * @param ctx Provides `userId`, `siteId` and `table`.
 * @returns `{ retired, partitions }`: how many entries were retired, and the
 *          partition of each one.
 */
async function reconcileSubsumed(ctx) {
	const liveEntries = await manifestStore.listLive({
		userId: ctx.userId,
		siteId: ctx.siteId,
		table: ctx.table
	});
	const stale = splitOverlappingTiers(liveEntries).subsumed;
	// Nothing to retire means no manifest write at all.
	if (stale.length > 0) await manifestStore.registerVersions([], stale);
	return {
		retired: stale.length,
		partitions: stale.map((entry) => entry.partition)
	};
}
|
|
488
505
|
async function gcOrphans(ctx, graceMs) {
|
|
489
506
|
return gcOrphansImpl({
|
|
490
507
|
dataSource,
|
|
@@ -590,6 +607,7 @@ function createStorageEngine(opts) {
|
|
|
590
607
|
query,
|
|
591
608
|
runSQL,
|
|
592
609
|
compactTiered,
|
|
610
|
+
reconcileSubsumed,
|
|
593
611
|
gcOrphans,
|
|
594
612
|
purgeTenant,
|
|
595
613
|
purgeUrls,
|
|
@@ -1,248 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { dimensionToColumn } from "./schema.mjs";
|
|
2
|
+
import { enumeratePartitions } from "./compaction.mjs";
|
|
3
3
|
import { METRIC_EXPR, escapeLike, topLevelPagePredicateSql } from "../sql-fragments.mjs";
|
|
4
4
|
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
5
|
-
import { MS_PER_DAY } from "gscdump";
|
|
6
|
-
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
|
|
7
|
-
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
|
|
8
|
-
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
|
|
9
|
-
const QUARTERLY_PARTITION_RE = /^quarterly\/(\d{4})-Q([1-4])$/;
|
|
10
|
-
const DEFAULT_THRESHOLDS = {
|
|
11
|
-
raw: 7,
|
|
12
|
-
d7: 30,
|
|
13
|
-
d30: 90
|
|
14
|
-
};
|
|
15
|
-
const PENDING_WINDOW_DAYS = 4;
|
|
16
|
-
const STAGES = [
|
|
17
|
-
{
|
|
18
|
-
inputTier: "raw",
|
|
19
|
-
outputTier: "d7",
|
|
20
|
-
cutoffDays: DEFAULT_THRESHOLDS.raw,
|
|
21
|
-
bucketKey: (e) => {
|
|
22
|
-
const m = e.partition.match(DAILY_PARTITION_RE);
|
|
23
|
-
if (!m) return void 0;
|
|
24
|
-
return mondayOfWeek(m[1]);
|
|
25
|
-
},
|
|
26
|
-
bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
|
|
27
|
-
outputPartition: weekPartition
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
inputTier: "d7",
|
|
31
|
-
outputTier: "d30",
|
|
32
|
-
cutoffDays: DEFAULT_THRESHOLDS.d7,
|
|
33
|
-
bucketKey: (e) => {
|
|
34
|
-
const m = e.partition.match(WEEKLY_PARTITION_RE);
|
|
35
|
-
if (!m) return void 0;
|
|
36
|
-
return m[1].slice(0, 7);
|
|
37
|
-
},
|
|
38
|
-
bucketLatestMs: monthEndMs,
|
|
39
|
-
outputPartition: monthPartition
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
inputTier: "d30",
|
|
43
|
-
outputTier: "d90",
|
|
44
|
-
cutoffDays: DEFAULT_THRESHOLDS.d30,
|
|
45
|
-
bucketKey: (e) => {
|
|
46
|
-
const m = e.partition.match(MONTHLY_PARTITION_RE);
|
|
47
|
-
if (!m) return void 0;
|
|
48
|
-
return quarterOfMonth(m[1]);
|
|
49
|
-
},
|
|
50
|
-
bucketLatestMs: quarterEndMs,
|
|
51
|
-
outputPartition: quarterPartition
|
|
52
|
-
}
|
|
53
|
-
];
|
|
54
|
-
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
|
|
55
|
-
const thresholds = {
|
|
56
|
-
...DEFAULT_THRESHOLDS,
|
|
57
|
-
...overrides
|
|
58
|
-
};
|
|
59
|
-
const stagesWithThresholds = STAGES.map((s) => ({
|
|
60
|
-
...s,
|
|
61
|
-
cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
|
|
62
|
-
}));
|
|
63
|
-
for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
|
|
64
|
-
}
|
|
65
|
-
async function runStage(deps, ctx, stage, now) {
|
|
66
|
-
const cutoff = now - Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS) * MS_PER_DAY;
|
|
67
|
-
const candidates = await deps.manifestStore.listLive({
|
|
68
|
-
userId: ctx.userId,
|
|
69
|
-
siteId: ctx.siteId,
|
|
70
|
-
table: ctx.table,
|
|
71
|
-
tier: stage.inputTier
|
|
72
|
-
});
|
|
73
|
-
const buckets = /* @__PURE__ */ new Map();
|
|
74
|
-
for (const entry of candidates) {
|
|
75
|
-
if (entry.partition.startsWith("hourly/")) continue;
|
|
76
|
-
const key = stage.bucketKey(entry);
|
|
77
|
-
if (!key) continue;
|
|
78
|
-
if (stage.bucketLatestMs(key) >= cutoff) continue;
|
|
79
|
-
const compositeKey = `${inferSearchType(entry)}\0${key}`;
|
|
80
|
-
if (!buckets.has(compositeKey)) buckets.set(compositeKey, []);
|
|
81
|
-
buckets.get(compositeKey).push(entry);
|
|
82
|
-
}
|
|
83
|
-
for (const [compositeKey, entries] of buckets) {
|
|
84
|
-
const [searchType, bucket] = compositeKey.split("\0");
|
|
85
|
-
const targetPartition = stage.outputPartition(bucket);
|
|
86
|
-
if (entries.length === 1 && entries[0].partition === targetPartition) continue;
|
|
87
|
-
await deps.manifestStore.withLock({
|
|
88
|
-
userId: ctx.userId,
|
|
89
|
-
siteId: ctx.siteId,
|
|
90
|
-
table: ctx.table,
|
|
91
|
-
partition: targetPartition
|
|
92
|
-
}, async () => {
|
|
93
|
-
const key = objectKey(ctx, ctx.table, targetPartition, now, searchType);
|
|
94
|
-
const { bytes, rowCount } = await deps.codec.compactRows({ table: ctx.table }, entries.map((e) => e.objectKey), key, deps.dataSource);
|
|
95
|
-
const newEntry = {
|
|
96
|
-
userId: ctx.userId,
|
|
97
|
-
siteId: ctx.siteId,
|
|
98
|
-
table: ctx.table,
|
|
99
|
-
partition: targetPartition,
|
|
100
|
-
objectKey: key,
|
|
101
|
-
rowCount,
|
|
102
|
-
bytes,
|
|
103
|
-
createdAt: now,
|
|
104
|
-
schemaVersion: currentSchemaVersion(ctx.table),
|
|
105
|
-
tier: stage.outputTier,
|
|
106
|
-
...searchType !== "web" ? { searchType } : {}
|
|
107
|
-
};
|
|
108
|
-
await deps.manifestStore.registerVersion(newEntry, entries);
|
|
109
|
-
});
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
function enumeratePartitions(startDate, endDate) {
|
|
113
|
-
const out = [];
|
|
114
|
-
const [sy, sm, sd] = startDate.split("-").map(Number);
|
|
115
|
-
const [ey, em, ed] = endDate.split("-").map(Number);
|
|
116
|
-
const start = Date.UTC(sy, sm - 1, sd);
|
|
117
|
-
const end = Date.UTC(ey, em - 1, ed);
|
|
118
|
-
if (end < start) return out;
|
|
119
|
-
const seenWeeks = /* @__PURE__ */ new Set();
|
|
120
|
-
const seenMonths = /* @__PURE__ */ new Set();
|
|
121
|
-
const seenQuarters = /* @__PURE__ */ new Set();
|
|
122
|
-
for (let t = start; t <= end; t += 864e5) {
|
|
123
|
-
const d = new Date(t);
|
|
124
|
-
const y = d.getUTCFullYear();
|
|
125
|
-
const m = String(d.getUTCMonth() + 1).padStart(2, "0");
|
|
126
|
-
const isoDay = `${y}-${m}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
127
|
-
const isoMonth = `${y}-${m}`;
|
|
128
|
-
out.push(dayPartition(isoDay));
|
|
129
|
-
const monday = mondayOfWeek(isoDay);
|
|
130
|
-
if (!seenWeeks.has(monday)) {
|
|
131
|
-
seenWeeks.add(monday);
|
|
132
|
-
out.push(weekPartition(monday));
|
|
133
|
-
}
|
|
134
|
-
if (!seenMonths.has(isoMonth)) {
|
|
135
|
-
seenMonths.add(isoMonth);
|
|
136
|
-
out.push(monthPartition(isoMonth));
|
|
137
|
-
}
|
|
138
|
-
const quarter = quarterOfMonth(isoMonth);
|
|
139
|
-
if (!seenQuarters.has(quarter)) {
|
|
140
|
-
seenQuarters.add(quarter);
|
|
141
|
-
out.push(quarterPartition(quarter));
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
return out;
|
|
145
|
-
}
|
|
146
|
-
function partitionSpan(partition) {
|
|
147
|
-
let m = partition.match(DAILY_PARTITION_RE);
|
|
148
|
-
if (m) {
|
|
149
|
-
const ms = Date.parse(`${m[1]}T00:00:00Z`);
|
|
150
|
-
return {
|
|
151
|
-
rank: 0,
|
|
152
|
-
startMs: ms,
|
|
153
|
-
endMs: ms
|
|
154
|
-
};
|
|
155
|
-
}
|
|
156
|
-
m = partition.match(WEEKLY_PARTITION_RE);
|
|
157
|
-
if (m) {
|
|
158
|
-
const ms = Date.parse(`${m[1]}T00:00:00Z`);
|
|
159
|
-
return {
|
|
160
|
-
rank: 1,
|
|
161
|
-
startMs: ms,
|
|
162
|
-
endMs: ms + 6 * MS_PER_DAY
|
|
163
|
-
};
|
|
164
|
-
}
|
|
165
|
-
m = partition.match(MONTHLY_PARTITION_RE);
|
|
166
|
-
if (m) {
|
|
167
|
-
const [y, mo] = m[1].split("-").map(Number);
|
|
168
|
-
return {
|
|
169
|
-
rank: 2,
|
|
170
|
-
startMs: Date.UTC(y, mo - 1, 1),
|
|
171
|
-
endMs: Date.UTC(y, mo, 0)
|
|
172
|
-
};
|
|
173
|
-
}
|
|
174
|
-
m = partition.match(QUARTERLY_PARTITION_RE);
|
|
175
|
-
if (m) {
|
|
176
|
-
const y = Number(m[1]);
|
|
177
|
-
const q = Number(m[2]);
|
|
178
|
-
return {
|
|
179
|
-
rank: 3,
|
|
180
|
-
startMs: Date.UTC(y, (q - 1) * 3, 1),
|
|
181
|
-
endMs: Date.UTC(y, q * 3, 0)
|
|
182
|
-
};
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
function splitOverlappingTiers(entries, queryRange) {
|
|
186
|
-
const rangeStartMs = queryRange ? Date.parse(`${queryRange.start}T00:00:00Z`) : void 0;
|
|
187
|
-
const rangeEndMs = queryRange ? Date.parse(`${queryRange.end}T00:00:00Z`) : void 0;
|
|
188
|
-
const spanned = [];
|
|
189
|
-
const kept = [];
|
|
190
|
-
const subsumed = [];
|
|
191
|
-
for (const entry of entries) {
|
|
192
|
-
const span = partitionSpan(entry.partition);
|
|
193
|
-
if (!span) {
|
|
194
|
-
kept.push(entry);
|
|
195
|
-
continue;
|
|
196
|
-
}
|
|
197
|
-
const days = [];
|
|
198
|
-
for (let t = span.startMs; t <= span.endMs; t += MS_PER_DAY) {
|
|
199
|
-
if (rangeStartMs !== void 0 && (t < rangeStartMs || t > rangeEndMs)) continue;
|
|
200
|
-
days.push(t);
|
|
201
|
-
}
|
|
202
|
-
if (queryRange && days.length === 0) {
|
|
203
|
-
subsumed.push(entry);
|
|
204
|
-
continue;
|
|
205
|
-
}
|
|
206
|
-
spanned.push({
|
|
207
|
-
entry,
|
|
208
|
-
rank: span.rank,
|
|
209
|
-
days
|
|
210
|
-
});
|
|
211
|
-
}
|
|
212
|
-
spanned.sort((a, b) => a.rank - b.rank || b.entry.createdAt - a.entry.createdAt);
|
|
213
|
-
const coveredBySearchType = /* @__PURE__ */ new Map();
|
|
214
|
-
for (const { entry, days } of spanned) {
|
|
215
|
-
const slice = inferSearchType(entry);
|
|
216
|
-
let covered = coveredBySearchType.get(slice);
|
|
217
|
-
if (!covered) {
|
|
218
|
-
covered = /* @__PURE__ */ new Set();
|
|
219
|
-
coveredBySearchType.set(slice, covered);
|
|
220
|
-
}
|
|
221
|
-
if (days.every((d) => covered.has(d))) {
|
|
222
|
-
subsumed.push(entry);
|
|
223
|
-
continue;
|
|
224
|
-
}
|
|
225
|
-
kept.push(entry);
|
|
226
|
-
for (const d of days) covered.add(d);
|
|
227
|
-
}
|
|
228
|
-
return {
|
|
229
|
-
kept,
|
|
230
|
-
subsumed
|
|
231
|
-
};
|
|
232
|
-
}
|
|
233
|
-
function dedupeOverlappingTiers(entries, queryRange) {
|
|
234
|
-
return splitOverlappingTiers(entries, queryRange).kept;
|
|
235
|
-
}
|
|
236
|
-
function monthEndMs(month) {
|
|
237
|
-
const [y, m] = month.split("-").map(Number);
|
|
238
|
-
return Date.UTC(y, m, 0, 23, 59, 59, 999);
|
|
239
|
-
}
|
|
240
|
-
function quarterEndMs(quarter) {
|
|
241
|
-
const [yStr, qStr] = quarter.split("-Q");
|
|
242
|
-
const y = Number(yStr);
|
|
243
|
-
const q = Number(qStr);
|
|
244
|
-
return Date.UTC(y, q * 3, 0, 23, 59, 59, 999);
|
|
245
|
-
}
|
|
246
5
|
const FILES_PLACEHOLDER = "{{FILES}}";
|
|
247
6
|
function buildDimensionWhere(filters, table) {
|
|
248
7
|
const clauses = [];
|
|
@@ -377,4 +136,4 @@ function substituteNamedFiles(sql, sets) {
|
|
|
377
136
|
for (const [name, keys] of Object.entries(sets)) out = out.replace(new RegExp(`\\{\\{${name}\\}\\}`, "g"), fileList(keys));
|
|
378
137
|
return out;
|
|
379
138
|
}
|
|
380
|
-
export { FILES_PLACEHOLDER,
|
|
139
|
+
export { FILES_PLACEHOLDER, compileLogicalQueryPlan, resolveParquetSQL, substituteNamedFiles };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { SCHEMAS, drizzleSchema } from "./schema.mjs";
|
|
2
|
-
import { enumeratePartitions } from "./parquet-plan.mjs";
|
|
2
|
+
import { enumeratePartitions } from "./compaction.mjs";
|
|
3
3
|
import { escapeLike } from "../sql-fragments.mjs";
|
|
4
4
|
import "../planner.mjs";
|
|
5
5
|
import { UnresolvableDatasetError, buildLogicalComparisonPlan, buildLogicalPlan, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
|
|
@@ -12,6 +12,48 @@ interface CompactionThresholds {
|
|
|
12
12
|
d30?: number;
|
|
13
13
|
}
|
|
14
14
|
declare function enumeratePartitions(startDate: string, endDate: string): string[];
|
|
15
|
+
/**
|
|
16
|
+
* Split manifest entries into the set worth reading (`kept`) and the set whose
|
|
17
|
+
* every covered day is already served by a finer-or-newer live entry
|
|
18
|
+
* (`subsumed`).
|
|
19
|
+
*
|
|
20
|
+
* Tiered compaction (daily→weekly→monthly→quarterly) is meant to retire its
|
|
21
|
+
* inputs, but coarse files can outlive their finer counterparts: a D1→R2
|
|
22
|
+
* backfill writes daily files that compact to monthly while a later re-sync
|
|
23
|
+
* writes fresh daily/weekly for the same dates, and same-partition re-writes
|
|
24
|
+
* leave a stale prior version live. All stay live, the resolver unions every
|
|
25
|
+
* live tier whose partition intersects the range, and `union_by_name` sums the
|
|
26
|
+
* overlap — impressions/clicks double-count.
|
|
27
|
+
*
|
|
28
|
+
* Entries are walked finest-tier-first, newest-first within a tier, so a
|
|
29
|
+
* coarse or stale file is dropped only when every day it covers is already
|
|
30
|
+
* claimed. Subsumption is evaluated per searchType — a `web` monthly never
|
|
31
|
+
* cancels a `discover` weekly, they cover disjoint data. Partial
|
|
32
|
+
* month-boundary overlap (a weekly straddling two months alongside a kept
|
|
33
|
+
* monthly) still double-counts those boundary days — eliminating that needs
|
|
34
|
+
* per-file date predicates in the SQL, tracked separately. Unrecognised
|
|
35
|
+
* partition shapes (`hourly/`, sidecar keys) are always kept.
|
|
36
|
+
*
|
|
37
|
+
* `queryRange` clamps every entry's day-span to the window the caller will
|
|
38
|
+
* actually read. This is required when `entries` came from a partition-
|
|
39
|
+
* filtered `listLive` (`runSQL` enumerates only the partitions intersecting
|
|
40
|
+
* the query): a `monthly/2026-04` whose Apr 27-30 falls past the query end
|
|
41
|
+
* must not be judged "unsubsumed" just because `weekly/2026-04-27` wasn't
|
|
42
|
+
* enumerated — those out-of-window days are SQL-filtered to nothing anyway.
|
|
43
|
+
* Omit `queryRange` when `entries` is the full manifest (e.g. analysis-sources).
|
|
44
|
+
*/
|
|
45
|
+
declare function splitOverlappingTiers(entries: ManifestEntry[], queryRange?: {
|
|
46
|
+
start: string;
|
|
47
|
+
end: string;
|
|
48
|
+
}): {
|
|
49
|
+
kept: ManifestEntry[];
|
|
50
|
+
subsumed: ManifestEntry[];
|
|
51
|
+
};
|
|
52
|
+
/** Entries worth reading — see {@link splitOverlappingTiers}. */
|
|
53
|
+
declare function dedupeOverlappingTiers(entries: ManifestEntry[], queryRange?: {
|
|
54
|
+
start: string;
|
|
55
|
+
end: string;
|
|
56
|
+
}): ManifestEntry[];
|
|
15
57
|
interface WriteCtx extends TenantCtx {
|
|
16
58
|
table: TableName;
|
|
17
59
|
date?: string;
|
|
@@ -428,6 +470,29 @@ interface StorageEngine {
|
|
|
428
470
|
*/
|
|
429
471
|
runSQL: (opts: RunSQLOptions) => Promise<QueryResult>;
|
|
430
472
|
compactTiered: (ctx: WriteCtx, thresholds?: CompactionThresholds) => Promise<void>;
|
|
473
|
+
/**
|
|
474
|
+
* Write-time half of the manifest tier invariant: retire every live entry
|
|
475
|
+
* whose every covered day is already served by a finer-or-newer live entry.
|
|
476
|
+
*
|
|
477
|
+
* `compactTiered` retires the inputs it merges, but cannot retire a coarse
|
|
478
|
+
* partition that outlived the finer files it should have superseded (a D1→R2
|
|
479
|
+
* backfill writing coarse directly, a re-sync landing fresh dailies after a
|
|
480
|
+
* month already rolled up). Those stale overlaps make the query resolver
|
|
481
|
+
* union the same dates twice. Subsumption is evaluated per searchType, over
|
|
482
|
+
* the full live set (so a `web` monthly never cancels a `discover` weekly),
|
|
483
|
+
* then the subsumed set is retired via the manifest's `registerVersions([], …)`
|
|
484
|
+
* primitive — atomic, no inserts. Safe by construction: it only drops files
|
|
485
|
+
* whose days are already covered, so no data is lost.
|
|
486
|
+
*
|
|
487
|
+
* Reads and retires through the engine's own manifest store, so it is
|
|
488
|
+
* read-your-writes-consistent with the `compactTiered` that precedes it.
|
|
489
|
+
* Returns audit counters. Hosts running a cached manifest store must bust
|
|
490
|
+
* their cache afterwards — the engine has no knowledge of host-side caching.
|
|
491
|
+
*/
|
|
492
|
+
reconcileSubsumed: (ctx: WriteCtx) => Promise<{
|
|
493
|
+
retired: number;
|
|
494
|
+
partitions: string[];
|
|
495
|
+
}>;
|
|
431
496
|
gcOrphans: (ctx: GcCtx, graceMs: number) => Promise<{
|
|
432
497
|
deleted: number;
|
|
433
498
|
}>;
|
|
@@ -476,4 +541,4 @@ interface EngineOptions {
|
|
|
476
541
|
executor: QueryExecutor;
|
|
477
542
|
now?: () => number;
|
|
478
543
|
}
|
|
479
|
-
export { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, enumeratePartitions };
|
|
544
|
+
export { CodecCtx, CompactionThresholds, CompactionTier, DataSource, EngineOptions, FileSetRef, GcCtx, type Grain$1 as Grain, ListLiveFilter, LockScope, ManifestEntry, ManifestPurgeResult, ManifestStore, ParquetCodec, PurgeFilter, PurgeResult, PurgeUrlsResult, QueryCtx, QueryExecuteOptions, QueryExecuteResult, QueryExecutor, QueryResult, type Row$1 as Row, RunSQLOptions, type SearchType$1 as SearchType, StorageEngine, SyncState, SyncStateDetail, SyncStateFilter, SyncStateKind, SyncStateScope, type TableName$1 as TableName, type TenantCtx$1 as TenantCtx, Watermark, WatermarkFilter, WatermarkScope, WriteCtx, WriteResult, dedupeOverlappingTiers, enumeratePartitions, splitOverlappingTiers };
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { dedupeOverlappingTiers, splitOverlappingTiers } from "./_chunks/storage.mjs";
|
|
2
|
+
/**
|
|
3
|
+
* Host-policy predicate: true once a table's live raw-daily count crosses the
|
|
4
|
+
* engine's daily→weekly compaction gate. Wraps the internal threshold so hosts
|
|
5
|
+
* decide "is compaction due?" without importing the constant or the counter.
|
|
6
|
+
*
|
|
7
|
+
* Pure — pass the entries the host already fetched (typically via its own cached
|
|
8
|
+
* manifest store, so the hot-path check stays on the host's cache rather than
|
|
9
|
+
* forcing an uncached read through the engine).
|
|
10
|
+
*/
|
|
11
|
+
declare function isRawDailyCompactionDue(entries: ReadonlyArray<{
|
|
12
|
+
tier?: string | null;
|
|
13
|
+
partition: string;
|
|
14
|
+
}>): boolean;
|
|
15
|
+
export { dedupeOverlappingTiers, isRawDailyCompactionDue, splitOverlappingTiers };
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { countRawDailies, dedupeOverlappingTiers, splitOverlappingTiers } from "./_chunks/compaction.mjs";
|
|
2
|
+
/**
 * Report whether the given entries contain more raw dailies than the gate.
 *
 * @param entries Entries with `partition` and optional `tier`, as accepted by
 *                countRawDailies.
 * @returns true when countRawDailies(entries) is strictly greater than 7.
 */
function isRawDailyCompactionDue(entries) {
	const rawDailyGate = 7;
	const liveRawDailies = countRawDailies(entries);
	return liveRawDailies > rawDailyGate;
}
|
|
5
|
+
export { dedupeOverlappingTiers, isRawDailyCompactionDue, splitOverlappingTiers };
|
package/dist/index.mjs
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { ENGINE_QUERY_CAPABILITIES, coerceRow, coerceRows, createSqlQuerySource } from "./_chunks/source.mjs";
|
|
2
2
|
import { DEFAULT_SEARCH_TYPE, dayPartition, hourPartition, inferLegacyTier, inferSearchType, objectKey } from "./_chunks/layout.mjs";
|
|
3
3
|
import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, page_queries, pages, queries } from "./_chunks/schema.mjs";
|
|
4
|
-
import {
|
|
4
|
+
import { enumeratePartitions } from "./_chunks/compaction.mjs";
|
|
5
|
+
import { FILES_PLACEHOLDER, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
|
|
5
6
|
import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
6
7
|
import { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "./_chunks/engine.mjs";
|
|
7
8
|
import { assembleDatesRow, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
package/dist/planner.mjs
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { enumeratePartitions } from "./_chunks/compaction.mjs";
|
|
2
|
+
import { FILES_PLACEHOLDER, compileLogicalQueryPlan, resolveParquetSQL, substituteNamedFiles } from "./_chunks/parquet-plan.mjs";
|
|
2
3
|
export { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveParquetSQL, substituteNamedFiles };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.25.0",
|
|
4
|
+
"version": "0.25.1",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -41,6 +41,11 @@
|
|
|
41
41
|
"import": "./dist/planner.mjs",
|
|
42
42
|
"default": "./dist/planner.mjs"
|
|
43
43
|
},
|
|
44
|
+
"./compaction": {
|
|
45
|
+
"types": "./dist/compaction-public.d.mts",
|
|
46
|
+
"import": "./dist/compaction-public.mjs",
|
|
47
|
+
"default": "./dist/compaction-public.mjs"
|
|
48
|
+
},
|
|
44
49
|
"./schema": {
|
|
45
50
|
"types": "./dist/schema.d.mts",
|
|
46
51
|
"import": "./dist/schema.mjs",
|
|
@@ -180,8 +185,8 @@
|
|
|
180
185
|
"drizzle-orm": "1.0.0-rc.3",
|
|
181
186
|
"icebird": "^0.8.6",
|
|
182
187
|
"proper-lockfile": "^4.1.2",
|
|
183
|
-
"gscdump": "0.25.
|
|
184
|
-
"
|
|
188
|
+
"@gscdump/contracts": "0.25.1",
|
|
189
|
+
"gscdump": "0.25.1"
|
|
185
190
|
},
|
|
186
191
|
"devDependencies": {
|
|
187
192
|
"@duckdb/duckdb-wasm": "^1.32.0",
|