@gscdump/engine 0.24.1 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/compaction.mjs +247 -0
- package/dist/_chunks/engine.mjs +22 -4
- package/dist/_chunks/parquet-plan.mjs +3 -248
- package/dist/_chunks/resolver.mjs +3 -3
- package/dist/_chunks/{iceberg-schema.mjs → schema2.mjs} +9 -2
- package/dist/_chunks/sink.d.mts +11 -1
- package/dist/_chunks/source.mjs +1 -1
- package/dist/_chunks/storage.d.mts +24 -33
- package/dist/adapters/filesystem.mjs +1 -1
- package/dist/adapters/node.mjs +1 -1
- package/dist/adapters/r2-manifest.mjs +1 -1
- package/dist/compaction-public.d.mts +15 -0
- package/dist/compaction-public.mjs +5 -0
- package/dist/iceberg/index.d.mts +12 -0
- package/dist/iceberg/index.mjs +269 -0
- package/dist/index.d.mts +30 -29
- package/dist/index.mjs +5 -272
- package/dist/planner.mjs +2 -1
- package/dist/rollups.mjs +1 -1
- package/dist/sink-node.d.mts +1 -1
- package/dist/sink-node.mjs +1 -1
- package/package.json +13 -8
- package/dist/_chunks/{storage.mjs → layout.mjs} +11 -11
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import { dayPartition, inferSearchType, mondayOfWeek, monthPartition, objectKey, quarterOfMonth, quarterPartition, weekPartition } from "./layout.mjs";
|
|
2
|
+
import { currentSchemaVersion } from "./schema.mjs";
|
|
3
|
+
import { MS_PER_DAY } from "gscdump";
|
|
4
|
+
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
|
|
5
|
+
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
|
|
6
|
+
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
|
|
7
|
+
const QUARTERLY_PARTITION_RE = /^quarterly\/(\d{4})-Q([1-4])$/;
|
|
8
|
+
/**
 * Default compaction cutoffs in days, keyed by the tier being compacted:
 * `raw` feeds the stage that writes `d7`, `d7` the stage that writes `d30`,
 * and `d30` the stage that writes `d90`. `compactTieredImpl` merges caller
 * overrides on top of these, and `runStage` never uses a value smaller than
 * `PENDING_WINDOW_DAYS`.
 */
const DEFAULT_THRESHOLDS = {
|
|
9
|
+
raw: 7,
|
|
10
|
+
d7: 30,
|
|
11
|
+
d30: 90
|
|
12
|
+
};
|
|
13
|
+
function countRawDailies(entries) {
|
|
14
|
+
return entries.filter((e) => e.tier === "raw" || e.tier == null && e.partition.startsWith("daily/")).length;
|
|
15
|
+
}
|
|
16
|
+
/**
 * Lower bound, in days, applied to every stage's cutoff in `runStage`
 * (`Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS)`): buckets whose latest
 * instant is within this many days of `now` are never compacted.
 * NOTE(review): presumably the window in which upstream data may still change — confirm.
 */
const PENDING_WINDOW_DAYS = 4;
|
|
17
|
+
/**
 * Ordered compaction stages; `runStage` consumes one descriptor at a time.
 *
 * - `inputTier` / `outputTier`: manifest tier read from and written to.
 * - `cutoffDays`: default minimum age in days; `compactTieredImpl` replaces it
 *   with the effective threshold before the stage runs.
 * - `bucketKey(entry)`: bucket id for an entry, or `undefined` when the
 *   entry's partition name does not have the stage's input shape.
 * - `bucketLatestMs(bucket)`: epoch ms that `runStage` compares to the cutoff.
 * - `outputPartition(bucket)`: partition name the bucket is rewritten into.
 */
const STAGES = [
  {
    // Daily files are grouped by the value `mondayOfWeek` returns for their date.
    inputTier: "raw",
    outputTier: "d7",
    cutoffDays: DEFAULT_THRESHOLDS.raw,
    bucketKey: (entry) => {
      const hit = entry.partition.match(DAILY_PARTITION_RE);
      return hit ? mondayOfWeek(hit[1]) : void 0;
    },
    // UTC midnight of the bucket date plus six days.
    bucketLatestMs: (monday) => {
      const weekStartMs = Date.parse(`${monday}T00:00:00Z`);
      return weekStartMs + 6 * MS_PER_DAY;
    },
    outputPartition: weekPartition
  },
  {
    // Weekly files are grouped by the `YYYY-MM` prefix of their partition date.
    // NOTE(review): a week that straddles a month boundary lands wholly in the
    // month of its partition date — confirm this is intended.
    inputTier: "d7",
    outputTier: "d30",
    cutoffDays: DEFAULT_THRESHOLDS.d7,
    bucketKey: (entry) => {
      const hit = entry.partition.match(WEEKLY_PARTITION_RE);
      return hit ? hit[1].slice(0, 7) : void 0;
    },
    bucketLatestMs: monthEndMs,
    outputPartition: monthPartition
  },
  {
    // Monthly files are grouped by the value `quarterOfMonth` returns.
    inputTier: "d30",
    outputTier: "d90",
    cutoffDays: DEFAULT_THRESHOLDS.d30,
    bucketKey: (entry) => {
      const hit = entry.partition.match(MONTHLY_PARTITION_RE);
      return hit ? quarterOfMonth(hit[1]) : void 0;
    },
    bucketLatestMs: quarterEndMs,
    outputPartition: quarterPartition
  }
];
|
|
55
|
+
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
|
|
56
|
+
const thresholds = {
|
|
57
|
+
...DEFAULT_THRESHOLDS,
|
|
58
|
+
...overrides
|
|
59
|
+
};
|
|
60
|
+
const stagesWithThresholds = STAGES.map((s) => ({
|
|
61
|
+
...s,
|
|
62
|
+
cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
|
|
63
|
+
}));
|
|
64
|
+
for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
|
|
65
|
+
}
|
|
66
|
+
/**
 * Execute one compaction stage.
 *
 * Lists the live manifest entries of `stage.inputTier`, groups them by
 * search type and `stage.bucketKey`, and for every group whose
 * `stage.bucketLatestMs` lies before the cutoff rewrites the group's objects
 * into a single object at `stage.outputPartition(bucket)`. The new manifest
 * entry and the source entries are then handed to
 * `manifestStore.registerVersion`.
 *
 * @param deps Provides `manifestStore`, `codec` and `dataSource`.
 * @param ctx Supplies `userId`, `siteId` and `table`.
 * @param stage One descriptor from `STAGES` with its effective `cutoffDays`.
 * @param now Reference time in epoch ms; also used as the entry's `createdAt`.
 */
async function runStage(deps, ctx, stage, now) {
  const holdDays = Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS);
  const cutoff = now - holdDays * MS_PER_DAY;
  const tenantFields = () => ({
    userId: ctx.userId,
    siteId: ctx.siteId,
    table: ctx.table
  });
  const live = await deps.manifestStore.listLive({
    ...tenantFields(),
    tier: stage.inputTier
  });

  // Group eligible entries under "<searchType>\0<bucket>".
  const grouped = /* @__PURE__ */ new Map();
  for (const entry of live) {
    if (entry.partition.startsWith("hourly/")) continue;
    const bucket = stage.bucketKey(entry);
    if (!bucket) continue;
    // Buckets that reach into the cutoff window are left alone.
    if (stage.bucketLatestMs(bucket) >= cutoff) continue;
    const compositeKey = `${inferSearchType(entry)}\0${bucket}`;
    let members = grouped.get(compositeKey);
    if (!members) {
      members = [];
      grouped.set(compositeKey, members);
    }
    members.push(entry);
  }

  for (const [compositeKey, members] of grouped) {
    const parts = compositeKey.split("\0");
    const searchType = parts[0];
    const bucket = parts[1];
    const targetPartition = stage.outputPartition(bucket);
    // A lone entry that already sits at the target partition needs no rewrite.
    const alreadyInPlace = members.length === 1 && members[0].partition === targetPartition;
    if (alreadyInPlace) continue;
    await deps.manifestStore.withLock({
      ...tenantFields(),
      partition: targetPartition
    }, async () => {
      const outputKey = objectKey(ctx, ctx.table, targetPartition, now, searchType);
      const { bytes, rowCount } = await deps.codec.compactRows({ table: ctx.table }, members.map((member) => member.objectKey), outputKey, deps.dataSource);
      const newEntry = {
        ...tenantFields(),
        partition: targetPartition,
        objectKey: outputKey,
        rowCount,
        bytes,
        createdAt: now,
        schemaVersion: currentSchemaVersion(ctx.table),
        tier: stage.outputTier
      };
      // "web" is left implicit; any other search type is recorded on the entry.
      if (searchType !== "web") newEntry.searchType = searchType;
      await deps.manifestStore.registerVersion(newEntry, members);
    });
  }
}
|
|
113
|
+
/**
 * List the partition names touched by an inclusive date range.
 *
 * Walks the range one UTC day at a time. Every day contributes its daily
 * partition; the weekly, monthly and quarterly partitions are appended the
 * first time their week, month or quarter is encountered.
 *
 * @param startDate First day as a `-`-separated year-month-day string.
 * @param endDate Last day (inclusive), same format.
 * @returns Partition names in encounter order; empty when `endDate` precedes
 *   `startDate`.
 */
function enumeratePartitions(startDate, endDate) {
  const partitions = [];
  const utcMidnight = (isoDay) => {
    const [year, month, day] = isoDay.split("-").map(Number);
    return Date.UTC(year, month - 1, day);
  };
  const first = utcMidnight(startDate);
  const last = utcMidnight(endDate);
  if (last < first) return partitions;

  const pad2 = (value) => String(value).padStart(2, "0");
  const seenWeeks = /* @__PURE__ */ new Set();
  const seenMonths = /* @__PURE__ */ new Set();
  const seenQuarters = /* @__PURE__ */ new Set();
  // Append `toPartition(key)` only the first time `key` shows up in `seen`.
  const pushOnce = (seen, key, toPartition) => {
    if (seen.has(key)) return;
    seen.add(key);
    partitions.push(toPartition(key));
  };

  for (let cursor = first; cursor <= last; cursor += 864e5) {
    const date = new Date(cursor);
    const isoMonth = `${date.getUTCFullYear()}-${pad2(date.getUTCMonth() + 1)}`;
    const isoDay = `${isoMonth}-${pad2(date.getUTCDate())}`;
    partitions.push(dayPartition(isoDay));
    pushOnce(seenWeeks, mondayOfWeek(isoDay), weekPartition);
    pushOnce(seenMonths, isoMonth, monthPartition);
    pushOnce(seenQuarters, quarterOfMonth(isoMonth), quarterPartition);
  }
  return partitions;
}
|
|
147
|
+
/**
 * Parse an ISO calendar day to its UTC-midnight epoch ms.
 * Returns `undefined` when the text is not a real calendar date: either the
 * parse yields NaN, or the engine rolled an out-of-range day forward so the
 * result no longer round-trips to the same text.
 */
function utcDayStartMs(isoDay) {
  const ms = Date.parse(`${isoDay}T00:00:00Z`);
  if (Number.isNaN(ms)) return void 0;
  return new Date(ms).toISOString().slice(0, 10) === isoDay ? ms : void 0;
}

/**
 * Describe the days a partition name covers.
 *
 * @param partition Partition name (`daily/…`, `weekly/…`, `monthly/…`,
 *   `quarterly/…`).
 * @returns `{ rank, startMs, endMs }`, where `rank` is 0 (daily) to
 *   3 (quarterly) and `startMs`/`endMs` are the UTC midnights of the first
 *   and last covered day. Returns `undefined` for names that match no known
 *   shape or whose date component is not a valid calendar value; such names
 *   would otherwise produce a NaN or rolled-over span that callers could
 *   mistake for real coverage.
 */
function partitionSpan(partition) {
  const daily = partition.match(DAILY_PARTITION_RE);
  if (daily) {
    const dayMs = utcDayStartMs(daily[1]);
    if (dayMs === void 0) return void 0;
    return { rank: 0, startMs: dayMs, endMs: dayMs };
  }

  const weekly = partition.match(WEEKLY_PARTITION_RE);
  if (weekly) {
    const weekStartMs = utcDayStartMs(weekly[1]);
    if (weekStartMs === void 0) return void 0;
    return { rank: 1, startMs: weekStartMs, endMs: weekStartMs + 6 * MS_PER_DAY };
  }

  const monthly = partition.match(MONTHLY_PARTITION_RE);
  if (monthly) {
    const [year, month] = monthly[1].split("-").map(Number);
    // Date.UTC silently rolls month 00 or 13+ into a neighbouring year.
    if (month < 1 || month > 12) return void 0;
    return {
      rank: 2,
      startMs: Date.UTC(year, month - 1, 1),
      // Day 0 of the next month index is the last day of this month.
      endMs: Date.UTC(year, month, 0)
    };
  }

  const quarterly = partition.match(QUARTERLY_PARTITION_RE);
  if (quarterly) {
    const year = Number(quarterly[1]);
    const quarter = Number(quarterly[2]);
    return {
      rank: 3,
      startMs: Date.UTC(year, (quarter - 1) * 3, 1),
      endMs: Date.UTC(year, quarter * 3, 0)
    };
  }
  return void 0;
}
|
|
186
|
+
/**
 * Partition manifest entries into those to read (`kept`) and those whose
 * days are already provided by other entries (`subsumed`).
 *
 * Entries are considered in order of `partitionSpan` rank (daily first,
 * quarterly last) and, within a rank, newest `createdAt` first. Coverage is
 * tracked separately per `inferSearchType(entry)`. An entry is subsumed when
 * every one of its days is already covered by previously kept entries of the
 * same search type, or when `queryRange` is given and none of its days fall
 * inside it. Entries whose partition `partitionSpan` does not recognise are
 * always kept.
 *
 * NOTE(review): without `queryRange`, an entry whose span yields no days is
 * classified as subsumed (the "every day covered" check is vacuously true).
 *
 * @param entries Manifest entries to split.
 * @param queryRange Optional `{ start, end }` ISO days (inclusive); only days
 *   inside the range count towards coverage.
 * @returns `{ kept, subsumed }`.
 */
function splitOverlappingTiers(entries, queryRange) {
  const midnightMs = (isoDay) => Date.parse(`${isoDay}T00:00:00Z`);
  const rangeStartMs = queryRange ? midnightMs(queryRange.start) : void 0;
  const rangeEndMs = queryRange ? midnightMs(queryRange.end) : void 0;
  const outsideRange = (t) => rangeStartMs !== void 0 && (t < rangeStartMs || t > rangeEndMs);

  const kept = [];
  const subsumed = [];
  const ranked = [];
  for (const entry of entries) {
    const span = partitionSpan(entry.partition);
    if (!span) {
      kept.push(entry);
      continue;
    }
    // Collect the span's days (UTC-midnight ms) that fall inside the query range.
    const days = [];
    let t = span.startMs;
    while (t <= span.endMs) {
      if (!outsideRange(t)) days.push(t);
      t += MS_PER_DAY;
    }
    if (queryRange && days.length === 0) subsumed.push(entry);
    else ranked.push({
      entry,
      rank: span.rank,
      days
    });
  }

  // Finest granularity first; newest first within the same granularity.
  ranked.sort((left, right) => {
    const rankOrder = left.rank - right.rank;
    if (rankOrder) return rankOrder;
    return right.entry.createdAt - left.entry.createdAt;
  });

  const coveredBySearchType = /* @__PURE__ */ new Map();
  for (const { entry, days } of ranked) {
    const searchType = inferSearchType(entry);
    if (!coveredBySearchType.has(searchType)) coveredBySearchType.set(searchType, /* @__PURE__ */ new Set());
    const covered = coveredBySearchType.get(searchType);
    const addsCoverage = days.some((day) => !covered.has(day));
    if (!addsCoverage) {
      subsumed.push(entry);
      continue;
    }
    kept.push(entry);
    days.forEach((day) => {
      covered.add(day);
    });
  }
  return { kept, subsumed };
}
|
|
234
|
+
/**
 * Return only the entries `splitOverlappingTiers` decides to keep.
 *
 * @param entries Manifest entries to filter.
 * @param queryRange Optional `{ start, end }` range forwarded unchanged.
 * @returns The `kept` list from `splitOverlappingTiers`.
 */
function dedupeOverlappingTiers(entries, queryRange) {
  const { kept } = splitOverlappingTiers(entries, queryRange);
  return kept;
}
|
|
237
|
+
/**
 * Epoch ms of the last millisecond (23:59:59.999 UTC) of a month.
 *
 * @param month Month as a `YYYY-MM` string (1-based month number).
 * @returns UTC timestamp in milliseconds.
 */
function monthEndMs(month) {
  const parts = month.split("-");
  const year = Number(parts[0]);
  const monthNumber = Number(parts[1]);
  // The 1-based month number doubles as the 0-based index of the NEXT month;
  // day 0 of that month is the last day of the requested one.
  return Date.UTC(year, monthNumber, 0, 23, 59, 59, 999);
}
|
|
241
|
+
/**
 * Epoch ms of the last millisecond (23:59:59.999 UTC) of a quarter.
 *
 * @param quarter Quarter as a `YYYY-Qn` string.
 * @returns UTC timestamp in milliseconds.
 */
function quarterEndMs(quarter) {
  const [yearText, quarterText] = quarter.split("-Q");
  // Quarter n ends with month 3n; used as a 0-based index that is the first
  // month of the following quarter, and day 0 steps back one day from it.
  const nextQuarterFirstMonth = Number(quarterText) * 3;
  return Date.UTC(Number(yearText), nextQuarterFirstMonth, 0, 23, 59, 59, 999);
}
|
|
247
|
+
export { compactTieredImpl, countRawDailies, dedupeOverlappingTiers, enumeratePartitions, splitOverlappingTiers };
|
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
|
|
1
2
|
import { SCHEMAS, currentSchemaVersion, dedupeByNaturalKey } from "./schema.mjs";
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
3
|
+
import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
|
|
4
|
+
import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
|
|
4
5
|
import { sqlEscape } from "../sql-bind.mjs";
|
|
5
6
|
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
6
|
-
import { normalizeUrl } from "gscdump
|
|
7
|
+
import { normalizeUrl } from "gscdump";
|
|
7
8
|
async function encodeBytes(db, table, rows) {
|
|
8
9
|
const inName = db.makeTempPath("json");
|
|
9
10
|
const outName = db.makeTempPath("parquet");
|
|
@@ -485,6 +486,22 @@ function createStorageEngine(opts) {
|
|
|
485
486
|
codec
|
|
486
487
|
}, ctx, (ctx.now ?? defaultNow)(), thresholds);
|
|
487
488
|
}
|
|
489
|
+
/**
 * Find live manifest entries that `splitOverlappingTiers` reports as
 * subsumed for one table and pass them to `manifestStore.registerVersions`
 * as the superseded set, with no new entries.
 * NOTE(review): presumably this retires them from the live set — confirm
 * against the manifest store contract.
 *
 * @param ctx Supplies `userId`, `siteId` and `table`.
 * @returns `{ retired, partitions }`: the number of subsumed entries and
 *   their partition names (0 and `[]` when there are none).
 */
async function reconcileSubsumed(ctx) {
  const live = await manifestStore.listLive({
    userId: ctx.userId,
    siteId: ctx.siteId,
    table: ctx.table
  });
  const { subsumed } = splitOverlappingTiers(live);
  // Only touch the manifest when there is something to hand over.
  if (subsumed.length > 0) await manifestStore.registerVersions([], subsumed);
  return {
    retired: subsumed.length,
    partitions: subsumed.map((entry) => entry.partition)
  };
}
|
|
488
505
|
async function gcOrphans(ctx, graceMs) {
|
|
489
506
|
return gcOrphansImpl({
|
|
490
507
|
dataSource,
|
|
@@ -590,6 +607,7 @@ function createStorageEngine(opts) {
|
|
|
590
607
|
query,
|
|
591
608
|
runSQL,
|
|
592
609
|
compactTiered,
|
|
610
|
+
reconcileSubsumed,
|
|
593
611
|
gcOrphans,
|
|
594
612
|
purgeTenant,
|
|
595
613
|
purgeUrls,
|
|
@@ -601,4 +619,4 @@ function createStorageEngine(opts) {
|
|
|
601
619
|
readObject: (key) => dataSource.read(key)
|
|
602
620
|
};
|
|
603
621
|
}
|
|
604
|
-
export { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine
|
|
622
|
+
export { MAX_DAY_BYTES, canonicalEmptyParquetSchema, createDuckDBCodec, createDuckDBExecutor, createStorageEngine };
|
|
@@ -1,252 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { dimensionToColumn } from "./schema.mjs";
|
|
2
|
+
import { enumeratePartitions } from "./compaction.mjs";
|
|
3
3
|
import { METRIC_EXPR, escapeLike, topLevelPagePredicateSql } from "../sql-fragments.mjs";
|
|
4
|
-
import { MS_PER_DAY } from "gscdump";
|
|
5
4
|
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
6
|
-
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
|
|
7
|
-
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
|
|
8
|
-
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
|
|
9
|
-
const QUARTERLY_PARTITION_RE = /^quarterly\/(\d{4})-Q([1-4])$/;
|
|
10
|
-
const DEFAULT_THRESHOLDS = {
|
|
11
|
-
raw: 7,
|
|
12
|
-
d7: 30,
|
|
13
|
-
d30: 90
|
|
14
|
-
};
|
|
15
|
-
const RAW_DAILY_COMPACT_THRESHOLD = 7;
|
|
16
|
-
function countRawDailies(entries) {
|
|
17
|
-
return entries.filter((e) => e.tier === "raw" || e.tier == null && e.partition.startsWith("daily/")).length;
|
|
18
|
-
}
|
|
19
|
-
const PENDING_WINDOW_DAYS = 4;
|
|
20
|
-
const STAGES = [
|
|
21
|
-
{
|
|
22
|
-
inputTier: "raw",
|
|
23
|
-
outputTier: "d7",
|
|
24
|
-
cutoffDays: DEFAULT_THRESHOLDS.raw,
|
|
25
|
-
bucketKey: (e) => {
|
|
26
|
-
const m = e.partition.match(DAILY_PARTITION_RE);
|
|
27
|
-
if (!m) return void 0;
|
|
28
|
-
return mondayOfWeek(m[1]);
|
|
29
|
-
},
|
|
30
|
-
bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
|
|
31
|
-
outputPartition: weekPartition
|
|
32
|
-
},
|
|
33
|
-
{
|
|
34
|
-
inputTier: "d7",
|
|
35
|
-
outputTier: "d30",
|
|
36
|
-
cutoffDays: DEFAULT_THRESHOLDS.d7,
|
|
37
|
-
bucketKey: (e) => {
|
|
38
|
-
const m = e.partition.match(WEEKLY_PARTITION_RE);
|
|
39
|
-
if (!m) return void 0;
|
|
40
|
-
return m[1].slice(0, 7);
|
|
41
|
-
},
|
|
42
|
-
bucketLatestMs: monthEndMs,
|
|
43
|
-
outputPartition: monthPartition
|
|
44
|
-
},
|
|
45
|
-
{
|
|
46
|
-
inputTier: "d30",
|
|
47
|
-
outputTier: "d90",
|
|
48
|
-
cutoffDays: DEFAULT_THRESHOLDS.d30,
|
|
49
|
-
bucketKey: (e) => {
|
|
50
|
-
const m = e.partition.match(MONTHLY_PARTITION_RE);
|
|
51
|
-
if (!m) return void 0;
|
|
52
|
-
return quarterOfMonth(m[1]);
|
|
53
|
-
},
|
|
54
|
-
bucketLatestMs: quarterEndMs,
|
|
55
|
-
outputPartition: quarterPartition
|
|
56
|
-
}
|
|
57
|
-
];
|
|
58
|
-
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
|
|
59
|
-
const thresholds = {
|
|
60
|
-
...DEFAULT_THRESHOLDS,
|
|
61
|
-
...overrides
|
|
62
|
-
};
|
|
63
|
-
const stagesWithThresholds = STAGES.map((s) => ({
|
|
64
|
-
...s,
|
|
65
|
-
cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
|
|
66
|
-
}));
|
|
67
|
-
for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
|
|
68
|
-
}
|
|
69
|
-
async function runStage(deps, ctx, stage, now) {
|
|
70
|
-
const cutoff = now - Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS) * MS_PER_DAY;
|
|
71
|
-
const candidates = await deps.manifestStore.listLive({
|
|
72
|
-
userId: ctx.userId,
|
|
73
|
-
siteId: ctx.siteId,
|
|
74
|
-
table: ctx.table,
|
|
75
|
-
tier: stage.inputTier
|
|
76
|
-
});
|
|
77
|
-
const buckets = /* @__PURE__ */ new Map();
|
|
78
|
-
for (const entry of candidates) {
|
|
79
|
-
if (entry.partition.startsWith("hourly/")) continue;
|
|
80
|
-
const key = stage.bucketKey(entry);
|
|
81
|
-
if (!key) continue;
|
|
82
|
-
if (stage.bucketLatestMs(key) >= cutoff) continue;
|
|
83
|
-
const compositeKey = `${inferSearchType(entry)}\0${key}`;
|
|
84
|
-
if (!buckets.has(compositeKey)) buckets.set(compositeKey, []);
|
|
85
|
-
buckets.get(compositeKey).push(entry);
|
|
86
|
-
}
|
|
87
|
-
for (const [compositeKey, entries] of buckets) {
|
|
88
|
-
const [searchType, bucket] = compositeKey.split("\0");
|
|
89
|
-
const targetPartition = stage.outputPartition(bucket);
|
|
90
|
-
if (entries.length === 1 && entries[0].partition === targetPartition) continue;
|
|
91
|
-
await deps.manifestStore.withLock({
|
|
92
|
-
userId: ctx.userId,
|
|
93
|
-
siteId: ctx.siteId,
|
|
94
|
-
table: ctx.table,
|
|
95
|
-
partition: targetPartition
|
|
96
|
-
}, async () => {
|
|
97
|
-
const key = objectKey(ctx, ctx.table, targetPartition, now, searchType);
|
|
98
|
-
const { bytes, rowCount } = await deps.codec.compactRows({ table: ctx.table }, entries.map((e) => e.objectKey), key, deps.dataSource);
|
|
99
|
-
const newEntry = {
|
|
100
|
-
userId: ctx.userId,
|
|
101
|
-
siteId: ctx.siteId,
|
|
102
|
-
table: ctx.table,
|
|
103
|
-
partition: targetPartition,
|
|
104
|
-
objectKey: key,
|
|
105
|
-
rowCount,
|
|
106
|
-
bytes,
|
|
107
|
-
createdAt: now,
|
|
108
|
-
schemaVersion: currentSchemaVersion(ctx.table),
|
|
109
|
-
tier: stage.outputTier,
|
|
110
|
-
...searchType !== "web" ? { searchType } : {}
|
|
111
|
-
};
|
|
112
|
-
await deps.manifestStore.registerVersion(newEntry, entries);
|
|
113
|
-
});
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
function enumeratePartitions(startDate, endDate) {
|
|
117
|
-
const out = [];
|
|
118
|
-
const [sy, sm, sd] = startDate.split("-").map(Number);
|
|
119
|
-
const [ey, em, ed] = endDate.split("-").map(Number);
|
|
120
|
-
const start = Date.UTC(sy, sm - 1, sd);
|
|
121
|
-
const end = Date.UTC(ey, em - 1, ed);
|
|
122
|
-
if (end < start) return out;
|
|
123
|
-
const seenWeeks = /* @__PURE__ */ new Set();
|
|
124
|
-
const seenMonths = /* @__PURE__ */ new Set();
|
|
125
|
-
const seenQuarters = /* @__PURE__ */ new Set();
|
|
126
|
-
for (let t = start; t <= end; t += 864e5) {
|
|
127
|
-
const d = new Date(t);
|
|
128
|
-
const y = d.getUTCFullYear();
|
|
129
|
-
const m = String(d.getUTCMonth() + 1).padStart(2, "0");
|
|
130
|
-
const isoDay = `${y}-${m}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
131
|
-
const isoMonth = `${y}-${m}`;
|
|
132
|
-
out.push(dayPartition(isoDay));
|
|
133
|
-
const monday = mondayOfWeek(isoDay);
|
|
134
|
-
if (!seenWeeks.has(monday)) {
|
|
135
|
-
seenWeeks.add(monday);
|
|
136
|
-
out.push(weekPartition(monday));
|
|
137
|
-
}
|
|
138
|
-
if (!seenMonths.has(isoMonth)) {
|
|
139
|
-
seenMonths.add(isoMonth);
|
|
140
|
-
out.push(monthPartition(isoMonth));
|
|
141
|
-
}
|
|
142
|
-
const quarter = quarterOfMonth(isoMonth);
|
|
143
|
-
if (!seenQuarters.has(quarter)) {
|
|
144
|
-
seenQuarters.add(quarter);
|
|
145
|
-
out.push(quarterPartition(quarter));
|
|
146
|
-
}
|
|
147
|
-
}
|
|
148
|
-
return out;
|
|
149
|
-
}
|
|
150
|
-
function partitionSpan(partition) {
|
|
151
|
-
let m = partition.match(DAILY_PARTITION_RE);
|
|
152
|
-
if (m) {
|
|
153
|
-
const ms = Date.parse(`${m[1]}T00:00:00Z`);
|
|
154
|
-
return {
|
|
155
|
-
rank: 0,
|
|
156
|
-
startMs: ms,
|
|
157
|
-
endMs: ms
|
|
158
|
-
};
|
|
159
|
-
}
|
|
160
|
-
m = partition.match(WEEKLY_PARTITION_RE);
|
|
161
|
-
if (m) {
|
|
162
|
-
const ms = Date.parse(`${m[1]}T00:00:00Z`);
|
|
163
|
-
return {
|
|
164
|
-
rank: 1,
|
|
165
|
-
startMs: ms,
|
|
166
|
-
endMs: ms + 6 * MS_PER_DAY
|
|
167
|
-
};
|
|
168
|
-
}
|
|
169
|
-
m = partition.match(MONTHLY_PARTITION_RE);
|
|
170
|
-
if (m) {
|
|
171
|
-
const [y, mo] = m[1].split("-").map(Number);
|
|
172
|
-
return {
|
|
173
|
-
rank: 2,
|
|
174
|
-
startMs: Date.UTC(y, mo - 1, 1),
|
|
175
|
-
endMs: Date.UTC(y, mo, 0)
|
|
176
|
-
};
|
|
177
|
-
}
|
|
178
|
-
m = partition.match(QUARTERLY_PARTITION_RE);
|
|
179
|
-
if (m) {
|
|
180
|
-
const y = Number(m[1]);
|
|
181
|
-
const q = Number(m[2]);
|
|
182
|
-
return {
|
|
183
|
-
rank: 3,
|
|
184
|
-
startMs: Date.UTC(y, (q - 1) * 3, 1),
|
|
185
|
-
endMs: Date.UTC(y, q * 3, 0)
|
|
186
|
-
};
|
|
187
|
-
}
|
|
188
|
-
}
|
|
189
|
-
function splitOverlappingTiers(entries, queryRange) {
|
|
190
|
-
const rangeStartMs = queryRange ? Date.parse(`${queryRange.start}T00:00:00Z`) : void 0;
|
|
191
|
-
const rangeEndMs = queryRange ? Date.parse(`${queryRange.end}T00:00:00Z`) : void 0;
|
|
192
|
-
const spanned = [];
|
|
193
|
-
const kept = [];
|
|
194
|
-
const subsumed = [];
|
|
195
|
-
for (const entry of entries) {
|
|
196
|
-
const span = partitionSpan(entry.partition);
|
|
197
|
-
if (!span) {
|
|
198
|
-
kept.push(entry);
|
|
199
|
-
continue;
|
|
200
|
-
}
|
|
201
|
-
const days = [];
|
|
202
|
-
for (let t = span.startMs; t <= span.endMs; t += MS_PER_DAY) {
|
|
203
|
-
if (rangeStartMs !== void 0 && (t < rangeStartMs || t > rangeEndMs)) continue;
|
|
204
|
-
days.push(t);
|
|
205
|
-
}
|
|
206
|
-
if (queryRange && days.length === 0) {
|
|
207
|
-
subsumed.push(entry);
|
|
208
|
-
continue;
|
|
209
|
-
}
|
|
210
|
-
spanned.push({
|
|
211
|
-
entry,
|
|
212
|
-
rank: span.rank,
|
|
213
|
-
days
|
|
214
|
-
});
|
|
215
|
-
}
|
|
216
|
-
spanned.sort((a, b) => a.rank - b.rank || b.entry.createdAt - a.entry.createdAt);
|
|
217
|
-
const coveredBySearchType = /* @__PURE__ */ new Map();
|
|
218
|
-
for (const { entry, days } of spanned) {
|
|
219
|
-
const slice = inferSearchType(entry);
|
|
220
|
-
let covered = coveredBySearchType.get(slice);
|
|
221
|
-
if (!covered) {
|
|
222
|
-
covered = /* @__PURE__ */ new Set();
|
|
223
|
-
coveredBySearchType.set(slice, covered);
|
|
224
|
-
}
|
|
225
|
-
if (days.every((d) => covered.has(d))) {
|
|
226
|
-
subsumed.push(entry);
|
|
227
|
-
continue;
|
|
228
|
-
}
|
|
229
|
-
kept.push(entry);
|
|
230
|
-
for (const d of days) covered.add(d);
|
|
231
|
-
}
|
|
232
|
-
return {
|
|
233
|
-
kept,
|
|
234
|
-
subsumed
|
|
235
|
-
};
|
|
236
|
-
}
|
|
237
|
-
function dedupeOverlappingTiers(entries, queryRange) {
|
|
238
|
-
return splitOverlappingTiers(entries, queryRange).kept;
|
|
239
|
-
}
|
|
240
|
-
function monthEndMs(month) {
|
|
241
|
-
const [y, m] = month.split("-").map(Number);
|
|
242
|
-
return Date.UTC(y, m, 0, 23, 59, 59, 999);
|
|
243
|
-
}
|
|
244
|
-
function quarterEndMs(quarter) {
|
|
245
|
-
const [yStr, qStr] = quarter.split("-Q");
|
|
246
|
-
const y = Number(yStr);
|
|
247
|
-
const q = Number(qStr);
|
|
248
|
-
return Date.UTC(y, q * 3, 0, 23, 59, 59, 999);
|
|
249
|
-
}
|
|
250
5
|
const FILES_PLACEHOLDER = "{{FILES}}";
|
|
251
6
|
function buildDimensionWhere(filters, table) {
|
|
252
7
|
const clauses = [];
|
|
@@ -381,4 +136,4 @@ function substituteNamedFiles(sql, sets) {
|
|
|
381
136
|
for (const [name, keys] of Object.entries(sets)) out = out.replace(new RegExp(`\\{\\{${name}\\}\\}`, "g"), fileList(keys));
|
|
382
137
|
return out;
|
|
383
138
|
}
|
|
384
|
-
export { FILES_PLACEHOLDER,
|
|
139
|
+
export { FILES_PLACEHOLDER, compileLogicalQueryPlan, resolveParquetSQL, substituteNamedFiles };
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import { SCHEMAS, drizzleSchema } from "./schema.mjs";
|
|
2
|
-
import { enumeratePartitions } from "./
|
|
2
|
+
import { enumeratePartitions } from "./compaction.mjs";
|
|
3
3
|
import { escapeLike } from "../sql-fragments.mjs";
|
|
4
4
|
import "../planner.mjs";
|
|
5
|
-
import { PgDialect, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
6
5
|
import { UnresolvableDatasetError, buildLogicalComparisonPlan, buildLogicalPlan, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
|
|
7
|
-
import { normalizeUrl } from "gscdump
|
|
6
|
+
import { normalizeUrl } from "gscdump";
|
|
7
|
+
import { PgDialect, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
8
8
|
import { sql } from "drizzle-orm";
|
|
9
9
|
const DIMENSION_SURFACES = {
|
|
10
10
|
page: ["api", "stored"],
|
|
@@ -68,5 +68,12 @@ function icebergTableSpec(table) {
|
|
|
68
68
|
};
|
|
69
69
|
}
|
|
70
70
|
const ICEBERG_SCHEMAS = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t)]));
|
|
71
|
-
new Set(ICEBERG_TABLES);
|
|
72
|
-
|
|
71
|
+
const ICEBERG_TABLE_SET = new Set(ICEBERG_TABLES);
|
|
72
|
+
/**
 * Report whether `table` is one of the names in `ICEBERG_TABLES`.
 *
 * @param table Table name to check.
 * @returns `true` when the name is a member of `ICEBERG_TABLE_SET`.
 */
function isIcebergTable(table) {
  const isKnown = ICEBERG_TABLE_SET.has(table);
  return isKnown;
}
|
|
75
|
+
/**
 * Return `table` unchanged when it is a known Iceberg table name; otherwise
 * throw.
 *
 * @param table Table name to validate.
 * @returns The same `table` value.
 * @throws {Error} When `isIcebergTable(table)` is false; the message lists
 *   the names in `ICEBERG_TABLES`.
 */
function assertIcebergTable(table) {
  if (isIcebergTable(table)) return table;
  throw new Error(`Unknown Iceberg table '${table}'. Expected one of: ${ICEBERG_TABLES.join(", ")}`);
}
|
|
79
|
+
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable };
|
package/dist/_chunks/sink.d.mts
CHANGED
|
@@ -94,6 +94,16 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
|
|
|
94
94
|
declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
|
|
95
95
|
/** All Iceberg table specs, keyed by table name. */
|
|
96
96
|
declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
|
|
97
|
+
/** True when `table` is one of the canonical {@link ICEBERG_TABLES}. */
|
|
98
|
+
declare function isIcebergTable(table: string): table is IcebergTableName;
|
|
99
|
+
/**
|
|
100
|
+
* Narrow an arbitrary table name to a canonical {@link IcebergTableName},
|
|
101
|
+
* throwing a clear error otherwise. Guards write paths that index
|
|
102
|
+
* `ICEBERG_SCHEMAS` (a `Record<IcebergTableName, …>`) — a non-canonical name
|
|
103
|
+
* silently yields `undefined` there, propagating a corrupt/empty spec into the
|
|
104
|
+
* Iceberg job instead of failing loudly.
|
|
105
|
+
*/
|
|
106
|
+
declare function assertIcebergTable(table: string): IcebergTableName;
|
|
97
107
|
/** icebird's lowercase Iceberg primitive types (subset we use). */
|
|
98
108
|
type IcebergPrimitiveType = 'string' | 'int' | 'long' | 'double' | 'date';
|
|
99
109
|
/** A field in an icebird table `Schema`. */
|
|
@@ -370,4 +380,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
|
|
|
370
380
|
/** S3-compatible warehouse location (POC: MinIO). */
|
|
371
381
|
warehouse: string;
|
|
372
382
|
}
|
|
373
|
-
export { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, listIcebergDataFiles, listIcebergTables };
|
|
383
|
+
export { CommitRetryOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
package/dist/_chunks/source.mjs
CHANGED