@gscdump/engine 0.28.2 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/duckdb.d.mts +1 -1
- package/dist/_chunks/engine.mjs +55 -7
- package/dist/_chunks/index.d.mts +6 -2
- package/dist/_chunks/libs/hyparquet-compressors.mjs +9 -9
- package/dist/_chunks/libs/icebird.mjs +6 -6
- package/dist/_chunks/schema.d.mts +16 -9
- package/dist/_chunks/schema.mjs +6 -3
- package/dist/_chunks/schema2.mjs +30 -3
- package/dist/_chunks/sink.d.mts +64 -10
- package/dist/_chunks/storage.d.mts +16 -0
- package/dist/_chunks/types.d.mts +7 -2
- package/dist/adapters/duckdb-node.mjs +18 -6
- package/dist/adapters/hyparquet.d.mts +10 -2
- package/dist/adapters/hyparquet.mjs +132 -42
- package/dist/adapters/node.mjs +1 -1
- package/dist/errors.mjs +1 -1
- package/dist/iceberg/index.d.mts +2 -2
- package/dist/iceberg/index.mjs +47 -29
- package/dist/ingest.mjs +5 -3
- package/dist/rollups.mjs +1 -1
- package/dist/schema.d.mts +2 -2
- package/dist/schema.mjs +2 -2
- package/dist/sql-fragments.d.mts +24 -1
- package/dist/sql-fragments.mjs +6 -1
- package/package.json +7 -7
|
@@ -19,7 +19,7 @@ declare function createDuckDBExecutor(factory: DuckDBFactory): QueryExecutor;
|
|
|
19
19
|
/**
|
|
20
20
|
* Canonical "empty-file" SELECT clause for a table. Codecs that need to
|
|
21
21
|
* emit a schema-correct empty Parquet can wrap this in:
|
|
22
|
-
* `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET)`
|
|
22
|
+
* `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET, COMPRESSION ZSTD)`
|
|
23
23
|
* to satisfy the ParquetCodec empty-rows invariant.
|
|
24
24
|
*/
|
|
25
25
|
declare function canonicalEmptyParquetSchema(table: TableName): string;
|
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
|
|
2
|
-
import { SCHEMAS, currentSchemaVersion, dedupeByNaturalKey } from "./schema.mjs";
|
|
2
|
+
import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
|
|
3
3
|
import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
|
|
4
|
+
import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
|
|
4
5
|
import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
|
|
5
6
|
import { sqlEscape } from "../sql-bind.mjs";
|
|
6
7
|
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
@@ -13,7 +14,7 @@ async function encodeBytes(db, table, rows) {
|
|
|
13
14
|
await db.registerFileBuffer(inName, jsonBytes);
|
|
14
15
|
registered.push(inName);
|
|
15
16
|
try {
|
|
16
|
-
const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`;
|
|
17
|
+
const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`;
|
|
17
18
|
await db.query(sql);
|
|
18
19
|
registered.push(outName);
|
|
19
20
|
return await db.copyFileToBuffer(outName);
|
|
@@ -58,7 +59,7 @@ function createDuckDBCodec(factory) {
|
|
|
58
59
|
const outName = db.makeTempPath("parquet");
|
|
59
60
|
const fileList = inputUris.map((u) => `'${sqlEscape(u)}'`).join(", ");
|
|
60
61
|
try {
|
|
61
|
-
await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
|
|
62
|
+
await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`);
|
|
62
63
|
const bytes = await db.copyFileToBuffer(outName);
|
|
63
64
|
const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
|
|
64
65
|
const rowCount = Number(countRows[0]?.n ?? 0);
|
|
@@ -83,7 +84,7 @@ function createDuckDBCodec(factory) {
|
|
|
83
84
|
}
|
|
84
85
|
try {
|
|
85
86
|
const fileList = inNames.map((n) => `'${sqlEscape(n)}'`).join(", ");
|
|
86
|
-
await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
|
|
87
|
+
await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`);
|
|
87
88
|
registered.push(outName);
|
|
88
89
|
const bytes = await db.copyFileToBuffer(outName);
|
|
89
90
|
const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
|
|
@@ -162,9 +163,7 @@ function canonicalEmptyParquetSchema(table) {
|
|
|
162
163
|
}
|
|
163
164
|
function dateReplaceClause(table) {
|
|
164
165
|
if (!table) return "";
|
|
165
|
-
const dateCols = SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
|
|
166
|
-
if (dateCols.length === 0) return "";
|
|
167
|
-
return `REPLACE (${dateCols.map((n) => `strftime(CAST(${n} AS DATE), '%Y-%m-%d') AS ${n}`).join(", ")})`;
|
|
166
|
+
return dateReplaceClause$1(dateColumnsFor(table), "string");
|
|
168
167
|
}
|
|
169
168
|
function columnList(table) {
|
|
170
169
|
return SCHEMAS[table].columns.map((c) => c.name).join(", ");
|
|
@@ -272,6 +271,52 @@ async function gcOrphansImpl(deps, now, graceMs, opts = {}) {
|
|
|
272
271
|
}
|
|
273
272
|
return { deleted: retired.length + sweptOrphans + hourlyDeleted };
|
|
274
273
|
}
|
|
274
|
+
const PUSHABLE_COLUMN = { query: "query" };
|
|
275
|
+
function txLeaf(leaf, columns) {
|
|
276
|
+
if (leaf.operator !== "equals") return null;
|
|
277
|
+
const column = PUSHABLE_COLUMN[leaf.dimension];
|
|
278
|
+
if (!column || !columns.has(column)) return null;
|
|
279
|
+
return { [column]: { $eq: leaf.expression } };
|
|
280
|
+
}
|
|
281
|
+
function txExact(node, columns) {
|
|
282
|
+
const groupType = node._groupType ?? "and";
|
|
283
|
+
const leafParts = [];
|
|
284
|
+
for (const leaf of node._filters) {
|
|
285
|
+
const t = txLeaf(leaf, columns);
|
|
286
|
+
if (!t) return null;
|
|
287
|
+
leafParts.push(t);
|
|
288
|
+
}
|
|
289
|
+
if (groupType === "or") {
|
|
290
|
+
if (node._nestedGroups?.length || leafParts.length === 0) return null;
|
|
291
|
+
return leafParts.length === 1 ? leafParts[0] : { $or: leafParts };
|
|
292
|
+
}
|
|
293
|
+
const parts = leafParts;
|
|
294
|
+
for (const group of node._nestedGroups ?? []) {
|
|
295
|
+
const t = txExact(group, columns);
|
|
296
|
+
if (!t) return null;
|
|
297
|
+
parts.push(t);
|
|
298
|
+
}
|
|
299
|
+
if (parts.length === 0) return null;
|
|
300
|
+
return parts.length === 1 ? parts[0] : { $and: parts };
|
|
301
|
+
}
|
|
302
|
+
function extractParquetPushdown(state, table) {
|
|
303
|
+
const filter = state?.filter;
|
|
304
|
+
const schema = SCHEMAS[table];
|
|
305
|
+
if (!filter || !schema) return void 0;
|
|
306
|
+
const columns = new Set(schema.columns.map((c) => c.name));
|
|
307
|
+
if ((filter._groupType ?? "and") === "or") return txExact(filter, columns) ?? void 0;
|
|
308
|
+
const parts = [];
|
|
309
|
+
for (const leaf of filter._filters) {
|
|
310
|
+
const t = txLeaf(leaf, columns);
|
|
311
|
+
if (t) parts.push(t);
|
|
312
|
+
}
|
|
313
|
+
for (const group of filter._nestedGroups ?? []) {
|
|
314
|
+
const t = txExact(group, columns);
|
|
315
|
+
if (t) parts.push(t);
|
|
316
|
+
}
|
|
317
|
+
if (parts.length === 0) return void 0;
|
|
318
|
+
return parts.length === 1 ? parts[0] : { $and: parts };
|
|
319
|
+
}
|
|
275
320
|
const URL_PURGE_TABLES = ["pages", "page_queries"];
|
|
276
321
|
const MAX_DAY_BYTES = 100 * 1024 * 1024;
|
|
277
322
|
const URL_COLUMNS = /* @__PURE__ */ new Set();
|
|
@@ -463,6 +508,7 @@ function createStorageEngine(opts) {
|
|
|
463
508
|
dataSource,
|
|
464
509
|
table,
|
|
465
510
|
signal: opts.signal,
|
|
511
|
+
...opts.pushdownFilters ? { pushdownFilters: opts.pushdownFilters } : {},
|
|
466
512
|
...profiler ? { profiler } : {}
|
|
467
513
|
});
|
|
468
514
|
endExec?.({ rows: result.rows.length });
|
|
@@ -476,6 +522,7 @@ function createStorageEngine(opts) {
|
|
|
476
522
|
const plan = buildLogicalPlan(state, { regex: true });
|
|
477
523
|
const table = ctx.table ?? plan.dataset;
|
|
478
524
|
const resolved = compileLogicalQueryPlan(plan, table);
|
|
525
|
+
const pushdown = extractParquetPushdown(state, table);
|
|
479
526
|
return runSQL({
|
|
480
527
|
ctx: {
|
|
481
528
|
userId: ctx.userId,
|
|
@@ -489,6 +536,7 @@ function createStorageEngine(opts) {
|
|
|
489
536
|
sql: resolved.sql,
|
|
490
537
|
params: resolved.params,
|
|
491
538
|
signal: ctx.signal,
|
|
539
|
+
...pushdown ? { pushdownFilters: { FILES: pushdown } } : {},
|
|
492
540
|
...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {},
|
|
493
541
|
...ctx.profiler ? { profiler: ctx.profiler } : {}
|
|
494
542
|
});
|
package/dist/_chunks/index.d.mts
CHANGED
|
@@ -60,8 +60,12 @@ interface CreateSqlQuerySourceOptions<TKey extends string> {
|
|
|
60
60
|
execute: (sql: string, params: unknown[]) => Promise<QueryRow[]>;
|
|
61
61
|
/** Tenant id for multi-tenant dialects; forwarded to `resolveToSQL`. */
|
|
62
62
|
siteId?: string | number;
|
|
63
|
-
/**
|
|
64
|
-
|
|
63
|
+
/**
|
|
64
|
+
* Search-type scope for multi-tenant dialects; forwarded to `resolveToSQL`.
|
|
65
|
+
* `number` = int-encoded code (`SEARCH_TYPE_INT`) for INT `search_type`
|
|
66
|
+
* catalogs (bound bare so the int partition prunes); `string` otherwise.
|
|
67
|
+
*/
|
|
68
|
+
searchType?: string | number;
|
|
65
69
|
/** Additional capability flags merged on top of `adapter.capabilities`. */
|
|
66
70
|
extraCapabilities?: Partial<SourceCapabilities>;
|
|
67
71
|
}
|
|
@@ -175,8 +175,8 @@ function nextTableBitSize(count, len, root_bits) {
|
|
|
175
175
|
}
|
|
176
176
|
function buildHuffmanTable(root_table, table, root_bits, code_lengths, code_lengths_size) {
|
|
177
177
|
const start_table = table;
|
|
178
|
-
const count = new Int32Array(16);
|
|
179
|
-
const offset = new Int32Array(16);
|
|
178
|
+
const count = /* @__PURE__ */ new Int32Array(16);
|
|
179
|
+
const offset = /* @__PURE__ */ new Int32Array(16);
|
|
180
180
|
const sorted = new Int32Array(code_lengths_size);
|
|
181
181
|
for (let i = 0; i < code_lengths_size; i++) count[code_lengths[i]]++;
|
|
182
182
|
offset[1] = 0;
|
|
@@ -220,7 +220,7 @@ function readHuffmanCode(alphabet_size, tables, table, br) {
|
|
|
220
220
|
if (simple_code_or_skip === 1) {
|
|
221
221
|
let max_bits_counter = alphabet_size - 1;
|
|
222
222
|
let max_bits = 0;
|
|
223
|
-
const symbols = new Int32Array(4);
|
|
223
|
+
const symbols = /* @__PURE__ */ new Int32Array(4);
|
|
224
224
|
const num_symbols = br.readBits(2) + 1;
|
|
225
225
|
while (max_bits_counter) {
|
|
226
226
|
max_bits_counter >>= 1;
|
|
@@ -505,7 +505,7 @@ const fixedDistanceExtraBits = new Uint8Array([
|
|
|
505
505
|
0
|
|
506
506
|
]);
|
|
507
507
|
function freb(eb, start) {
|
|
508
|
-
const base = new Uint16Array(31);
|
|
508
|
+
const base = /* @__PURE__ */ new Uint16Array(31);
|
|
509
509
|
for (let i = 0; i < 31; i++) base[i] = start += 1 << eb[i - 1];
|
|
510
510
|
const rev = new Int32Array(base[30]);
|
|
511
511
|
for (let i = 1; i < 30; i++) for (let j = base[i]; j < base[i + 1]; ++j) rev[j] = j - base[i] << 5 | i;
|
|
@@ -518,7 +518,7 @@ const { base: fixedLength, rev: revfl } = freb(fixedLengthExtraBits, 2);
|
|
|
518
518
|
fixedLength[28] = 258;
|
|
519
519
|
revfl[258] = 28;
|
|
520
520
|
const { base: fixedDistance } = freb(fixedDistanceExtraBits, 0);
|
|
521
|
-
const rev = new Uint16Array(32768);
|
|
521
|
+
const rev = /* @__PURE__ */ new Uint16Array(32768);
|
|
522
522
|
for (let i = 0; i < 32768; i++) {
|
|
523
523
|
let x = (i & 43690) >> 1 | (i & 21845) << 1;
|
|
524
524
|
x = (x & 52428) >> 2 | (x & 13107) << 2;
|
|
@@ -546,12 +546,12 @@ function huffMap(cd, maxBits, r) {
|
|
|
546
546
|
}
|
|
547
547
|
return co;
|
|
548
548
|
}
|
|
549
|
-
const fixedLengthTree = new Uint8Array(288);
|
|
549
|
+
const fixedLengthTree = /* @__PURE__ */ new Uint8Array(288);
|
|
550
550
|
for (let i = 0; i < 144; i++) fixedLengthTree[i] = 8;
|
|
551
551
|
for (let i = 144; i < 256; i++) fixedLengthTree[i] = 9;
|
|
552
552
|
for (let i = 256; i < 280; i++) fixedLengthTree[i] = 7;
|
|
553
553
|
for (let i = 280; i < 288; i++) fixedLengthTree[i] = 8;
|
|
554
|
-
const fixedDistanceTree = new Uint8Array(32);
|
|
554
|
+
const fixedDistanceTree = /* @__PURE__ */ new Uint8Array(32);
|
|
555
555
|
for (let i = 0; i < 32; i++) fixedDistanceTree[i] = 5;
|
|
556
556
|
const fixedLengthMap = /*#__PURE__*/ huffMap(fixedLengthTree, 9, 1);
|
|
557
557
|
const fixedDistanceMap = /*#__PURE__*/ huffMap(fixedDistanceTree, 5, 1);
|
|
@@ -2420,7 +2420,7 @@ function gzipStart(input, i) {
|
|
|
2420
2420
|
return i + (flag & 2);
|
|
2421
2421
|
}
|
|
2422
2422
|
function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
|
|
2423
|
-
let out = output ?? new Uint8Array(1024);
|
|
2423
|
+
let out = output ?? /* @__PURE__ */ new Uint8Array(1024);
|
|
2424
2424
|
if (!(input.length - inputIndex)) return out;
|
|
2425
2425
|
const payloadStart = gzipStart(input, inputIndex);
|
|
2426
2426
|
if (payloadStart === input.length - 8) return out;
|
|
@@ -2465,7 +2465,7 @@ function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
|
|
|
2465
2465
|
const tl = hLiteral + bits(input, pos + 5, 31) + 1;
|
|
2466
2466
|
pos += 14;
|
|
2467
2467
|
const lengthDistanceTree = new Uint8Array(tl);
|
|
2468
|
-
const codeLengthTree = new Uint8Array(19);
|
|
2468
|
+
const codeLengthTree = /* @__PURE__ */ new Uint8Array(19);
|
|
2469
2469
|
for (let i = 0; i < hcLengths; ++i) codeLengthTree[codeLengthIndexMap[i]] = bits(input, pos + i * 3, 7);
|
|
2470
2470
|
pos += hcLengths * 3;
|
|
2471
2471
|
const codeLengthBits = Math.max(...codeLengthTree);
|
|
@@ -962,7 +962,7 @@ function uuidToBytes(value, label) {
|
|
|
962
962
|
if (typeof value !== "string") throw new Error(`expected ${label}`);
|
|
963
963
|
const hex = value.toLowerCase().replace(/-/g, "");
|
|
964
964
|
if (!/^[0-9a-f]{32}$/.test(hex)) throw new Error(`expected ${label}`);
|
|
965
|
-
const bytes = new Uint8Array(16);
|
|
965
|
+
const bytes = /* @__PURE__ */ new Uint8Array(16);
|
|
966
966
|
for (let i = 0; i < bytes.length; i++) bytes[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
|
|
967
967
|
return bytes;
|
|
968
968
|
}
|
|
@@ -1074,7 +1074,7 @@ function bucketBytes(value, sourceType) {
|
|
|
1074
1074
|
else if (t === "timestamp" || t === "timestamptz") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value);
|
|
1075
1075
|
else if (t === "timestamp_ns" || t === "timestamptz_ns") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value) / 1000n;
|
|
1076
1076
|
else v = typeof value === "bigint" ? value : BigInt(value);
|
|
1077
|
-
const out = new Uint8Array(8);
|
|
1077
|
+
const out = /* @__PURE__ */ new Uint8Array(8);
|
|
1078
1078
|
new DataView(out.buffer).setBigInt64(0, v, true);
|
|
1079
1079
|
return out;
|
|
1080
1080
|
}
|
|
@@ -1835,7 +1835,7 @@ function avroWrite({ writer, schema, records, blockSize = 512, metadata }) {
|
|
|
1835
1835
|
writer.appendBytes(vb);
|
|
1836
1836
|
}
|
|
1837
1837
|
writer.appendVarInt(0);
|
|
1838
|
-
const sync = new Uint8Array(16);
|
|
1838
|
+
const sync = /* @__PURE__ */ new Uint8Array(16);
|
|
1839
1839
|
for (let i = 0; i < 16; i++) sync[i] = Math.random() * 256 | 0;
|
|
1840
1840
|
writer.appendBytes(sync);
|
|
1841
1841
|
for (let i = 0; i < records.length; i += blockSize) {
|
|
@@ -1940,7 +1940,7 @@ function appendZigZag64(writer, v) {
|
|
|
1940
1940
|
function uuidStringToBytes$1(value) {
|
|
1941
1941
|
const hex = value.toLowerCase().replace(/-/g, "");
|
|
1942
1942
|
if (!/^[0-9a-f]{32}$/.test(hex)) throw new Error("expected uuid string");
|
|
1943
|
-
const bytes = new Uint8Array(16);
|
|
1943
|
+
const bytes = /* @__PURE__ */ new Uint8Array(16);
|
|
1944
1944
|
for (let i = 0; i < 16; i++) bytes[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
|
|
1945
1945
|
return bytes;
|
|
1946
1946
|
}
|
|
@@ -2531,7 +2531,7 @@ function twosComplementMinBigEndian(value) {
|
|
|
2531
2531
|
function uuidStringToBytes(s) {
|
|
2532
2532
|
const hex = s.replace(/-/g, "");
|
|
2533
2533
|
if (hex.length !== 32) return void 0;
|
|
2534
|
-
const out = new Uint8Array(16);
|
|
2534
|
+
const out = /* @__PURE__ */ new Uint8Array(16);
|
|
2535
2535
|
for (let i = 0; i < 16; i++) {
|
|
2536
2536
|
const byte = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
|
|
2537
2537
|
if (Number.isNaN(byte)) return void 0;
|
|
@@ -3286,7 +3286,7 @@ function resolveParquetCodec(value) {
|
|
|
3286
3286
|
}
|
|
3287
3287
|
function newSnapshotId(metadata) {
|
|
3288
3288
|
const used = new Set((metadata?.snapshots ?? []).map((s) => BigInt(s["snapshot-id"])));
|
|
3289
|
-
const arr = new BigInt64Array(1);
|
|
3289
|
+
const arr = /* @__PURE__ */ new BigInt64Array(1);
|
|
3290
3290
|
for (let attempt = 0; attempt < 32; attempt++) {
|
|
3291
3291
|
globalThis.crypto.getRandomValues(arr);
|
|
3292
3292
|
const masked = arr[0] & 9007199254740991n;
|
|
@@ -1087,12 +1087,12 @@ declare const hourly_pages: import("drizzle-orm/pg-core").PgTableWithColumns<{
|
|
|
1087
1087
|
identity: undefined;
|
|
1088
1088
|
generated: undefined;
|
|
1089
1089
|
}>;
|
|
1090
|
-
hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").
|
|
1090
|
+
hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgIntegerBuilder>, {
|
|
1091
1091
|
name: string;
|
|
1092
1092
|
tableName: "hourly_pages";
|
|
1093
|
-
dataType: "
|
|
1094
|
-
data:
|
|
1095
|
-
driverParam: string;
|
|
1093
|
+
dataType: "number int32";
|
|
1094
|
+
data: number;
|
|
1095
|
+
driverParam: string | number;
|
|
1096
1096
|
notNull: true;
|
|
1097
1097
|
hasDefault: false;
|
|
1098
1098
|
isPrimaryKey: false;
|
|
@@ -2193,12 +2193,12 @@ declare const drizzleSchema: {
|
|
|
2193
2193
|
identity: undefined;
|
|
2194
2194
|
generated: undefined;
|
|
2195
2195
|
}>;
|
|
2196
|
-
hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").
|
|
2196
|
+
hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgIntegerBuilder>, {
|
|
2197
2197
|
name: string;
|
|
2198
2198
|
tableName: "hourly_pages";
|
|
2199
|
-
dataType: "
|
|
2200
|
-
data:
|
|
2201
|
-
driverParam: string;
|
|
2199
|
+
dataType: "number int32";
|
|
2200
|
+
data: number;
|
|
2201
|
+
driverParam: string | number;
|
|
2202
2202
|
notNull: true;
|
|
2203
2203
|
hasDefault: false;
|
|
2204
2204
|
isPrimaryKey: false;
|
|
@@ -2236,6 +2236,13 @@ declare const TABLE_METADATA: Record<TableName, {
|
|
|
2236
2236
|
declare const SCHEMAS: Record<TableName, TableSchema>;
|
|
2237
2237
|
declare function currentSchemaVersion(table: TableName): number;
|
|
2238
2238
|
declare function schemaFor(table: TableName): TableSchema;
|
|
2239
|
+
/**
|
|
2240
|
+
* DATE column names for a table. The single schema-derived source every read
|
|
2241
|
+
* path uses to build the legacy-VARCHAR date canonicalization (see
|
|
2242
|
+
* `dateReplaceClause` in `./sql-fragments`), so the engine codec and the CLI
|
|
2243
|
+
* `dump`/`export` commands agree on which columns to cast.
|
|
2244
|
+
*/
|
|
2245
|
+
declare function dateColumnsFor(table: TableName): string[];
|
|
2239
2246
|
declare function allTables(): readonly TableName[];
|
|
2240
2247
|
declare function inferTable(dimensions: readonly string[]): TableName;
|
|
2241
2248
|
/**
|
|
@@ -2259,4 +2266,4 @@ declare function naturalKeyColumns(table: TableName): readonly string[];
|
|
|
2259
2266
|
*/
|
|
2260
2267
|
declare function dedupeByNaturalKey(table: TableName, rows: readonly Row[]): Row[];
|
|
2261
2268
|
declare function dimensionToColumn(dim: string, _table: TableName): string;
|
|
2262
|
-
export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
|
2269
|
+
export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
package/dist/_chunks/schema.mjs
CHANGED
|
@@ -74,7 +74,7 @@ const search_appearance_page_queries = pgTable("search_appearance_page_queries",
|
|
|
74
74
|
});
|
|
75
75
|
const hourly_pages = pgTable("hourly_pages", {
|
|
76
76
|
url: varchar("url").notNull(),
|
|
77
|
-
hour:
|
|
77
|
+
hour: integer("hour").notNull(),
|
|
78
78
|
date: dateCol(),
|
|
79
79
|
...metricCols()
|
|
80
80
|
});
|
|
@@ -181,7 +181,7 @@ const TABLE_METADATA = {
|
|
|
181
181
|
"date",
|
|
182
182
|
"hour"
|
|
183
183
|
],
|
|
184
|
-
version:
|
|
184
|
+
version: 2
|
|
185
185
|
}
|
|
186
186
|
};
|
|
187
187
|
function pgSqlTypeToColumnType(sqlType) {
|
|
@@ -226,6 +226,9 @@ function currentSchemaVersion(table) {
|
|
|
226
226
|
function schemaFor(table) {
|
|
227
227
|
return SCHEMAS[table];
|
|
228
228
|
}
|
|
229
|
+
function dateColumnsFor(table) {
|
|
230
|
+
return SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
|
|
231
|
+
}
|
|
229
232
|
function allTables() {
|
|
230
233
|
return METRIC_TABLES;
|
|
231
234
|
}
|
|
@@ -260,4 +263,4 @@ function dimensionToColumn(dim, _table) {
|
|
|
260
263
|
if (dim === "queryCanonical") return "query_canonical";
|
|
261
264
|
return dim;
|
|
262
265
|
}
|
|
263
|
-
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
|
266
|
+
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
package/dist/_chunks/schema2.mjs
CHANGED
|
@@ -10,6 +10,15 @@ const ICEBERG_TABLES = [
|
|
|
10
10
|
"search_appearance_queries",
|
|
11
11
|
"search_appearance_page_queries"
|
|
12
12
|
];
|
|
13
|
+
const SEARCH_TYPE_INT = {
|
|
14
|
+
web: 1,
|
|
15
|
+
image: 2,
|
|
16
|
+
video: 3,
|
|
17
|
+
news: 4,
|
|
18
|
+
discover: 5,
|
|
19
|
+
googleNews: 6
|
|
20
|
+
};
|
|
21
|
+
const INT_SEARCH_TYPE = Object.fromEntries(Object.entries(SEARCH_TYPE_INT).map(([k, v]) => [v, k]));
|
|
13
22
|
const ICEBERG_PARTITION_COLUMNS = [{
|
|
14
23
|
name: "site_id",
|
|
15
24
|
type: "STRING",
|
|
@@ -21,6 +30,20 @@ const ICEBERG_PARTITION_COLUMNS = [{
|
|
|
21
30
|
required: true,
|
|
22
31
|
fieldId: 2
|
|
23
32
|
}];
|
|
33
|
+
function icebergPartitionColumns(encoding = "string") {
|
|
34
|
+
if (encoding === "string") return ICEBERG_PARTITION_COLUMNS;
|
|
35
|
+
return [{
|
|
36
|
+
name: "site_id",
|
|
37
|
+
type: "INT",
|
|
38
|
+
required: true,
|
|
39
|
+
fieldId: 1
|
|
40
|
+
}, {
|
|
41
|
+
name: "search_type",
|
|
42
|
+
type: "INT",
|
|
43
|
+
required: true,
|
|
44
|
+
fieldId: 2
|
|
45
|
+
}];
|
|
46
|
+
}
|
|
24
47
|
const ICEBERG_FIELD_ID_BASE = 3;
|
|
25
48
|
const ICEBERG_PARTITION_SPEC = [
|
|
26
49
|
{
|
|
@@ -48,7 +71,7 @@ function mapColumnType(t) {
|
|
|
48
71
|
case "DATE": return "DATE";
|
|
49
72
|
}
|
|
50
73
|
}
|
|
51
|
-
function icebergTableSpec(table) {
|
|
74
|
+
function icebergTableSpec(table, encoding = "string") {
|
|
52
75
|
const base = SCHEMAS[table];
|
|
53
76
|
const dataColumns = base.columns.map((col, i) => ({
|
|
54
77
|
name: col.name,
|
|
@@ -58,7 +81,7 @@ function icebergTableSpec(table) {
|
|
|
58
81
|
}));
|
|
59
82
|
return {
|
|
60
83
|
table,
|
|
61
|
-
columns: [...ICEBERG_PARTITION_COLUMNS, ...dataColumns],
|
|
84
|
+
columns: [...icebergPartitionColumns(encoding), ...dataColumns],
|
|
62
85
|
partitionSpec: ICEBERG_PARTITION_SPEC,
|
|
63
86
|
identityColumns: [
|
|
64
87
|
"site_id",
|
|
@@ -68,6 +91,10 @@ function icebergTableSpec(table) {
|
|
|
68
91
|
};
|
|
69
92
|
}
|
|
70
93
|
const ICEBERG_SCHEMAS = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t)]));
|
|
94
|
+
const ICEBERG_SCHEMAS_INT = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t, "int")]));
|
|
95
|
+
function icebergSchemasFor(encoding = "string") {
|
|
96
|
+
return encoding === "int" ? ICEBERG_SCHEMAS_INT : ICEBERG_SCHEMAS;
|
|
97
|
+
}
|
|
71
98
|
const ICEBERG_TABLE_SET = new Set(ICEBERG_TABLES);
|
|
72
99
|
function isIcebergTable(table) {
|
|
73
100
|
return ICEBERG_TABLE_SET.has(table);
|
|
@@ -76,4 +103,4 @@ function assertIcebergTable(table) {
|
|
|
76
103
|
if (!isIcebergTable(table)) throw new Error(`Unknown Iceberg table '${table}'. Expected one of: ${ICEBERG_TABLES.join(", ")}`);
|
|
77
104
|
return table;
|
|
78
105
|
}
|
|
79
|
-
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable };
|
|
106
|
+
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable };
|
package/dist/_chunks/sink.d.mts
CHANGED
|
@@ -50,6 +50,33 @@ interface IcebergColumn {
|
|
|
50
50
|
*/
|
|
51
51
|
fieldId: number;
|
|
52
52
|
}
|
|
53
|
+
/**
|
|
54
|
+
* Partition-key encoding for the two identity columns (`site_id`, `search_type`).
|
|
55
|
+
*
|
|
56
|
+
* - `'string'` (default, legacy): both columns are STRING. Correct, but R2 SQL's
|
|
57
|
+
* string min/max statistics are truncated in predicate pushdown, so a bare
|
|
58
|
+
* `WHERE site_id='<uuid>'` UNDERCOUNTS — callers must CONCAT(col,'') to stay
|
|
59
|
+
* correct, which defeats partition pruning.
|
|
60
|
+
* - `'int'`: BOTH `site_id` and `search_type` are INT. Integer statistics are
|
|
61
|
+
* fixed-width and never truncated, so `WHERE site_id=<n>` is both correct AND
|
|
62
|
+
* prunes (empirically confirmed 2026-06-19, gscdump.com probe-int64-partition;
|
|
63
|
+
* INT equality proven via the search_type column in the engine e2e canary). A
|
|
64
|
+
* small INT site_id is ample (≪ 2.1B sites) — no LONG/BigInt needed. The caller
|
|
65
|
+
* maps the UUID `site_id` ↔ int (app-owned, per-tenant serial) and uses
|
|
66
|
+
* {@link SEARCH_TYPE_INT} for `search_type` (engine-owned, fixed enum).
|
|
67
|
+
*
|
|
68
|
+
* New per-team catalogs are provisioned `'int'`; existing catalogs stay
|
|
69
|
+
* `'string'`. Purely additive: `'string'` is the default everywhere so existing
|
|
70
|
+
* tables, writers, and readers are unchanged.
|
|
71
|
+
*/
|
|
72
|
+
type PartitionKeyEncoding = 'string' | 'int';
|
|
73
|
+
/**
|
|
74
|
+
* Stable `search_type` enum → int map for `'int'`-encoded catalogs. Engine-owned
|
|
75
|
+
* and FROZEN: never renumber or reuse an id (it's the on-disk partition value).
|
|
76
|
+
*/
|
|
77
|
+
declare const SEARCH_TYPE_INT: Record<SearchType, number>;
|
|
78
|
+
/** Reverse of {@link SEARCH_TYPE_INT} — int → `search_type`, for read-result mapping. */
|
|
79
|
+
declare const INT_SEARCH_TYPE: Record<number, SearchType>;
|
|
53
80
|
/** Iceberg partition transform applied to a source column. */
|
|
54
81
|
type IcebergPartitionTransform = 'identity' | 'month';
|
|
55
82
|
interface IcebergPartitionField {
|
|
@@ -82,6 +109,15 @@ interface IcebergTableSpec {
|
|
|
82
109
|
* contiguously from id 3 (see `ICEBERG_FIELD_ID_BASE`).
|
|
83
110
|
*/
|
|
84
111
|
declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
|
|
112
|
+
/**
|
|
113
|
+
* The two partition-identity columns for a given {@link PartitionKeyEncoding}.
|
|
114
|
+
* `'string'` returns {@link ICEBERG_PARTITION_COLUMNS} verbatim; `'int'` swaps
|
|
115
|
+
* BOTH to INT — `site_id` (the app's small `user_sites.int_id`; ≪ 2.1B sites, so
|
|
116
|
+
* INT is ample) and `search_type` (its fixed enum code). Integer identity columns
|
|
117
|
+
* avoid R2 SQL's truncated-string-stats equality undercount and restore pruning.
|
|
118
|
+
* Field ids are unchanged (1, 2) — only the column types differ.
|
|
119
|
+
*/
|
|
120
|
+
declare function icebergPartitionColumns(encoding?: PartitionKeyEncoding): readonly IcebergColumn[];
|
|
85
121
|
/**
|
|
86
122
|
* First field id used for per-table (non-partition) columns — immediately
|
|
87
123
|
* after the two partition-identity columns (`site_id`=1, `search_type`=2).
|
|
@@ -105,9 +141,13 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
|
|
|
105
141
|
* CONTRACT NOTE: implementation agents must treat the RETURNED VALUE as the
|
|
106
142
|
* source of truth — do not hand-list columns elsewhere.
|
|
107
143
|
*/
|
|
108
|
-
declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
|
|
109
|
-
/** All Iceberg table specs, keyed by table name. */
|
|
144
|
+
declare function icebergTableSpec(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergTableSpec;
|
|
145
|
+
/** All Iceberg table specs (legacy `'string'` encoding), keyed by table name. */
|
|
110
146
|
declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
|
|
147
|
+
/** All Iceberg table specs in `'int'` encoding (INT site_id + INT search_type). */
|
|
148
|
+
declare const ICEBERG_SCHEMAS_INT: Record<IcebergTableName, IcebergTableSpec>;
|
|
149
|
+
/** Table specs for the given encoding (`'string'` default). */
|
|
150
|
+
declare function icebergSchemasFor(encoding?: PartitionKeyEncoding): Record<IcebergTableName, IcebergTableSpec>;
|
|
111
151
|
/** True when `table` is one of the canonical {@link ICEBERG_TABLES}. */
|
|
112
152
|
declare function isIcebergTable(table: string): table is IcebergTableName;
|
|
113
153
|
/**
|
|
@@ -172,14 +212,14 @@ interface IcebergConnection {
|
|
|
172
212
|
* `ICEBERG_SCHEMAS` contract. Field ids are advisory — R2 Data Catalog
|
|
173
213
|
* re-assigns them on `createTable` (see `ICEBERG_FIELD_ID_BASE`).
|
|
174
214
|
*/
|
|
175
|
-
declare function icebergSchemaFor(table: IcebergTableName): IcebergSchema;
|
|
215
|
+
declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSchema;
|
|
176
216
|
/**
|
|
177
217
|
* Build the icebird `PartitionSpec` for one of the 5 fact tables: the locked
|
|
178
218
|
* spec `identity(site_id) + identity(search_type) + month(date)`. Each
|
|
179
219
|
* partition field's `source-id` is resolved to the real column field id from
|
|
180
220
|
* {@link icebergSchemaFor}.
|
|
181
221
|
*/
|
|
182
|
-
declare function icebergPartitionSpecFor(table: IcebergTableName): IcebergPartitionSpec;
|
|
222
|
+
declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
|
|
183
223
|
/** Options for {@link connectIcebergCatalog}. */
|
|
184
224
|
interface ConnectIcebergOptions {
|
|
185
225
|
/**
|
|
@@ -264,7 +304,7 @@ declare function ensureIcebergNamespace(conn: IcebergConnection): Promise<void>;
|
|
|
264
304
|
* than thrown so a partial run is observable; "table already exists" surfaces
|
|
265
305
|
* as a failed result. Used by the app's one-off provisioning script.
|
|
266
306
|
*/
|
|
267
|
-
declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[]): Promise<IcebergTableOpResult[]>;
|
|
307
|
+
declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[], encoding?: PartitionKeyEncoding): Promise<IcebergTableOpResult[]>;
|
|
268
308
|
/**
|
|
269
309
|
* List the table names currently in the catalog namespace.
|
|
270
310
|
*
|
|
@@ -284,10 +324,16 @@ interface IcebergListedDataFile {
|
|
|
284
324
|
}
|
|
285
325
|
interface ListIcebergDataFilesOptions {
|
|
286
326
|
table: IcebergTableName;
|
|
287
|
-
/** Partition identity column. */
|
|
288
|
-
siteId: string;
|
|
289
|
-
/** Partition identity column. */
|
|
290
|
-
searchType: string;
|
|
327
|
+
/** Partition identity column. `number` for `'int'`-encoded catalogs. */
|
|
328
|
+
siteId: string | number;
|
|
329
|
+
/** Partition identity column. `number` (int code) for `'int'`-encoded catalogs. */
|
|
330
|
+
searchType: string | number;
|
|
331
|
+
/**
|
|
332
|
+
* Partition-key encoding of the catalog. `'int'` changes how manifest-summary
|
|
333
|
+
* bounds are decoded (int bytes vs UTF-8) and how the per-file partition value
|
|
334
|
+
* is compared. Defaults to `'string'`.
|
|
335
|
+
*/
|
|
336
|
+
encoding?: PartitionKeyEncoding;
|
|
291
337
|
/**
|
|
292
338
|
* Inclusive date range. Every month touched by `[start, end]` is scanned;
|
|
293
339
|
* `month(date)` is the third partition transform.
|
|
@@ -437,6 +483,14 @@ interface IcebergAppendSinkOptions extends SinkOptions {
|
|
|
437
483
|
* uses the defaults; tests inject a synchronous `sleep`.
|
|
438
484
|
*/
|
|
439
485
|
commitRetry?: CommitRetryOptions;
|
|
486
|
+
/**
|
|
487
|
+
* Partition-key encoding (default `'string'`). `'int'` writes BOTH `site_id`
|
|
488
|
+
* and `search_type` as INT — the caller MUST pass the numeric `site_id` (a
|
|
489
|
+
* numeric string is fine; it's `Number()`-coerced) in `slice.ctx.siteId`. A
|
|
490
|
+
* small INT is ample (≪ 2.1B sites), so no LONG/BigInt is involved. See
|
|
491
|
+
* {@link import('./iceberg/schema').PartitionKeyEncoding}.
|
|
492
|
+
*/
|
|
493
|
+
encoding?: PartitionKeyEncoding;
|
|
440
494
|
}
|
|
441
495
|
/** `LocalIcebergSink` options — points at the local Iceberg REST catalog. */
|
|
442
496
|
interface LocalIcebergSinkOptions extends SinkOptions {
|
|
@@ -447,4 +501,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
|
|
|
447
501
|
/** S3-compatible warehouse location (POC: MinIO). */
|
|
448
502
|
warehouse: string;
|
|
449
503
|
}
|
|
450
|
-
export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
504
|
+
export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { ParquetQueryFilter } from "hyparquet";
|
|
1
2
|
import { BuilderState, SearchType, SearchType as SearchType$1 } from "gscdump/query";
|
|
2
3
|
import { Grain, Grain as Grain$1, Row, Row as Row$1, TableName, TableName as TableName$1, TenantCtx, TenantCtx as TenantCtx$1 } from "@gscdump/contracts";
|
|
3
4
|
/**
|
|
@@ -397,6 +398,15 @@ interface QueryExecuteOptions {
|
|
|
397
398
|
* the page_queries schema, not the analyzer's primary `table`.
|
|
398
399
|
*/
|
|
399
400
|
placeholderTables?: Record<string, TableName>;
|
|
401
|
+
/**
|
|
402
|
+
* Per-placeholder row-group pushdown filter, derived from the query's
|
|
403
|
+
* structured filter (see `extractParquetPushdown`). A pure-JS decode executor
|
|
404
|
+
* MAY pass it to the parquet reader to prune row groups and shrink the rows
|
|
405
|
+
* it materialises before the SQL WHERE re-applies. Pure optimization: the
|
|
406
|
+
* filter is a superset of the final predicate, so an executor that ignores it
|
|
407
|
+
* (e.g. native DuckDB, which pushes from the SQL itself) stays correct.
|
|
408
|
+
*/
|
|
409
|
+
pushdownFilters?: Record<string, ParquetQueryFilter>;
|
|
400
410
|
dataSource: DataSource;
|
|
401
411
|
table: TableName;
|
|
402
412
|
signal?: AbortSignal;
|
|
@@ -475,6 +485,12 @@ interface RunSQLOptions {
|
|
|
475
485
|
* Undefined keeps the legacy cross-type union.
|
|
476
486
|
*/
|
|
477
487
|
searchType?: SearchType;
|
|
488
|
+
/**
|
|
489
|
+
* Per-placeholder parquet pushdown filter, forwarded verbatim to the
|
|
490
|
+
* executor. Keyed by fileSet name (matching `fileSets`). See
|
|
491
|
+
* `QueryExecuteOptions.pushdownFilters` and `extractParquetPushdown`.
|
|
492
|
+
*/
|
|
493
|
+
pushdownFilters?: Record<string, ParquetQueryFilter>;
|
|
478
494
|
/**
|
|
479
495
|
* Optional read-path profiler. `runSQL` emits `manifest.list` +
|
|
480
496
|
* `executor.execute` spans and forwards it into the executor for the
|
package/dist/_chunks/types.d.mts
CHANGED
|
@@ -30,8 +30,13 @@ interface ResolverOptions<TableKey extends string = string> {
|
|
|
30
30
|
adapter: ResolverAdapter<TableKey>;
|
|
31
31
|
/** Optional site scope. Required for multi-tenant D1; omitted for parquet. */
|
|
32
32
|
siteId?: string | number;
|
|
33
|
-
/**
|
|
34
|
-
|
|
33
|
+
/**
|
|
34
|
+
* Optional searchType scope. Required for multi-tenant Iceberg; omitted for
|
|
35
|
+
* parquet. `number` is the int-encoded code (`SEARCH_TYPE_INT`) for catalogs
|
|
36
|
+
* whose `search_type` partition column is INT — bound bare (unquoted) so the
|
|
37
|
+
* int partition prunes; `string` for the default string-encoded catalogs.
|
|
38
|
+
*/
|
|
39
|
+
searchType?: string | number;
|
|
35
40
|
}
|
|
36
41
|
interface ResolvedSQL {
|
|
37
42
|
sql: string;
|
|
@@ -8,6 +8,7 @@ import { tmpdir } from "node:os";
|
|
|
8
8
|
import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
|
|
9
9
|
const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
|
|
10
10
|
let singleton = null;
|
|
11
|
+
let singletonOpts = null;
|
|
11
12
|
function bundles() {
|
|
12
13
|
return {
|
|
13
14
|
mvp: {
|
|
@@ -29,11 +30,19 @@ async function initialize(opts) {
|
|
|
29
30
|
conn: db.connect()
|
|
30
31
|
};
|
|
31
32
|
}
|
|
33
|
+
function getSingleton(opts) {
|
|
34
|
+
if (!singleton) {
|
|
35
|
+
singleton = initialize(opts);
|
|
36
|
+
singletonOpts = opts;
|
|
37
|
+
}
|
|
38
|
+
return singleton;
|
|
39
|
+
}
|
|
32
40
|
function createNodeDuckDBHandle(opts = {}) {
|
|
33
|
-
if (
|
|
41
|
+
if (singleton && opts.verbose !== void 0 && opts.verbose !== (singletonOpts?.verbose ?? false)) console.warn(`[gscdump] createNodeDuckDBHandle: ignoring verbose=${opts.verbose} — a shared DuckDB instance was already initialized with verbose=${singletonOpts?.verbose ?? false}. Call resetNodeDuckDB() before re-initializing to change it.`);
|
|
42
|
+
getSingleton(opts);
|
|
34
43
|
return {
|
|
35
44
|
async query(sql, params) {
|
|
36
|
-
const { conn } = await
|
|
45
|
+
const { conn } = await getSingleton(opts);
|
|
37
46
|
if (!params || params.length === 0) return arrowToRows(conn.query(sql));
|
|
38
47
|
const stmt = conn.prepare(sql);
|
|
39
48
|
try {
|
|
@@ -43,15 +52,15 @@ function createNodeDuckDBHandle(opts = {}) {
|
|
|
43
52
|
}
|
|
44
53
|
},
|
|
45
54
|
async registerFileBuffer(name, bytes) {
|
|
46
|
-
const { db } = await
|
|
55
|
+
const { db } = await getSingleton(opts);
|
|
47
56
|
db.registerFileBuffer(name, bytes);
|
|
48
57
|
},
|
|
49
58
|
async copyFileToBuffer(name) {
|
|
50
|
-
const { db } = await
|
|
59
|
+
const { db } = await getSingleton(opts);
|
|
51
60
|
return db.copyFileToBuffer(name);
|
|
52
61
|
},
|
|
53
62
|
async dropFiles(names) {
|
|
54
|
-
const { db } = await
|
|
63
|
+
const { db } = await getSingleton(opts);
|
|
55
64
|
for (const name of names) {
|
|
56
65
|
try {
|
|
57
66
|
db.dropFile(name);
|
|
@@ -69,9 +78,12 @@ function createNodeDuckDBHandle(opts = {}) {
|
|
|
69
78
|
function resetNodeDuckDB() {
|
|
70
79
|
const pending = singleton;
|
|
71
80
|
singleton = null;
|
|
81
|
+
singletonOpts = null;
|
|
72
82
|
pending?.then(({ db, conn }) => {
|
|
73
83
|
conn.close();
|
|
74
84
|
db.reset();
|
|
75
|
-
}).catch(() => {
|
|
85
|
+
}).catch((err) => {
|
|
86
|
+
console.warn("[gscdump] resetNodeDuckDB: failed to release DuckDB instance", err);
|
|
87
|
+
});
|
|
76
88
|
}
|
|
77
89
|
export { createNodeDuckDBHandle, resetNodeDuckDB };
|
|
@@ -24,10 +24,18 @@ interface DecodeParquetOptions {
|
|
|
24
24
|
* per row group — pruning groups whose column statistics can't match and
|
|
25
25
|
* materialising only matching rows — so a filtered decode of a large file
|
|
26
26
|
* holds at most one row group plus the matches in memory, never the whole
|
|
27
|
-
* file. Use
|
|
28
|
-
* (
|
|
27
|
+
* file. Use when a caller needs a sub-slice of a big parquet keyed on a
|
|
28
|
+
* clustered column (a row group's min/max stats only prune if the predicate
|
|
29
|
+
* column is the physical sort key — see `sortKey`/`clusterKey`).
|
|
29
30
|
*/
|
|
30
31
|
filter?: ParquetQueryFilter;
|
|
32
|
+
/**
|
|
33
|
+
* Project a subset of columns. hyparquet only fetches + decodes the named
|
|
34
|
+
* column chunks, so a read that needs 2 of 14 columns skips the other 12's
|
|
35
|
+
* pages entirely. Omit to read every column. Names not present in the file
|
|
36
|
+
* are ignored by the reader.
|
|
37
|
+
*/
|
|
38
|
+
columns?: readonly string[];
|
|
31
39
|
}
|
|
32
40
|
declare function decodeParquetToRows(bytes: Uint8Array, opts?: DecodeParquetOptions): Promise<Row[]>;
|
|
33
41
|
interface HyparquetCodecOptions {
|
|
@@ -1,14 +1,83 @@
|
|
|
1
1
|
import { SCHEMAS, TABLE_METADATA, dedupeByNaturalKey } from "../_chunks/schema.mjs";
|
|
2
2
|
import { parquetReadObjects } from "hyparquet";
|
|
3
|
-
import {
|
|
3
|
+
import { ByteWriter, parquetWriteRows } from "hyparquet-writer";
|
|
4
4
|
const ROW_GROUP_SIZE = 25e3;
|
|
5
5
|
function basicTypeFor(colType) {
|
|
6
|
-
if (colType === "VARCHAR"
|
|
6
|
+
if (colType === "VARCHAR") return "STRING";
|
|
7
7
|
if (colType === "BIGINT") return "INT64";
|
|
8
8
|
if (colType === "INTEGER") return "INT32";
|
|
9
9
|
if (colType === "DOUBLE") return "DOUBLE";
|
|
10
|
+
if (colType === "DATE") return "INT32";
|
|
10
11
|
throw new Error(`unsupported column type for parquet encoding: ${colType}`);
|
|
11
12
|
}
|
|
13
|
+
const EPOCH_DAY_MS = 864e5;
|
|
14
|
+
function toEpochDays(value) {
|
|
15
|
+
if (value === null || value === void 0) return null;
|
|
16
|
+
if (typeof value === "number") return value;
|
|
17
|
+
if (value instanceof Date) {
|
|
18
|
+
const ms = value.getTime();
|
|
19
|
+
if (Number.isNaN(ms)) throw new TypeError("encodeRowsToParquet: invalid Date for DATE column");
|
|
20
|
+
return Math.floor(ms / EPOCH_DAY_MS);
|
|
21
|
+
}
|
|
22
|
+
if (typeof value === "string") {
|
|
23
|
+
const ms = Date.parse(`${value}T00:00:00Z`);
|
|
24
|
+
if (Number.isNaN(ms)) throw new TypeError(`encodeRowsToParquet: invalid date string '${value}'`);
|
|
25
|
+
return Math.floor(ms / EPOCH_DAY_MS);
|
|
26
|
+
}
|
|
27
|
+
throw new TypeError(`encodeRowsToParquet: unsupported DATE value '${String(value)}'`);
|
|
28
|
+
}
|
|
29
|
+
function isoFromDate(d) {
|
|
30
|
+
return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
|
|
31
|
+
}
|
|
32
|
+
function buildWriteSchema(columns) {
|
|
33
|
+
const schema = [{
|
|
34
|
+
name: "root",
|
|
35
|
+
num_children: columns.length
|
|
36
|
+
}];
|
|
37
|
+
for (const col of columns) {
|
|
38
|
+
const repetition_type = col.nullable ? "OPTIONAL" : "REQUIRED";
|
|
39
|
+
switch (col.type) {
|
|
40
|
+
case "DATE":
|
|
41
|
+
schema.push({
|
|
42
|
+
name: col.name,
|
|
43
|
+
type: "INT32",
|
|
44
|
+
converted_type: "DATE",
|
|
45
|
+
repetition_type
|
|
46
|
+
});
|
|
47
|
+
break;
|
|
48
|
+
case "VARCHAR":
|
|
49
|
+
schema.push({
|
|
50
|
+
name: col.name,
|
|
51
|
+
type: "BYTE_ARRAY",
|
|
52
|
+
converted_type: "UTF8",
|
|
53
|
+
repetition_type
|
|
54
|
+
});
|
|
55
|
+
break;
|
|
56
|
+
case "INTEGER":
|
|
57
|
+
schema.push({
|
|
58
|
+
name: col.name,
|
|
59
|
+
type: "INT32",
|
|
60
|
+
repetition_type
|
|
61
|
+
});
|
|
62
|
+
break;
|
|
63
|
+
case "BIGINT":
|
|
64
|
+
schema.push({
|
|
65
|
+
name: col.name,
|
|
66
|
+
type: "INT64",
|
|
67
|
+
repetition_type
|
|
68
|
+
});
|
|
69
|
+
break;
|
|
70
|
+
case "DOUBLE":
|
|
71
|
+
schema.push({
|
|
72
|
+
name: col.name,
|
|
73
|
+
type: "DOUBLE",
|
|
74
|
+
repetition_type
|
|
75
|
+
});
|
|
76
|
+
break;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
return schema;
|
|
80
|
+
}
|
|
12
81
|
function coerceValue(value, type) {
|
|
13
82
|
if (value === null || value === void 0) return null;
|
|
14
83
|
if (type === "STRING") return typeof value === "string" ? value : String(value);
|
|
@@ -50,65 +119,86 @@ function sortRowsByClusterKey(table, rows) {
|
|
|
50
119
|
});
|
|
51
120
|
return copy;
|
|
52
121
|
}
|
|
122
|
+
function encodeOrderedRows(rows, columns, rowGroupSize) {
|
|
123
|
+
const schema = buildWriteSchema(columns);
|
|
124
|
+
const isDate = columns.map((col) => col.type === "DATE");
|
|
125
|
+
const types = columns.map((col) => basicTypeFor(col.type));
|
|
126
|
+
const columnSpecs = columns.map((col) => ({
|
|
127
|
+
name: col.name,
|
|
128
|
+
nullable: col.nullable,
|
|
129
|
+
columnIndex: true
|
|
130
|
+
}));
|
|
131
|
+
function* coercedRows() {
|
|
132
|
+
for (const r of rows) {
|
|
133
|
+
const out = {};
|
|
134
|
+
for (let c = 0; c < columns.length; c++) {
|
|
135
|
+
const name = columns[c].name;
|
|
136
|
+
out[name] = isDate[c] ? toEpochDays(r[name]) : coerceValue(r[name], types[c]);
|
|
137
|
+
}
|
|
138
|
+
yield out;
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
const writer = new ByteWriter();
|
|
142
|
+
parquetWriteRows({
|
|
143
|
+
writer,
|
|
144
|
+
rows: coercedRows(),
|
|
145
|
+
columns: columnSpecs,
|
|
146
|
+
schema,
|
|
147
|
+
rowGroupSize
|
|
148
|
+
});
|
|
149
|
+
return new Uint8Array(writer.getBuffer());
|
|
150
|
+
}
|
|
53
151
|
function encodeRowsToParquet(table, rows) {
|
|
54
152
|
const schema = SCHEMAS[table];
|
|
55
|
-
|
|
56
|
-
const buffer = parquetWriteBuffer({
|
|
57
|
-
columnData: schema.columns.map((col) => {
|
|
58
|
-
const type = basicTypeFor(col.type);
|
|
59
|
-
const data = sorted.map((r) => coerceValue(r[col.name], type));
|
|
60
|
-
return {
|
|
61
|
-
name: col.name,
|
|
62
|
-
data,
|
|
63
|
-
type,
|
|
64
|
-
nullable: col.nullable,
|
|
65
|
-
columnIndex: true
|
|
66
|
-
};
|
|
67
|
-
}),
|
|
68
|
-
rowGroupSize: ROW_GROUP_SIZE
|
|
69
|
-
});
|
|
70
|
-
return new Uint8Array(buffer);
|
|
153
|
+
return encodeOrderedRows(sortRowsByClusterKey(table, rows), schema.columns, ROW_GROUP_SIZE);
|
|
71
154
|
}
|
|
72
155
|
function encodeRowsToParquetFlex(rows, opts) {
|
|
73
156
|
const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;
|
|
74
|
-
|
|
157
|
+
return encodeOrderedRows(sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
|
|
75
158
|
for (const col of sortKey) {
|
|
76
159
|
const cmp = compareValues(a[col], b[col]);
|
|
77
160
|
if (cmp !== 0) return cmp;
|
|
78
161
|
}
|
|
79
162
|
return 0;
|
|
80
|
-
});
|
|
81
|
-
const buffer = parquetWriteBuffer({
|
|
82
|
-
columnData: columns.map((col) => {
|
|
83
|
-
const type = basicTypeFor(col.type);
|
|
84
|
-
const data = sorted.map((r) => coerceValue(r[col.name], type));
|
|
85
|
-
return {
|
|
86
|
-
name: col.name,
|
|
87
|
-
data,
|
|
88
|
-
type,
|
|
89
|
-
nullable: col.nullable,
|
|
90
|
-
columnIndex: true
|
|
91
|
-
};
|
|
92
|
-
}),
|
|
93
|
-
rowGroupSize
|
|
94
|
-
});
|
|
95
|
-
return new Uint8Array(buffer);
|
|
163
|
+
}), columns, rowGroupSize);
|
|
96
164
|
}
|
|
97
165
|
function asyncBufferFromBytes(bytes) {
|
|
98
|
-
const
|
|
166
|
+
const base = bytes.byteOffset;
|
|
167
|
+
const buf = bytes.buffer;
|
|
99
168
|
return {
|
|
100
|
-
byteLength:
|
|
169
|
+
byteLength: bytes.byteLength,
|
|
101
170
|
slice(start, end) {
|
|
102
|
-
|
|
171
|
+
const from = base + start;
|
|
172
|
+
const to = end === void 0 ? base + bytes.byteLength : base + end;
|
|
173
|
+
return buf.slice(from, to);
|
|
103
174
|
}
|
|
104
175
|
};
|
|
105
176
|
}
|
|
106
177
|
async function decodeParquetToRows(bytes, opts = {}) {
|
|
107
178
|
if (bytes.byteLength === 0) return [];
|
|
108
|
-
return await parquetReadObjects({
|
|
179
|
+
return normalizeDecodedDates(await parquetReadObjects({
|
|
109
180
|
file: asyncBufferFromBytes(bytes),
|
|
110
|
-
...opts.
|
|
111
|
-
|
|
181
|
+
...opts.columns ? { columns: [...opts.columns] } : {},
|
|
182
|
+
...opts.filter ? {
|
|
183
|
+
filter: opts.filter,
|
|
184
|
+
useBloomFilters: true
|
|
185
|
+
} : {}
|
|
186
|
+
}));
|
|
187
|
+
}
|
|
188
|
+
function normalizeDecodedDates(rows) {
|
|
189
|
+
if (rows.length === 0) return rows;
|
|
190
|
+
const dateCols = [];
|
|
191
|
+
const first = rows[0];
|
|
192
|
+
for (const k in first) if (first[k] instanceof Date) dateCols.push(k);
|
|
193
|
+
if (dateCols.length === 0) return rows;
|
|
194
|
+
for (const row of rows) {
|
|
195
|
+
const r = row;
|
|
196
|
+
for (const k of dateCols) {
|
|
197
|
+
const v = r[k];
|
|
198
|
+
if (v instanceof Date) r[k] = isoFromDate(v);
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
return rows;
|
|
112
202
|
}
|
|
113
203
|
function createHyparquetCodec(options = {}) {
|
|
114
204
|
return {
|
|
@@ -135,7 +225,7 @@ function createHyparquetCodec(options = {}) {
|
|
|
135
225
|
const allRows = [];
|
|
136
226
|
for (const key of inputKeys) {
|
|
137
227
|
const rows = await decodeParquetToRows(await dataSource.read(key));
|
|
138
|
-
allRows.push(
|
|
228
|
+
for (let i = 0; i < rows.length; i++) allRows.push(rows[i]);
|
|
139
229
|
}
|
|
140
230
|
const rows = dedupeByNaturalKey(ctx.table, allRows);
|
|
141
231
|
const bytes = encodeRowsToParquet(ctx.table, rows);
|
package/dist/adapters/node.mjs
CHANGED
|
@@ -74,7 +74,7 @@ function snapshotAlias(fileName) {
|
|
|
74
74
|
if (!m?.[1]) throw new TypeError(`snapshotAlias: unrecognised filename ${JSON.stringify(fileName)}`);
|
|
75
75
|
return `cold_${m[1].replace("-", "_")}`;
|
|
76
76
|
}
|
|
77
|
-
const SNAPSHOT_TYPE_ERROR_KINDS = new Set([
|
|
77
|
+
const SNAPSHOT_TYPE_ERROR_KINDS = /* @__PURE__ */ new Set([
|
|
78
78
|
"invalid-snapshot-filename",
|
|
79
79
|
"unsupported-snapshot-index-version",
|
|
80
80
|
"invalid-schema-identifier",
|
package/dist/errors.mjs
CHANGED
package/dist/iceberg/index.d.mts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
1
|
+
import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
2
2
|
import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
|
|
3
3
|
type IcebergAppendSink = Sink;
|
|
4
4
|
/**
|
|
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
|
|
|
10
10
|
* with no rows never touches the network.
|
|
11
11
|
*/
|
|
12
12
|
declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
|
|
13
|
-
export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
|
13
|
+
export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
package/dist/iceberg/index.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { engineErrors } from "../errors.mjs";
|
|
2
|
-
import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
|
|
2
|
+
import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
|
|
3
3
|
import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
|
|
4
4
|
import { err, ok } from "gscdump/result";
|
|
5
5
|
async function cacheGet(cache, key, now) {
|
|
@@ -35,21 +35,25 @@ function decodeInt(bytes) {
|
|
|
35
35
|
if (u == null) return null;
|
|
36
36
|
return new DataView(u.buffer, u.byteOffset, u.byteLength).getInt32(0, true);
|
|
37
37
|
}
|
|
38
|
-
function buildPartitionFilter(siteId, searchType, wantedMonths) {
|
|
38
|
+
function buildPartitionFilter(siteId, searchType, wantedMonths, encoding = "string") {
|
|
39
39
|
return (partitions) => {
|
|
40
40
|
const parts = partitions;
|
|
41
41
|
if (!parts || parts.length === 0) return true;
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const
|
|
45
|
-
const
|
|
46
|
-
if (
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
const
|
|
52
|
-
if (
|
|
42
|
+
if (encoding === "string") {
|
|
43
|
+
const siteStr = String(siteId);
|
|
44
|
+
const searchStr = String(searchType);
|
|
45
|
+
const siteSummary = parts[SITE_ID_FIELD_INDEX];
|
|
46
|
+
if (siteSummary && (siteSummary.lower_bound != null || siteSummary.upper_bound != null)) {
|
|
47
|
+
const lo = decodeString(siteSummary.lower_bound);
|
|
48
|
+
const hi = decodeString(siteSummary.upper_bound);
|
|
49
|
+
if (lo != null && hi != null && (siteStr < lo || siteStr > hi)) return false;
|
|
50
|
+
}
|
|
51
|
+
const searchTypeSummary = parts[SEARCH_TYPE_FIELD_INDEX];
|
|
52
|
+
if (searchTypeSummary && (searchTypeSummary.lower_bound != null || searchTypeSummary.upper_bound != null)) {
|
|
53
|
+
const lo = decodeString(searchTypeSummary.lower_bound);
|
|
54
|
+
const hi = decodeString(searchTypeSummary.upper_bound);
|
|
55
|
+
if (lo != null && hi != null && (searchStr < lo || searchStr > hi)) return false;
|
|
56
|
+
}
|
|
53
57
|
}
|
|
54
58
|
const monthSummary = parts[DATE_MONTH_FIELD_INDEX];
|
|
55
59
|
if (monthSummary && (monthSummary.lower_bound != null || monthSummary.upper_bound != null)) {
|
|
@@ -74,11 +78,11 @@ const ICEBERG_TYPE_MAP = {
|
|
|
74
78
|
DOUBLE: "double",
|
|
75
79
|
DATE: "date"
|
|
76
80
|
};
|
|
77
|
-
function icebergSchemaFor(table) {
|
|
81
|
+
function icebergSchemaFor(table, encoding = "string") {
|
|
78
82
|
return {
|
|
79
83
|
"type": "struct",
|
|
80
84
|
"schema-id": 0,
|
|
81
|
-
"fields":
|
|
85
|
+
"fields": icebergSchemasFor(encoding)[table].columns.map((col) => ({
|
|
82
86
|
id: col.fieldId,
|
|
83
87
|
name: col.name,
|
|
84
88
|
required: col.required,
|
|
@@ -86,8 +90,8 @@ function icebergSchemaFor(table) {
|
|
|
86
90
|
}))
|
|
87
91
|
};
|
|
88
92
|
}
|
|
89
|
-
function icebergPartitionSpecFor(table) {
|
|
90
|
-
const fields =
|
|
93
|
+
function icebergPartitionSpecFor(table, encoding = "string") {
|
|
94
|
+
const fields = icebergSchemasFor(encoding)[table].columns;
|
|
91
95
|
const fieldId = (name) => {
|
|
92
96
|
const col = fields.find((c) => c.name === name);
|
|
93
97
|
if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
|
|
@@ -176,14 +180,14 @@ async function icebergAppendRetrying(args, options = {}) {
|
|
|
176
180
|
async function ensureIcebergNamespace(conn) {
|
|
177
181
|
await restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace }).catch(() => {});
|
|
178
182
|
}
|
|
179
|
-
async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
|
|
183
|
+
async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "string") {
|
|
180
184
|
const results = [];
|
|
181
185
|
for (const table of tables) await icebergCreateTable({
|
|
182
186
|
catalog: conn.catalog,
|
|
183
187
|
namespace: conn.namespace,
|
|
184
188
|
table,
|
|
185
|
-
schema: icebergSchemaFor(table),
|
|
186
|
-
partitionSpec: icebergPartitionSpecFor(table)
|
|
189
|
+
schema: icebergSchemaFor(table, encoding),
|
|
190
|
+
partitionSpec: icebergPartitionSpecFor(table, encoding)
|
|
187
191
|
}).then(() => results.push({
|
|
188
192
|
table,
|
|
189
193
|
outcome: ok(void 0)
|
|
@@ -275,20 +279,22 @@ async function listIcebergDataFiles(conn, opts) {
|
|
|
275
279
|
if (snapshotId == null || !metadata) return [];
|
|
276
280
|
}
|
|
277
281
|
const endWalk = profiler?.start("iceberg.walk");
|
|
278
|
-
const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths);
|
|
282
|
+
const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths, opts.encoding ?? "string");
|
|
279
283
|
const manifests = await icebergManifests({
|
|
280
284
|
metadata,
|
|
281
285
|
resolver: conn.resolver,
|
|
282
286
|
partitionFilter
|
|
283
287
|
});
|
|
288
|
+
const wantSite = String(opts.siteId);
|
|
289
|
+
const wantSearch = String(opts.searchType);
|
|
284
290
|
const out = [];
|
|
285
291
|
for (const m of manifests) for (const entry of m.entries) {
|
|
286
292
|
if (entry.status === 2) continue;
|
|
287
293
|
const df = entry.data_file;
|
|
288
294
|
if (df.content !== 0) continue;
|
|
289
295
|
const part = df.partition;
|
|
290
|
-
if (part.site_id !==
|
|
291
|
-
if (part.search_type !==
|
|
296
|
+
if (String(part.site_id) !== wantSite) continue;
|
|
297
|
+
if (String(part.search_type) !== wantSearch) continue;
|
|
292
298
|
const month = part.date_month;
|
|
293
299
|
if (typeof month !== "number" || !wantedMonths.has(month)) continue;
|
|
294
300
|
out.push({
|
|
@@ -326,6 +332,8 @@ async function dropIcebergTables(conn, tables) {
|
|
|
326
332
|
return results;
|
|
327
333
|
}
|
|
328
334
|
const DAY_MILLIS = 864e5;
|
|
335
|
+
const INT32_MIN = -2147483648;
|
|
336
|
+
const INT32_MAX = 2147483647;
|
|
329
337
|
function toIcebergDate(value) {
|
|
330
338
|
if (typeof value === "string") {
|
|
331
339
|
const ms = Date.parse(`${value}T00:00:00Z`);
|
|
@@ -343,6 +351,14 @@ function coerceJsonSafe(value) {
|
|
|
343
351
|
if (typeof value === "bigint") return Number(value);
|
|
344
352
|
return value;
|
|
345
353
|
}
|
|
354
|
+
/**
 * Validate and normalise a site id for the `"int"` partition encoding.
 *
 * Accepts a number, a bigint, or a decimal string and returns the equivalent
 * JS number, guaranteed to be an integer within `INT32_MIN`..`INT32_MAX`.
 *
 * String input must be the canonical decimal spelling of the integer. Forms
 * that `Number()` would silently reinterpret (`"0x10"`, `"1e3"`, `" 42 "`,
 * `"042"`, `"+5"`, `"-0"`, `"1.0"`) are rejected: `listIcebergDataFiles` in
 * this module selects files by comparing `String(part.site_id)` with
 * `String(opts.siteId)`, so (presumably, given the same id is used on both
 * paths) a non-canonical id would be written under a partition value that the
 * very same id could never match on read.
 *
 * @param value Raw site id, as found on `slice.ctx.siteId`.
 * @returns The site id as a number that fits a signed 32-bit integer.
 * @throws {TypeError} If the value is null/undefined/blank, is not a string,
 *   number or bigint, is not a safe integer, is a non-canonical numeric
 *   string, or falls outside the signed 32-bit range.
 */
function toIntPartitionSiteId(value) {
  if (value == null || (typeof value === "string" && value.trim() === "")) {
    throw new TypeError("toRecords: slice.ctx.siteId is required for int partition encoding");
  }
  const notSafeInteger = () => new TypeError(`toRecords: int partition site_id must be a safe integer, got '${String(value)}'`);
  if (typeof value !== "string" && typeof value !== "number" && typeof value !== "bigint") throw notSafeInteger();
  const siteId = Number(value);
  if (!Number.isSafeInteger(siteId)) throw notSafeInteger();
  // Round-trip check: a string is accepted only if it is exactly what the
  // parsed integer prints as, which rules out hex, exponent, padded, signed
  // and fractional spellings that Number() tolerates.
  if (typeof value === "string" && String(siteId) !== value) {
    throw new TypeError(`toRecords: int partition site_id must be a canonical decimal integer, got '${value}'`);
  }
  if (siteId < INT32_MIN || siteId > INT32_MAX) {
    throw new TypeError(`toRecords: int partition site_id must fit Iceberg INT, got '${String(value)}'`);
  }
  return siteId;
}
|
|
346
362
|
function dedupeByIdentity(table, records) {
|
|
347
363
|
if (records.length < 2) return records;
|
|
348
364
|
const key = ICEBERG_SCHEMAS[table].identityColumns;
|
|
@@ -353,19 +369,21 @@ function dedupeByIdentity(table, records) {
|
|
|
353
369
|
}
|
|
354
370
|
return seen.size === records.length ? records : [...seen.values()];
|
|
355
371
|
}
|
|
356
|
-
function toRecords(slice, rows) {
|
|
357
|
-
const
|
|
372
|
+
/**
 * Turn emitted rows into records ready to append, stamping the partition
 * columns from the slice onto every row.
 *
 * Each row is shallow-copied with every value passed through
 * `coerceJsonSafe`, `date` is converted with `toIcebergDate`, and `site_id` /
 * `search_type` are overwritten with the slice's values.
 *
 * @param slice Slice descriptor; reads `slice.ctx.siteId` and `slice.searchType`.
 * @param rows Rows to convert; the input objects are not mutated.
 * @param encoding `"int"` stores `site_id` as a validated 32-bit integer and
 *   `search_type` as its `SEARCH_TYPE_INT` code; any other value stores both
 *   as given (a missing site id becomes `""`).
 * @returns One new record per input row, in the same order.
 * @throws {TypeError} With `"int"` encoding, when the site id is rejected by
 *   `toIntPartitionSiteId` or `slice.searchType` has no `SEARCH_TYPE_INT` entry.
 */
function toRecords(slice, rows, encoding) {
  const useInt = encoding === "int";
  const siteVal = useInt ? toIntPartitionSiteId(slice.ctx.siteId) : (slice.ctx.siteId ?? "");
  const searchVal = useInt ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
  // An unmapped search type would otherwise be written as `undefined` on every
  // record; fail loudly, as is already done for an invalid site id.
  if (useInt && searchVal === undefined) {
    throw new TypeError(`toRecords: no int partition code for search_type '${String(slice.searchType)}'`);
  }
  return rows.map((row) => {
    const out = {};
    for (const k in row) out[k] = coerceJsonSafe(row[k]);
    out.date = toIcebergDate(out.date);
    out.site_id = siteVal;
    out.search_type = searchVal;
    return out;
  });
}
|
|
367
384
|
function createIcebergAppendSink(options) {
|
|
368
385
|
let connection;
|
|
386
|
+
const encoding = options.encoding ?? "string";
|
|
369
387
|
const buffers = /* @__PURE__ */ new Map();
|
|
370
388
|
function connect() {
|
|
371
389
|
connection ??= connectIcebergCatalog(options.catalog);
|
|
@@ -375,7 +393,7 @@ function createIcebergAppendSink(options) {
|
|
|
375
393
|
capabilities: { appendOnly: true },
|
|
376
394
|
async emit(slice, rows) {
|
|
377
395
|
if (rows.length === 0) return { rowCount: 0 };
|
|
378
|
-
const records = toRecords(slice, rows);
|
|
396
|
+
const records = toRecords(slice, rows, encoding);
|
|
379
397
|
const buffer = buffers.get(slice.table);
|
|
380
398
|
if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
|
|
381
399
|
else buffers.set(slice.table, records);
|
|
@@ -429,4 +447,4 @@ function createIcebergAppendSink(options) {
|
|
|
429
447
|
}
|
|
430
448
|
};
|
|
431
449
|
}
|
|
432
|
-
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
|
450
|
+
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
package/dist/ingest.mjs
CHANGED
|
@@ -26,7 +26,7 @@ function toPath(gscUrl) {
|
|
|
26
26
|
}
|
|
27
27
|
}
|
|
28
28
|
/**
 * Compute the stored "sum position" value for a row: the zero-based position
 * multiplied by an impression weight of at least 1.
 *
 * Both operands are clamped to a floor of 1 with `x >= 1 ? x : 1`. Unlike
 * `Math.max(x, 1)`, that form also maps `NaN` / `undefined` to 1, so a
 * malformed position or impression count cannot turn the result into `NaN`.
 *
 * @param apiPosition Position as reported by the API; values below 1 or
 *   non-numeric values count as 1 (contributing 0).
 * @param impressions Impression count used as the weight; values below 1 or
 *   non-numeric values count as 1.
 * @returns `(position - 1) * weight`.
 */
function toSumPosition(apiPosition, impressions) {
  const position = apiPosition >= 1 ? apiPosition : 1;
  const weight = impressions >= 1 ? impressions : 1;
  return (position - 1) * weight;
}
|
|
31
31
|
function transformGscRow(table, apiRow, options = {}) {
|
|
32
32
|
const keys = apiRow.keys;
|
|
@@ -76,8 +76,10 @@ function transformGscRow(table, apiRow, options = {}) {
|
|
|
76
76
|
};
|
|
77
77
|
}
|
|
78
78
|
if (table === "hourly_pages") {
|
|
79
|
-
const
|
|
80
|
-
const date =
|
|
79
|
+
const hourStamp = String(keys[0] ?? "");
|
|
80
|
+
const date = hourStamp.slice(0, 10);
|
|
81
|
+
const hour = Number.parseInt(hourStamp.slice(11, 13), 10);
|
|
82
|
+
if (!Number.isInteger(hour) || hour < 0 || hour > 23) throw new Error(`hourly_pages: cannot derive hour-of-day from '${hourStamp}'`);
|
|
81
83
|
return {
|
|
82
84
|
date,
|
|
83
85
|
row: {
|
package/dist/rollups.mjs
CHANGED
|
@@ -560,7 +560,7 @@ const indexingMetadataRollup = {
|
|
|
560
560
|
if (!latestRemove || r.latestRemoveAt > latestRemove) latestRemove = r.latestRemoveAt;
|
|
561
561
|
}
|
|
562
562
|
}
|
|
563
|
-
const days = new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
|
|
563
|
+
const days = /* @__PURE__ */ new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
|
|
564
564
|
const perDay = Array.from(days).sort().map((day) => ({
|
|
565
565
|
day,
|
|
566
566
|
updates: updatesByDay.get(day) ?? 0,
|
package/dist/schema.d.mts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
|
|
2
|
-
export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
|
1
|
+
import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
|
|
2
|
+
export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
package/dist/schema.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
|
|
2
|
-
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
|
1
|
+
import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
|
|
2
|
+
export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
package/dist/sql-fragments.d.mts
CHANGED
|
@@ -18,4 +18,27 @@ declare const METRIC_EXPR: Record<Metric, string>;
|
|
|
18
18
|
* on the resolved column expression so drizzle can pass a column ref.
|
|
19
19
|
*/
|
|
20
20
|
declare function topLevelPagePredicateSql(pathExpr: string): string;
|
|
21
|
-
|
|
21
|
+
/**
|
|
22
|
+
* How a canonicalized date column is emitted by {@link dateReplaceClause}:
|
|
23
|
+
* - `'date'` keeps a real `DATE` value (`CAST(col AS DATE)`). Right for views
|
|
24
|
+
* and `.duckdb` exports the app re-queries, where the column type matters.
|
|
25
|
+
* - `'string'` emits an ISO `YYYY-MM-DD` string (`strftime(CAST(col AS DATE)…)`).
|
|
26
|
+
* Right for row materialisation to JSON/CSV/NDJSON, where a `DATE` would
|
|
27
|
+
* serialize as an opaque object / epoch.
|
|
28
|
+
*/
|
|
29
|
+
type DateCanonicalForm = 'date' | 'string';
|
|
30
|
+
/**
|
|
31
|
+
* Build a `read_parquet` `REPLACE (…)` clause that canonicalizes legacy `date`
|
|
32
|
+
* columns. `date` lands as VARCHAR in older parquets (BYTE_ARRAY/UTF8, written
|
|
33
|
+
* before the schema enforced DATE); DuckDB infers the column type from the file,
|
|
34
|
+
* so without this every read path would expose VARCHAR despite SCHEMAS declaring
|
|
35
|
+
* DATE. The `CAST(col AS DATE)` is a no-op for already-DATE columns and
|
|
36
|
+
* vectorized parsing for VARCHAR ones, so output stays canonical either way.
|
|
37
|
+
*
|
|
38
|
+
* Pure: the caller passes the table's DATE column names (derived from `SCHEMAS`)
|
|
39
|
+
* so this fragment carries no schema/drizzle dependency. Returns `''` when the
|
|
40
|
+
* table has no DATE columns, so callers can interpolate it unconditionally:
|
|
41
|
+
* `SELECT * ${dateReplaceClause(cols)} FROM read_parquet(…)`.
|
|
42
|
+
*/
|
|
43
|
+
declare function dateReplaceClause(dateColumns: readonly string[], form?: DateCanonicalForm): string;
|
|
44
|
+
export { DateCanonicalForm, METRIC_EXPR, dateReplaceClause, escapeLike, topLevelPagePredicateSql };
|
package/dist/sql-fragments.mjs
CHANGED
|
@@ -10,4 +10,9 @@ const METRIC_EXPR = {
|
|
|
10
10
|
/**
 * Build a SQL boolean expression that holds when the value of `pathExpr`
 * contains at most one '/' character: it compares the string's length with its
 * length after all '/' are removed. `pathExpr` is interpolated verbatim
 * (twice) into the result, so it must already be a valid SQL expression.
 */
function topLevelPagePredicateSql(pathExpr) {
|
|
11
11
|
return `LENGTH(${pathExpr}) - LENGTH(REPLACE(${pathExpr}, '/', '')) <= 1`;
|
|
12
12
|
}
|
|
13
|
-
|
|
13
|
+
/**
 * Build a `REPLACE (…)` clause that re-emits each listed column cast to DATE.
 *
 * Returns `""` for an empty column list. With `form === "date"` each column
 * becomes `CAST(col AS DATE) AS col`; with any other value (the default is
 * `"string"`) it becomes `strftime(CAST(col AS DATE), '%Y-%m-%d') AS col`.
 * Column names are interpolated verbatim, without quoting or escaping.
 */
function dateReplaceClause(dateColumns, form = "string") {
|
|
14
|
+
if (dateColumns.length === 0) return "";
|
|
15
|
+
const cast = (n) => form === "date" ? `CAST(${n} AS DATE) AS ${n}` : `strftime(CAST(${n} AS DATE), '%Y-%m-%d') AS ${n}`;
|
|
16
|
+
return `REPLACE (${dateColumns.map(cast).join(", ")})`;
|
|
17
|
+
}
|
|
18
|
+
export { METRIC_EXPR, dateReplaceClause, escapeLike, topLevelPagePredicateSql };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.29.0",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -172,8 +172,8 @@
|
|
|
172
172
|
},
|
|
173
173
|
"peerDependencies": {
|
|
174
174
|
"@duckdb/duckdb-wasm": "^1.32.0",
|
|
175
|
-
"hyparquet": "^1.26.
|
|
176
|
-
"hyparquet-writer": "^0.
|
|
175
|
+
"hyparquet": "^1.26.1",
|
|
176
|
+
"hyparquet-writer": "^0.16.1"
|
|
177
177
|
},
|
|
178
178
|
"peerDependenciesMeta": {
|
|
179
179
|
"@duckdb/duckdb-wasm": {
|
|
@@ -188,11 +188,11 @@
|
|
|
188
188
|
},
|
|
189
189
|
"dependencies": {
|
|
190
190
|
"drizzle-orm": "1.0.0-rc.3",
|
|
191
|
-
"hyparquet": "^1.26.
|
|
192
|
-
"hyparquet-writer": "^0.
|
|
191
|
+
"hyparquet": "^1.26.1",
|
|
192
|
+
"hyparquet-writer": "^0.16.1",
|
|
193
193
|
"proper-lockfile": "^4.1.2",
|
|
194
|
-
"@gscdump/contracts": "0.
|
|
195
|
-
"gscdump": "0.
|
|
194
|
+
"@gscdump/contracts": "0.29.0",
|
|
195
|
+
"gscdump": "0.29.0"
|
|
196
196
|
},
|
|
197
197
|
"devDependencies": {
|
|
198
198
|
"@duckdb/duckdb-wasm": "^1.32.0",
|