@gscdump/engine 0.28.3 → 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,15 @@ const ICEBERG_TABLES = [
10
10
  "search_appearance_queries",
11
11
  "search_appearance_page_queries"
12
12
  ];
13
/**
 * Stable `search_type` name → int code map used by `'int'`-encoded catalogs.
 *
 * The codes are the values stored in the `search_type` partition column, so an
 * id must never be renumbered or reused. The object is frozen so that an
 * accidental runtime write cannot silently change the mapping.
 */
const SEARCH_TYPE_INT = Object.freeze({
  web: 1,
  image: 2,
  video: 3,
  news: 4,
  discover: 5,
  googleNews: 6
});
/**
 * Reverse of `SEARCH_TYPE_INT`: int code → `search_type` name, for mapping
 * read results back. Keys are stored as strings (object keys always are), so
 * both `INT_SEARCH_TYPE[1]` and `INT_SEARCH_TYPE["1"]` resolve to `"web"`.
 */
const INT_SEARCH_TYPE = Object.freeze(
  Object.fromEntries(Object.entries(SEARCH_TYPE_INT).map(([name, code]) => [code, name]))
);
13
22
  const ICEBERG_PARTITION_COLUMNS = [{
14
23
  name: "site_id",
15
24
  type: "STRING",
@@ -21,6 +30,20 @@ const ICEBERG_PARTITION_COLUMNS = [{
21
30
  required: true,
22
31
  fieldId: 2
23
32
  }];
33
+ function icebergPartitionColumns(encoding = "string") {
34
+ if (encoding === "string") return ICEBERG_PARTITION_COLUMNS;
35
+ return [{
36
+ name: "site_id",
37
+ type: "INT",
38
+ required: true,
39
+ fieldId: 1
40
+ }, {
41
+ name: "search_type",
42
+ type: "INT",
43
+ required: true,
44
+ fieldId: 2
45
+ }];
46
+ }
24
47
  const ICEBERG_FIELD_ID_BASE = 3;
25
48
  const ICEBERG_PARTITION_SPEC = [
26
49
  {
@@ -48,7 +71,7 @@ function mapColumnType(t) {
48
71
  case "DATE": return "DATE";
49
72
  }
50
73
  }
51
- function icebergTableSpec(table) {
74
+ function icebergTableSpec(table, encoding = "string") {
52
75
  const base = SCHEMAS[table];
53
76
  const dataColumns = base.columns.map((col, i) => ({
54
77
  name: col.name,
@@ -58,7 +81,7 @@ function icebergTableSpec(table) {
58
81
  }));
59
82
  return {
60
83
  table,
61
- columns: [...ICEBERG_PARTITION_COLUMNS, ...dataColumns],
84
+ columns: [...icebergPartitionColumns(encoding), ...dataColumns],
62
85
  partitionSpec: ICEBERG_PARTITION_SPEC,
63
86
  identityColumns: [
64
87
  "site_id",
@@ -68,6 +91,10 @@ function icebergTableSpec(table) {
68
91
  };
69
92
  }
70
93
  const ICEBERG_SCHEMAS = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t)]));
94
+ const ICEBERG_SCHEMAS_INT = Object.fromEntries(ICEBERG_TABLES.map((t) => [t, icebergTableSpec(t, "int")]));
95
+ function icebergSchemasFor(encoding = "string") {
96
+ return encoding === "int" ? ICEBERG_SCHEMAS_INT : ICEBERG_SCHEMAS;
97
+ }
71
98
  const ICEBERG_TABLE_SET = new Set(ICEBERG_TABLES);
72
99
  function isIcebergTable(table) {
73
100
  return ICEBERG_TABLE_SET.has(table);
@@ -76,4 +103,4 @@ function assertIcebergTable(table) {
76
103
  if (!isIcebergTable(table)) throw new Error(`Unknown Iceberg table '${table}'. Expected one of: ${ICEBERG_TABLES.join(", ")}`);
77
104
  return table;
78
105
  }
79
- export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable };
106
+ export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable };
@@ -50,6 +50,33 @@ interface IcebergColumn {
50
50
  */
51
51
  fieldId: number;
52
52
  }
53
+ /**
54
+ * Partition-key encoding for the two identity columns (`site_id`, `search_type`).
55
+ *
56
+ * - `'string'` (default, legacy): both columns are STRING. Correct, but R2 SQL's
57
+ * string min/max statistics are truncated in predicate pushdown, so a bare
58
+ * `WHERE site_id='<uuid>'` UNDERCOUNTS — callers must CONCAT(col,'') to stay
59
+ * correct, which defeats partition pruning.
60
+ * - `'int'`: BOTH `site_id` and `search_type` are INT. Integer statistics are
61
+ * fixed-width and never truncated, so `WHERE site_id=<n>` is both correct AND
62
+ * prunes (empirically confirmed 2026-06-19, gscdump.com probe-int64-partition;
63
+ * INT equality proven via the search_type column in the engine e2e canary). A
64
+ * small INT site_id is ample (≪ 2.1B sites) — no LONG/BigInt needed. The caller
65
+ * maps the UUID `site_id` ↔ int (app-owned, per-tenant serial) and uses
66
+ * {@link SEARCH_TYPE_INT} for `search_type` (engine-owned, fixed enum).
67
+ *
68
+ * New per-team catalogs are provisioned `'int'`; existing catalogs stay
69
+ * `'string'`. Purely additive: `'string'` is the default everywhere so existing
70
+ * tables, writers, and readers are unchanged.
71
+ */
72
+ type PartitionKeyEncoding = 'string' | 'int';
73
+ /**
74
+ * Stable `search_type` enum → int map for `'int'`-encoded catalogs. Engine-owned
75
+ * and FROZEN: never renumber or reuse an id (it's the on-disk partition value).
76
+ */
77
+ declare const SEARCH_TYPE_INT: Record<SearchType, number>;
78
+ /** Reverse of {@link SEARCH_TYPE_INT} — int → `search_type`, for read-result mapping. */
79
+ declare const INT_SEARCH_TYPE: Record<number, SearchType>;
53
80
  /** Iceberg partition transform applied to a source column. */
54
81
  type IcebergPartitionTransform = 'identity' | 'month';
55
82
  interface IcebergPartitionField {
@@ -82,6 +109,15 @@ interface IcebergTableSpec {
82
109
  * contiguously from id 3 (see `ICEBERG_FIELD_ID_BASE`).
83
110
  */
84
111
  declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
112
+ /**
113
+ * The two partition-identity columns for a given {@link PartitionKeyEncoding}.
114
+ * `'string'` returns {@link ICEBERG_PARTITION_COLUMNS} verbatim; `'int'` swaps
115
+ * BOTH to INT — `site_id` (the app's small `user_sites.int_id`; ≪ 2.1B sites, so
116
+ * INT is ample) and `search_type` (its fixed enum code). Integer identity columns
117
+ * avoid R2 SQL's truncated-string-stats equality undercount and restore pruning.
118
+ * Field ids are unchanged (1, 2) — only the column types differ.
119
+ */
120
+ declare function icebergPartitionColumns(encoding?: PartitionKeyEncoding): readonly IcebergColumn[];
85
121
  /**
86
122
  * First field id used for per-table (non-partition) columns — immediately
87
123
  * after the two partition-identity columns (`site_id`=1, `search_type`=2).
@@ -105,9 +141,13 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
105
141
  * CONTRACT NOTE: implementation agents must treat the RETURNED VALUE as the
106
142
  * source of truth — do not hand-list columns elsewhere.
107
143
  */
108
- declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
109
- /** All Iceberg table specs, keyed by table name. */
144
+ declare function icebergTableSpec(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergTableSpec;
145
+ /** All Iceberg table specs (legacy `'string'` encoding), keyed by table name. */
110
146
  declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
147
+ /** All Iceberg table specs in `'int'` encoding (INT site_id + INT search_type). */
148
+ declare const ICEBERG_SCHEMAS_INT: Record<IcebergTableName, IcebergTableSpec>;
149
+ /** Table specs for the given encoding (`'string'` default). */
150
+ declare function icebergSchemasFor(encoding?: PartitionKeyEncoding): Record<IcebergTableName, IcebergTableSpec>;
111
151
  /** True when `table` is one of the canonical {@link ICEBERG_TABLES}. */
112
152
  declare function isIcebergTable(table: string): table is IcebergTableName;
113
153
  /**
@@ -172,14 +212,14 @@ interface IcebergConnection {
172
212
  * `ICEBERG_SCHEMAS` contract. Field ids are advisory — R2 Data Catalog
173
213
  * re-assigns them on `createTable` (see `ICEBERG_FIELD_ID_BASE`).
174
214
  */
175
- declare function icebergSchemaFor(table: IcebergTableName): IcebergSchema;
215
+ declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSchema;
176
216
  /**
177
217
  * Build the icebird `PartitionSpec` for one of the 5 fact tables: the locked
178
218
  * spec `identity(site_id) + identity(search_type) + month(date)`. Each
179
219
  * partition field's `source-id` is resolved to the real column field id from
180
220
  * {@link icebergSchemaFor}.
181
221
  */
182
- declare function icebergPartitionSpecFor(table: IcebergTableName): IcebergPartitionSpec;
222
+ declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
183
223
  /** Options for {@link connectIcebergCatalog}. */
184
224
  interface ConnectIcebergOptions {
185
225
  /**
@@ -264,7 +304,7 @@ declare function ensureIcebergNamespace(conn: IcebergConnection): Promise<void>;
264
304
  * than thrown so a partial run is observable; "table already exists" surfaces
265
305
  * as a failed result. Used by the app's one-off provisioning script.
266
306
  */
267
- declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[]): Promise<IcebergTableOpResult[]>;
307
+ declare function createIcebergTables(conn: IcebergConnection, tables?: readonly IcebergTableName[], encoding?: PartitionKeyEncoding): Promise<IcebergTableOpResult[]>;
268
308
  /**
269
309
  * List the table names currently in the catalog namespace.
270
310
  *
@@ -284,10 +324,16 @@ interface IcebergListedDataFile {
284
324
  }
285
325
  interface ListIcebergDataFilesOptions {
286
326
  table: IcebergTableName;
287
- /** Partition identity column. */
288
- siteId: string;
289
- /** Partition identity column. */
290
- searchType: string;
327
+ /** Partition identity column. `number` for `'int'`-encoded catalogs. */
328
+ siteId: string | number;
329
+ /** Partition identity column. `number` (int code) for `'int'`-encoded catalogs. */
330
+ searchType: string | number;
331
+ /**
332
+ * Partition-key encoding of the catalog. `'int'` changes how manifest-summary
333
+ * bounds are decoded (int bytes vs UTF-8) and how the per-file partition value
334
+ * is compared. Defaults to `'string'`.
335
+ */
336
+ encoding?: PartitionKeyEncoding;
291
337
  /**
292
338
  * Inclusive date range. Every month touched by `[start, end]` is scanned;
293
339
  * `month(date)` is the third partition transform.
@@ -437,6 +483,14 @@ interface IcebergAppendSinkOptions extends SinkOptions {
437
483
  * uses the defaults; tests inject a synchronous `sleep`.
438
484
  */
439
485
  commitRetry?: CommitRetryOptions;
486
+ /**
487
+ * Partition-key encoding (default `'string'`). `'int'` writes BOTH `site_id`
488
+ * and `search_type` as INT — the caller MUST pass the numeric `site_id` (a
489
+ * numeric string is fine; it's `Number()`-coerced) in `slice.ctx.siteId`. A
490
+ * small INT is ample (≪ 2.1B sites), so no LONG/BigInt is involved. See
491
+ * {@link import('./iceberg/schema').PartitionKeyEncoding}.
492
+ */
493
+ encoding?: PartitionKeyEncoding;
440
494
  }
441
495
  /** `LocalIcebergSink` options — points at the local Iceberg REST catalog. */
442
496
  interface LocalIcebergSinkOptions extends SinkOptions {
@@ -447,4 +501,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
447
501
  /** S3-compatible warehouse location (POC: MinIO). */
448
502
  warehouse: string;
449
503
  }
450
- export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
504
+ export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
@@ -1,3 +1,4 @@
1
+ import { ParquetQueryFilter } from "hyparquet";
1
2
  import { BuilderState, SearchType, SearchType as SearchType$1 } from "gscdump/query";
2
3
  import { Grain, Grain as Grain$1, Row, Row as Row$1, TableName, TableName as TableName$1, TenantCtx, TenantCtx as TenantCtx$1 } from "@gscdump/contracts";
3
4
  /**
@@ -397,6 +398,15 @@ interface QueryExecuteOptions {
397
398
  * the page_queries schema, not the analyzer's primary `table`.
398
399
  */
399
400
  placeholderTables?: Record<string, TableName>;
401
+ /**
402
+ * Per-placeholder row-group pushdown filter, derived from the query's
403
+ * structured filter (see `extractParquetPushdown`). A pure-JS decode executor
404
+ * MAY pass it to the parquet reader to prune row groups and shrink the rows
405
+ * it materialises before the SQL WHERE re-applies. Pure optimization: the
406
+ * filter is a superset of the final predicate, so an executor that ignores it
407
+ * (e.g. native DuckDB, which pushes from the SQL itself) stays correct.
408
+ */
409
+ pushdownFilters?: Record<string, ParquetQueryFilter>;
400
410
  dataSource: DataSource;
401
411
  table: TableName;
402
412
  signal?: AbortSignal;
@@ -475,6 +485,12 @@ interface RunSQLOptions {
475
485
  * Undefined keeps the legacy cross-type union.
476
486
  */
477
487
  searchType?: SearchType;
488
+ /**
489
+ * Per-placeholder parquet pushdown filter, forwarded verbatim to the
490
+ * executor. Keyed by fileSet name (matching `fileSets`). See
491
+ * `QueryExecuteOptions.pushdownFilters` and `extractParquetPushdown`.
492
+ */
493
+ pushdownFilters?: Record<string, ParquetQueryFilter>;
478
494
  /**
479
495
  * Optional read-path profiler. `runSQL` emits `manifest.list` +
480
496
  * `executor.execute` spans and forwards it into the executor for the
@@ -30,8 +30,13 @@ interface ResolverOptions<TableKey extends string = string> {
30
30
  adapter: ResolverAdapter<TableKey>;
31
31
  /** Optional site scope. Required for multi-tenant D1; omitted for parquet. */
32
32
  siteId?: string | number;
33
- /** Optional searchType scope. Required for multi-tenant Iceberg; omitted for parquet. */
34
- searchType?: string;
33
+ /**
34
+ * Optional searchType scope. Required for multi-tenant Iceberg; omitted for
35
+ * parquet. `number` is the int-encoded code (`SEARCH_TYPE_INT`) for catalogs
36
+ * whose `search_type` partition column is INT — bound bare (unquoted) so the
37
+ * int partition prunes; `string` for the default string-encoded catalogs.
38
+ */
39
+ searchType?: string | number;
35
40
  }
36
41
  interface ResolvedSQL {
37
42
  sql: string;
@@ -8,6 +8,7 @@ import { tmpdir } from "node:os";
8
8
  import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
9
9
  const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
10
10
  let singleton = null;
11
+ let singletonOpts = null;
11
12
  function bundles() {
12
13
  return {
13
14
  mvp: {
@@ -29,11 +30,19 @@ async function initialize(opts) {
29
30
  conn: db.connect()
30
31
  };
31
32
  }
33
/**
 * Return the shared DuckDB initialization promise, starting initialization on
 * first use (or on first use after `resetNodeDuckDB()` cleared it).
 *
 * @param opts Options forwarded to `initialize` only when no instance exists
 *   yet; they are remembered in `singletonOpts` so later callers can detect a
 *   mismatch.
 * @returns The promise produced by `initialize`.
 *
 * A failed initialization is not cached: when the promise rejects, the
 * singleton is cleared so the next call retries instead of replaying the same
 * rejection forever. The caller that triggered the failing attempt still
 * receives the rejection.
 */
function getSingleton(opts) {
  if (!singleton) {
    const pending = initialize(opts);
    singleton = pending;
    singletonOpts = opts;
    pending.catch(() => {
      // Only clear if nobody replaced or reset the singleton in the meantime.
      if (singleton === pending) {
        singleton = null;
        singletonOpts = null;
      }
    });
  }
  return singleton;
}
32
40
  function createNodeDuckDBHandle(opts = {}) {
33
- if (!singleton) singleton = initialize(opts);
41
+ if (singleton && opts.verbose !== void 0 && opts.verbose !== (singletonOpts?.verbose ?? false)) console.warn(`[gscdump] createNodeDuckDBHandle: ignoring verbose=${opts.verbose} — a shared DuckDB instance was already initialized with verbose=${singletonOpts?.verbose ?? false}. Call resetNodeDuckDB() before re-initializing to change it.`);
42
+ getSingleton(opts);
34
43
  return {
35
44
  async query(sql, params) {
36
- const { conn } = await singleton;
45
+ const { conn } = await getSingleton(opts);
37
46
  if (!params || params.length === 0) return arrowToRows(conn.query(sql));
38
47
  const stmt = conn.prepare(sql);
39
48
  try {
@@ -43,15 +52,15 @@ function createNodeDuckDBHandle(opts = {}) {
43
52
  }
44
53
  },
45
54
  async registerFileBuffer(name, bytes) {
46
- const { db } = await singleton;
55
+ const { db } = await getSingleton(opts);
47
56
  db.registerFileBuffer(name, bytes);
48
57
  },
49
58
  async copyFileToBuffer(name) {
50
- const { db } = await singleton;
59
+ const { db } = await getSingleton(opts);
51
60
  return db.copyFileToBuffer(name);
52
61
  },
53
62
  async dropFiles(names) {
54
- const { db } = await singleton;
63
+ const { db } = await getSingleton(opts);
55
64
  for (const name of names) {
56
65
  try {
57
66
  db.dropFile(name);
@@ -69,9 +78,12 @@ function createNodeDuckDBHandle(opts = {}) {
69
78
  function resetNodeDuckDB() {
70
79
  const pending = singleton;
71
80
  singleton = null;
81
+ singletonOpts = null;
72
82
  pending?.then(({ db, conn }) => {
73
83
  conn.close();
74
84
  db.reset();
75
- }).catch(() => {});
85
+ }).catch((err) => {
86
+ console.warn("[gscdump] resetNodeDuckDB: failed to release DuckDB instance", err);
87
+ });
76
88
  }
77
89
  export { createNodeDuckDBHandle, resetNodeDuckDB };
@@ -24,10 +24,18 @@ interface DecodeParquetOptions {
24
24
  * per row group — pruning groups whose column statistics can't match and
25
25
  * materialising only matching rows — so a filtered decode of a large file
26
26
  * holds at most one row group plus the matches in memory, never the whole
27
- * file. Use this whenever the caller needs a sub-slice of a big parquet
28
- * (e.g. one feedpath out of a site-wide sitemap-urls index).
27
+ * file. Use when a caller needs a sub-slice of a big parquet keyed on a
28
+ * clustered column (a row group's min/max stats only prune if the predicate
29
+ * column is the physical sort key — see `sortKey`/`clusterKey`).
29
30
  */
30
31
  filter?: ParquetQueryFilter;
32
+ /**
33
+ * Project a subset of columns. hyparquet only fetches + decodes the named
34
+ * column chunks, so a read that needs 2 of 14 columns skips the other 12's
35
+ * pages entirely. Omit to read every column. Names not present in the file
36
+ * are ignored by the reader.
37
+ */
38
+ columns?: readonly string[];
31
39
  }
32
40
  declare function decodeParquetToRows(bytes: Uint8Array, opts?: DecodeParquetOptions): Promise<Row[]>;
33
41
  interface HyparquetCodecOptions {
@@ -1,14 +1,83 @@
1
1
  import { SCHEMAS, TABLE_METADATA, dedupeByNaturalKey } from "../_chunks/schema.mjs";
2
2
  import { parquetReadObjects } from "hyparquet";
3
- import { parquetWriteBuffer } from "hyparquet-writer";
3
+ import { ByteWriter, parquetWriteRows } from "hyparquet-writer";
4
4
  const ROW_GROUP_SIZE = 25e3;
5
5
  function basicTypeFor(colType) {
6
- if (colType === "VARCHAR" || colType === "DATE") return "STRING";
6
+ if (colType === "VARCHAR") return "STRING";
7
7
  if (colType === "BIGINT") return "INT64";
8
8
  if (colType === "INTEGER") return "INT32";
9
9
  if (colType === "DOUBLE") return "DOUBLE";
10
+ if (colType === "DATE") return "INT32";
10
11
  throw new Error(`unsupported column type for parquet encoding: ${colType}`);
11
12
  }
13
/** Milliseconds in one UTC day. */
const EPOCH_DAY_MS = 864e5;
/**
 * Convert a DATE cell to whole days since 1970-01-01 (UTC).
 *
 * @param value `null`/`undefined` (returned as `null`), a number already
 *   expressed in epoch days, a `Date`, or a `YYYY-MM-DD` string (parsed as
 *   UTC midnight).
 * @returns The epoch-day integer, or `null` for a missing value.
 * @throws {TypeError} For a non-finite number, an invalid `Date`, an
 *   unparseable string, or any other value type.
 *
 * Numbers are validated and floored like the other branches, so NaN,
 * Infinity or a fractional value is never handed to the writer.
 */
function toEpochDays(value) {
  if (value === null || value === void 0) return null;
  if (typeof value === "number") {
    if (!Number.isFinite(value)) throw new TypeError(`encodeRowsToParquet: invalid epoch-day number '${value}' for DATE column`);
    return Math.floor(value);
  }
  if (value instanceof Date) {
    const ms = value.getTime();
    if (Number.isNaN(ms)) throw new TypeError("encodeRowsToParquet: invalid Date for DATE column");
    return Math.floor(ms / EPOCH_DAY_MS);
  }
  if (typeof value === "string") {
    // Pin the parse to UTC midnight so the local timezone cannot shift the day.
    const ms = Date.parse(`${value}T00:00:00Z`);
    if (Number.isNaN(ms)) throw new TypeError(`encodeRowsToParquet: invalid date string '${value}'`);
    return Math.floor(ms / EPOCH_DAY_MS);
  }
  throw new TypeError(`encodeRowsToParquet: unsupported DATE value '${String(value)}'`);
}
29
/**
 * Format a `Date` as a UTC calendar date string, `YYYY-MM-DD`.
 *
 * @param d The date to format; only its UTC year, month and day are read.
 * @returns The zero-padded ISO date string.
 *
 * The year is padded to four digits so the result is a valid ISO date for
 * years below 1000 as well. An unpadded year such as `999-03-04` is not
 * accepted when re-parsed as `${value}T00:00:00Z`.
 */
function isoFromDate(d) {
  const year = String(d.getUTCFullYear()).padStart(4, "0");
  const month = String(d.getUTCMonth() + 1).padStart(2, "0");
  const day = String(d.getUTCDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
32
/**
 * Build the explicit parquet schema (a root element followed by one element
 * per column) for the row writer.
 *
 * Type mapping: `DATE` → INT32 with converted type DATE, `VARCHAR` →
 * BYTE_ARRAY with converted type UTF8, `INTEGER` → INT32, `BIGINT` → INT64,
 * `DOUBLE` → DOUBLE. A nullable column is `OPTIONAL`, otherwise `REQUIRED`.
 *
 * @param columns Array of column descriptors with `name`, `type` and
 *   `nullable`.
 * @returns The schema element list; the root's `num_children` always equals
 *   the number of column elements that follow it.
 * @throws {Error} For a column type outside the mapping above. Previously
 *   such a column was dropped while still being counted in `num_children`,
 *   leaving an inconsistent schema.
 */
function buildWriteSchema(columns) {
  const fields = columns.map((col) => {
    const repetition_type = col.nullable ? "OPTIONAL" : "REQUIRED";
    switch (col.type) {
      case "DATE":
        return {
          name: col.name,
          type: "INT32",
          converted_type: "DATE",
          repetition_type
        };
      case "VARCHAR":
        return {
          name: col.name,
          type: "BYTE_ARRAY",
          converted_type: "UTF8",
          repetition_type
        };
      case "INTEGER":
        return {
          name: col.name,
          type: "INT32",
          repetition_type
        };
      case "BIGINT":
        return {
          name: col.name,
          type: "INT64",
          repetition_type
        };
      case "DOUBLE":
        return {
          name: col.name,
          type: "DOUBLE",
          repetition_type
        };
      default:
        throw new Error(`unsupported column type for parquet encoding: ${col.type}`);
    }
  });
  return [{
    name: "root",
    num_children: fields.length
  }, ...fields];
}
12
81
  function coerceValue(value, type) {
13
82
  if (value === null || value === void 0) return null;
14
83
  if (type === "STRING") return typeof value === "string" ? value : String(value);
@@ -50,65 +119,86 @@ function sortRowsByClusterKey(table, rows) {
50
119
  });
51
120
  return copy;
52
121
  }
122
+ function encodeOrderedRows(rows, columns, rowGroupSize) {
123
+ const schema = buildWriteSchema(columns);
124
+ const isDate = columns.map((col) => col.type === "DATE");
125
+ const types = columns.map((col) => basicTypeFor(col.type));
126
+ const columnSpecs = columns.map((col) => ({
127
+ name: col.name,
128
+ nullable: col.nullable,
129
+ columnIndex: true
130
+ }));
131
+ function* coercedRows() {
132
+ for (const r of rows) {
133
+ const out = {};
134
+ for (let c = 0; c < columns.length; c++) {
135
+ const name = columns[c].name;
136
+ out[name] = isDate[c] ? toEpochDays(r[name]) : coerceValue(r[name], types[c]);
137
+ }
138
+ yield out;
139
+ }
140
+ }
141
+ const writer = new ByteWriter();
142
+ parquetWriteRows({
143
+ writer,
144
+ rows: coercedRows(),
145
+ columns: columnSpecs,
146
+ schema,
147
+ rowGroupSize
148
+ });
149
+ return new Uint8Array(writer.getBuffer());
150
+ }
53
151
  function encodeRowsToParquet(table, rows) {
54
152
  const schema = SCHEMAS[table];
55
- const sorted = sortRowsByClusterKey(table, rows);
56
- const buffer = parquetWriteBuffer({
57
- columnData: schema.columns.map((col) => {
58
- const type = basicTypeFor(col.type);
59
- const data = sorted.map((r) => coerceValue(r[col.name], type));
60
- return {
61
- name: col.name,
62
- data,
63
- type,
64
- nullable: col.nullable,
65
- columnIndex: true
66
- };
67
- }),
68
- rowGroupSize: ROW_GROUP_SIZE
69
- });
70
- return new Uint8Array(buffer);
153
+ return encodeOrderedRows(sortRowsByClusterKey(table, rows), schema.columns, ROW_GROUP_SIZE);
71
154
  }
72
155
  function encodeRowsToParquetFlex(rows, opts) {
73
156
  const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;
74
- const sorted = sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
157
+ return encodeOrderedRows(sortKey.length === 0 || rows.length <= 1 ? rows : [...rows].sort((a, b) => {
75
158
  for (const col of sortKey) {
76
159
  const cmp = compareValues(a[col], b[col]);
77
160
  if (cmp !== 0) return cmp;
78
161
  }
79
162
  return 0;
80
- });
81
- const buffer = parquetWriteBuffer({
82
- columnData: columns.map((col) => {
83
- const type = basicTypeFor(col.type);
84
- const data = sorted.map((r) => coerceValue(r[col.name], type));
85
- return {
86
- name: col.name,
87
- data,
88
- type,
89
- nullable: col.nullable,
90
- columnIndex: true
91
- };
92
- }),
93
- rowGroupSize
94
- });
95
- return new Uint8Array(buffer);
163
+ }), columns, rowGroupSize);
96
164
  }
97
165
  function asyncBufferFromBytes(bytes) {
98
- const ab = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
166
+ const base = bytes.byteOffset;
167
+ const buf = bytes.buffer;
99
168
  return {
100
- byteLength: ab.byteLength,
169
+ byteLength: bytes.byteLength,
101
170
  slice(start, end) {
102
- return ab.slice(start, end);
171
+ const from = base + start;
172
+ const to = end === void 0 ? base + bytes.byteLength : base + end;
173
+ return buf.slice(from, to);
103
174
  }
104
175
  };
105
176
  }
106
177
  async function decodeParquetToRows(bytes, opts = {}) {
107
178
  if (bytes.byteLength === 0) return [];
108
- return await parquetReadObjects({
179
+ return normalizeDecodedDates(await parquetReadObjects({
109
180
  file: asyncBufferFromBytes(bytes),
110
- ...opts.filter ? { filter: opts.filter } : {}
111
- });
181
+ ...opts.columns ? { columns: [...opts.columns] } : {},
182
+ ...opts.filter ? {
183
+ filter: opts.filter,
184
+ useBloomFilters: true
185
+ } : {}
186
+ }));
187
+ }
188
+ function normalizeDecodedDates(rows) {
189
+ if (rows.length === 0) return rows;
190
+ const dateCols = [];
191
+ const first = rows[0];
192
+ for (const k in first) if (first[k] instanceof Date) dateCols.push(k);
193
+ if (dateCols.length === 0) return rows;
194
+ for (const row of rows) {
195
+ const r = row;
196
+ for (const k of dateCols) {
197
+ const v = r[k];
198
+ if (v instanceof Date) r[k] = isoFromDate(v);
199
+ }
200
+ }
201
+ return rows;
112
202
  }
113
203
  function createHyparquetCodec(options = {}) {
114
204
  return {
@@ -135,7 +225,7 @@ function createHyparquetCodec(options = {}) {
135
225
  const allRows = [];
136
226
  for (const key of inputKeys) {
137
227
  const rows = await decodeParquetToRows(await dataSource.read(key));
138
- allRows.push(...rows);
228
+ for (let i = 0; i < rows.length; i++) allRows.push(rows[i]);
139
229
  }
140
230
  const rows = dedupeByNaturalKey(ctx.table, allRows);
141
231
  const bytes = encodeRowsToParquet(ctx.table, rows);
@@ -74,7 +74,7 @@ function snapshotAlias(fileName) {
74
74
  if (!m?.[1]) throw new TypeError(`snapshotAlias: unrecognised filename ${JSON.stringify(fileName)}`);
75
75
  return `cold_${m[1].replace("-", "_")}`;
76
76
  }
77
- const SNAPSHOT_TYPE_ERROR_KINDS = new Set([
77
+ const SNAPSHOT_TYPE_ERROR_KINDS = /* @__PURE__ */ new Set([
78
78
  "invalid-snapshot-filename",
79
79
  "unsupported-snapshot-index-version",
80
80
  "invalid-schema-identifier",
package/dist/errors.mjs CHANGED
@@ -168,7 +168,7 @@ const engineErrors = {
168
168
  };
169
169
  }
170
170
  };
171
- const ENGINE_ERROR_KINDS = new Set([
171
+ const ENGINE_ERROR_KINDS = /* @__PURE__ */ new Set([
172
172
  "analyzer-not-found",
173
173
  "analyzer-capability-missing",
174
174
  "invalid-sql-literal",
@@ -1,4 +1,4 @@
1
- import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
1
+ import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
2
2
  import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
3
3
  type IcebergAppendSink = Sink;
4
4
  /**
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
10
10
  * with no rows never touches the network.
11
11
  */
12
12
  declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
13
- export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
13
+ export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };