@gscdump/engine 0.23.2 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,23 @@
1
1
  import { Row as Row$1, SearchType, TenantCtx as TenantCtx$1 } from "./storage.mjs";
2
2
  import { icebergAppend, restCatalogConnect, s3SignedResolver } from "icebird";
3
3
  import { TableName } from "@gscdump/contracts";
4
- /** The 6 fact tables that exist as global Iceberg tables. */
4
+ /**
5
+ * S3-compatible credentials for the Iceberg warehouse object store (R2 in prod,
6
+ * MinIO in the POC). The single definition shared by every catalog/writer/sink
7
+ * that signs warehouse object access — keep this contract in one place so the
8
+ * credential shape cannot drift between the icebird and PyIceberg paths.
9
+ */
10
+ interface IcebergS3Config {
11
+ /** S3 endpoint host (POC MinIO: `localhost:9100`; prod: the R2 S3 endpoint). */
12
+ endpoint: string;
13
+ accessKeyId: string;
14
+ secretAccessKey: string;
15
+ /** Defaults to `'auto'` (R2's region). */
16
+ region?: string;
17
+ }
18
+ /** The 9 fact tables that exist as global Iceberg tables. */
5
19
  type IcebergTableName = Extract<TableName, 'pages' | 'queries' | 'countries' | 'page_queries' | 'dates' | 'search_appearance' | 'search_appearance_pages' | 'search_appearance_queries' | 'search_appearance_page_queries'>;
6
- /** The 6 Iceberg table names, in canonical order. */
20
+ /** The 9 Iceberg table names, in canonical order. */
7
21
  declare const ICEBERG_TABLES: readonly IcebergTableName[];
8
22
  /**
9
23
  * Iceberg-native column type. Superset-mapped from the engine `ColumnType`;
@@ -35,7 +49,7 @@ interface IcebergTableSpec {
35
49
  table: IcebergTableName;
36
50
  columns: readonly IcebergColumn[];
37
51
  /**
38
- * Partition spec — shared by all 5 tables: identity(site_id),
52
+ * Partition spec — shared by every table: identity(site_id),
39
53
  * identity(search_type), month(date).
40
54
  */
41
55
  partitionSpec: readonly IcebergPartitionField[];
@@ -66,7 +80,7 @@ declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
66
80
  * catalog. Iceberg still guarantees ids are stable once a table exists.
67
81
  */
68
82
  declare const ICEBERG_FIELD_ID_BASE = 3;
69
- /** Shared partition spec — identical across all 5 tables. */
83
+ /** Shared partition spec — identical across every table. */
70
84
  declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
71
85
  /**
72
86
  * Derive the full Iceberg table spec for a table from the engine `SCHEMAS`
@@ -78,7 +92,7 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
78
92
  * source of truth — do not hand-list columns elsewhere.
79
93
  */
80
94
  declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
81
- /** All 5 Iceberg table specs, keyed by table name. */
95
+ /** All Iceberg table specs, keyed by table name. */
82
96
  declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
83
97
  /** icebird's lowercase Iceberg primitive types (subset we use). */
84
98
  type IcebergPrimitiveType = 'string' | 'int' | 'long' | 'double' | 'date';
@@ -107,15 +121,6 @@ interface IcebergPartitionSpec {
107
121
  'spec-id': number;
108
122
  'fields': IcebergPartitionSpecField[];
109
123
  }
110
- /** S3-compatible credentials for the R2 warehouse. */
111
- interface IcebergS3Config {
112
- /** R2 S3 endpoint, e.g. `https://<account>.r2.cloudflarestorage.com`. */
113
- endpoint: string;
114
- accessKeyId: string;
115
- secretAccessKey: string;
116
- /** Defaults to `'auto'` (R2's region). */
117
- region?: string;
118
- }
119
124
  /** Everything needed to talk to the R2 Data Catalog. */
120
125
  interface IcebergCatalogConfig {
121
126
  /** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
@@ -67,6 +67,11 @@ function createNodeDuckDBHandle(opts = {}) {
67
67
  };
68
68
  }
69
69
  function resetNodeDuckDB() {
70
+ const pending = singleton;
70
71
  singleton = null;
72
+ pending?.then(({ db, conn }) => {
73
+ conn.close();
74
+ db.reset();
75
+ }).catch(() => {});
71
76
  }
72
77
  export { createNodeDuckDBHandle, resetNodeDuckDB };
@@ -1,5 +1,11 @@
1
1
  import { inferLegacyTier, inferSearchType } from "../_chunks/storage.mjs";
2
2
  const SHARD_RE = /^u_[^/]+\/manifest\/(?<siteId>[^/]+)\/(?<table>[^/]+)\/HEAD$/;
3
+ const CAS_BACKOFF_BASE_MS = 5;
4
+ const CAS_BACKOFF_CAP_MS = 250;
5
+ async function casBackoff(attempt) {
6
+ const ceil = Math.min(CAS_BACKOFF_CAP_MS, CAS_BACKOFF_BASE_MS * 2 ** attempt);
7
+ await new Promise((resolve) => setTimeout(resolve, Math.random() * ceil));
8
+ }
3
9
  function defaultSnapshotId() {
4
10
  return `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
5
11
  }
@@ -52,7 +58,7 @@ function createR2ManifestStore(opts) {
52
58
  const { bucket, userId } = opts;
53
59
  const newSnapshotId = opts.newSnapshotId ?? defaultSnapshotId;
54
60
  const now = opts.now ?? (() => Date.now());
55
- const maxRetries = opts.maxRetries ?? 8;
61
+ const maxRetries = opts.maxRetries ?? 16;
56
62
  const onEvent = opts.onEvent;
57
63
  async function readShard(siteId, table) {
58
64
  const head = await bucket.get(headKey(userId, siteId, table));
@@ -112,6 +118,7 @@ function createR2ManifestStore(opts) {
112
118
  attempt
113
119
  });
114
120
  attempt++;
121
+ if (attempt < maxRetries) await casBackoff(attempt);
115
122
  }
116
123
  throw new Error(`R2 manifest CAS exceeded ${maxRetries} retries for ${siteId}/${table}`);
117
124
  }
package/dist/index.mjs CHANGED
@@ -301,18 +301,32 @@ function createIngestAccumulator(opts) {
301
301
  }
302
302
  const DAY_MILLIS = 864e5;
303
303
  function toIcebergDate(value) {
304
- if (typeof value === "string") return Math.floor(Date.parse(`${value}T00:00:00Z`) / DAY_MILLIS);
305
- if (value instanceof Date) return Math.floor(value.getTime() / DAY_MILLIS);
304
+ if (typeof value === "string") {
305
+ const ms = Date.parse(`${value}T00:00:00Z`);
306
+ if (Number.isNaN(ms)) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
307
+ return Math.floor(ms / DAY_MILLIS);
308
+ }
309
+ if (value instanceof Date) {
310
+ const ms = value.getTime();
311
+ if (Number.isNaN(ms)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
312
+ return Math.floor(ms / DAY_MILLIS);
313
+ }
314
+ return value;
315
+ }
316
+ function coerceJsonSafe(value) {
317
+ if (typeof value === "bigint") return Number(value);
306
318
  return value;
307
319
  }
308
320
  function toRecords(slice, rows) {
309
321
  const siteId = slice.ctx.siteId ?? "";
310
- return rows.map((row) => ({
311
- ...row,
312
- date: toIcebergDate(row.date),
313
- site_id: siteId,
314
- search_type: slice.searchType
315
- }));
322
+ return rows.map((row) => {
323
+ const out = {};
324
+ for (const k in row) out[k] = coerceJsonSafe(row[k]);
325
+ out.date = toIcebergDate(out.date);
326
+ out.site_id = siteId;
327
+ out.search_type = slice.searchType;
328
+ return out;
329
+ });
316
330
  }
317
331
  function createIcebergAppendSink(options) {
318
332
  let connection;
@@ -1,16 +1,8 @@
1
- import { LocalIcebergSinkOptions, Sink } from "./_chunks/sink.mjs";
2
- /** S3-compatible credentials for the warehouse (POC: MinIO). */
3
- interface LocalIcebergS3Config {
4
- /** S3 endpoint host (POC MinIO: `localhost:9100`). */
5
- endpoint: string;
6
- accessKeyId: string;
7
- secretAccessKey: string;
8
- region?: string;
9
- }
1
+ import { IcebergS3Config, LocalIcebergSinkOptions, Sink } from "./_chunks/sink.mjs";
10
2
  /** Full `LocalIcebergSink` options — extends the frozen contract options. */
11
3
  interface LocalIcebergSinkFullOptions extends LocalIcebergSinkOptions {
12
4
  /** S3 credentials for the warehouse. Defaults to the POC MinIO creds. */
13
- s3?: LocalIcebergS3Config;
5
+ s3?: IcebergS3Config;
14
6
  /** Python interpreter. Defaults to `$GSCDUMP_ICEBERG_PYTHON` then `python3`. */
15
7
  python?: string;
16
8
  /** Override the writer-script path. Defaults to `scripts/iceberg-writer.py`. */
@@ -28,4 +20,4 @@ interface LocalIcebergSink extends Sink {
28
20
  * use this sink must skip when the stack is unreachable.
29
21
  */
30
22
  declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
31
- export { type LocalIcebergS3Config, type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
23
+ export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.23.2",
4
+ "version": "0.24.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -177,11 +177,11 @@
177
177
  }
178
178
  },
179
179
  "dependencies": {
180
- "drizzle-orm": "^0.45.2",
180
+ "drizzle-orm": "1.0.0-rc.3",
181
181
  "icebird": "^0.8.6",
182
182
  "proper-lockfile": "^4.1.2",
183
- "@gscdump/contracts": "0.23.2",
184
- "gscdump": "0.23.2"
183
+ "@gscdump/contracts": "0.24.0",
184
+ "gscdump": "0.24.0"
185
185
  },
186
186
  "devDependencies": {
187
187
  "@duckdb/duckdb-wasm": "^1.32.0",