@gscdump/engine 0.23.4 → 0.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/iceberg-schema.mjs +1 -0
- package/dist/_chunks/schema.d.mts +440 -792
- package/dist/_chunks/sink.d.mts +19 -14
- package/dist/adapters/duckdb-node.mjs +5 -0
- package/dist/adapters/r2-manifest.mjs +8 -1
- package/dist/index.mjs +10 -2
- package/dist/sink-node.d.mts +3 -11
- package/package.json +4 -4
package/dist/_chunks/sink.d.mts
CHANGED
|
@@ -1,9 +1,23 @@
|
|
|
1
1
|
import { Row as Row$1, SearchType, TenantCtx as TenantCtx$1 } from "./storage.mjs";
|
|
2
2
|
import { icebergAppend, restCatalogConnect, s3SignedResolver } from "icebird";
|
|
3
3
|
import { TableName } from "@gscdump/contracts";
|
|
4
|
-
/**
|
|
4
|
+
/**
|
|
5
|
+
* S3-compatible credentials for the Iceberg warehouse object store (R2 in prod,
|
|
6
|
+
* MinIO in the POC). The single definition shared by every catalog/writer/sink
|
|
7
|
+
* that signs warehouse object access — keep this contract in one place so the
|
|
8
|
+
* credential shape cannot drift between the icebird and PyIceberg paths.
|
|
9
|
+
*/
|
|
10
|
+
interface IcebergS3Config {
|
|
11
|
+
/** S3 endpoint host (POC MinIO: `localhost:9100`; prod: the R2 S3 endpoint). */
|
|
12
|
+
endpoint: string;
|
|
13
|
+
accessKeyId: string;
|
|
14
|
+
secretAccessKey: string;
|
|
15
|
+
/** Defaults to `'auto'` (R2's region). */
|
|
16
|
+
region?: string;
|
|
17
|
+
}
|
|
18
|
+
/** The 9 fact tables that exist as global Iceberg tables. */
|
|
5
19
|
type IcebergTableName = Extract<TableName, 'pages' | 'queries' | 'countries' | 'page_queries' | 'dates' | 'search_appearance' | 'search_appearance_pages' | 'search_appearance_queries' | 'search_appearance_page_queries'>;
|
|
6
|
-
/** The
|
|
20
|
+
/** The 9 Iceberg table names, in canonical order. */
|
|
7
21
|
declare const ICEBERG_TABLES: readonly IcebergTableName[];
|
|
8
22
|
/**
|
|
9
23
|
* Iceberg-native column type. Superset-mapped from the engine `ColumnType`;
|
|
@@ -35,7 +49,7 @@ interface IcebergTableSpec {
|
|
|
35
49
|
table: IcebergTableName;
|
|
36
50
|
columns: readonly IcebergColumn[];
|
|
37
51
|
/**
|
|
38
|
-
* Partition spec — shared by
|
|
52
|
+
* Partition spec — shared by every table: identity(site_id),
|
|
39
53
|
* identity(search_type), month(date).
|
|
40
54
|
*/
|
|
41
55
|
partitionSpec: readonly IcebergPartitionField[];
|
|
@@ -66,7 +80,7 @@ declare const ICEBERG_PARTITION_COLUMNS: readonly IcebergColumn[];
|
|
|
66
80
|
* catalog. Iceberg still guarantees ids are stable once a table exists.
|
|
67
81
|
*/
|
|
68
82
|
declare const ICEBERG_FIELD_ID_BASE = 3;
|
|
69
|
-
/** Shared partition spec — identical across
|
|
83
|
+
/** Shared partition spec — identical across every table. */
|
|
70
84
|
declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
|
|
71
85
|
/**
|
|
72
86
|
* Derive the full Iceberg table spec for a table from the engine `SCHEMAS`
|
|
@@ -78,7 +92,7 @@ declare const ICEBERG_PARTITION_SPEC: readonly IcebergPartitionField[];
|
|
|
78
92
|
* source of truth — do not hand-list columns elsewhere.
|
|
79
93
|
*/
|
|
80
94
|
declare function icebergTableSpec(table: IcebergTableName): IcebergTableSpec;
|
|
81
|
-
/** All
|
|
95
|
+
/** All Iceberg table specs, keyed by table name. */
|
|
82
96
|
declare const ICEBERG_SCHEMAS: Record<IcebergTableName, IcebergTableSpec>;
|
|
83
97
|
/** icebird's lowercase Iceberg primitive types (subset we use). */
|
|
84
98
|
type IcebergPrimitiveType = 'string' | 'int' | 'long' | 'double' | 'date';
|
|
@@ -107,15 +121,6 @@ interface IcebergPartitionSpec {
|
|
|
107
121
|
'spec-id': number;
|
|
108
122
|
'fields': IcebergPartitionSpecField[];
|
|
109
123
|
}
|
|
110
|
-
/** S3-compatible credentials for the R2 warehouse. */
|
|
111
|
-
interface IcebergS3Config {
|
|
112
|
-
/** R2 S3 endpoint, e.g. `https://<account>.r2.cloudflarestorage.com`. */
|
|
113
|
-
endpoint: string;
|
|
114
|
-
accessKeyId: string;
|
|
115
|
-
secretAccessKey: string;
|
|
116
|
-
/** Defaults to `'auto'` (R2's region). */
|
|
117
|
-
region?: string;
|
|
118
|
-
}
|
|
119
124
|
/** Everything needed to talk to the R2 Data Catalog. */
|
|
120
125
|
interface IcebergCatalogConfig {
|
|
121
126
|
/** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
|
|
package/dist/adapters/duckdb-node.mjs
CHANGED
|
@@ -67,6 +67,11 @@ function createNodeDuckDBHandle(opts = {}) {
|
|
|
67
67
|
};
|
|
68
68
|
}
|
|
69
69
|
function resetNodeDuckDB() {
|
|
70
|
+
const pending = singleton;
|
|
70
71
|
singleton = null;
|
|
72
|
+
pending?.then(({ db, conn }) => {
|
|
73
|
+
conn.close();
|
|
74
|
+
db.reset();
|
|
75
|
+
}).catch(() => {});
|
|
71
76
|
}
|
|
72
77
|
export { createNodeDuckDBHandle, resetNodeDuckDB };
|
|
package/dist/adapters/r2-manifest.mjs
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
import { inferLegacyTier, inferSearchType } from "../_chunks/storage.mjs";
|
|
2
2
|
const SHARD_RE = /^u_[^/]+\/manifest\/(?<siteId>[^/]+)\/(?<table>[^/]+)\/HEAD$/;
|
|
3
|
+
const CAS_BACKOFF_BASE_MS = 5;
|
|
4
|
+
const CAS_BACKOFF_CAP_MS = 250;
|
|
5
|
+
async function casBackoff(attempt) {
|
|
6
|
+
const ceil = Math.min(CAS_BACKOFF_CAP_MS, CAS_BACKOFF_BASE_MS * 2 ** attempt);
|
|
7
|
+
await new Promise((resolve) => setTimeout(resolve, Math.random() * ceil));
|
|
8
|
+
}
|
|
3
9
|
function defaultSnapshotId() {
|
|
4
10
|
return `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
|
|
5
11
|
}
|
|
@@ -52,7 +58,7 @@ function createR2ManifestStore(opts) {
|
|
|
52
58
|
const { bucket, userId } = opts;
|
|
53
59
|
const newSnapshotId = opts.newSnapshotId ?? defaultSnapshotId;
|
|
54
60
|
const now = opts.now ?? (() => Date.now());
|
|
55
|
-
const maxRetries = opts.maxRetries ??
|
|
61
|
+
const maxRetries = opts.maxRetries ?? 16;
|
|
56
62
|
const onEvent = opts.onEvent;
|
|
57
63
|
async function readShard(siteId, table) {
|
|
58
64
|
const head = await bucket.get(headKey(userId, siteId, table));
|
|
@@ -112,6 +118,7 @@ function createR2ManifestStore(opts) {
|
|
|
112
118
|
attempt
|
|
113
119
|
});
|
|
114
120
|
attempt++;
|
|
121
|
+
if (attempt < maxRetries) await casBackoff(attempt);
|
|
115
122
|
}
|
|
116
123
|
throw new Error(`R2 manifest CAS exceeded ${maxRetries} retries for ${siteId}/${table}`);
|
|
117
124
|
}
|
package/dist/index.mjs
CHANGED
|
@@ -301,8 +301,16 @@ function createIngestAccumulator(opts) {
|
|
|
301
301
|
}
|
|
302
302
|
const DAY_MILLIS = 864e5;
|
|
303
303
|
function toIcebergDate(value) {
|
|
304
|
-
if (typeof value === "string")
|
|
305
|
-
|
|
304
|
+
if (typeof value === "string") {
|
|
305
|
+
const ms = Date.parse(`${value}T00:00:00Z`);
|
|
306
|
+
if (Number.isNaN(ms)) throw new TypeError(`toIcebergDate: invalid date string '${value}'`);
|
|
307
|
+
return Math.floor(ms / DAY_MILLIS);
|
|
308
|
+
}
|
|
309
|
+
if (value instanceof Date) {
|
|
310
|
+
const ms = value.getTime();
|
|
311
|
+
if (Number.isNaN(ms)) throw new TypeError("toIcebergDate: invalid Date (NaN)");
|
|
312
|
+
return Math.floor(ms / DAY_MILLIS);
|
|
313
|
+
}
|
|
306
314
|
return value;
|
|
307
315
|
}
|
|
308
316
|
function coerceJsonSafe(value) {
|
package/dist/sink-node.d.mts
CHANGED
|
@@ -1,16 +1,8 @@
|
|
|
1
|
-
import { LocalIcebergSinkOptions, Sink } from "./_chunks/sink.mjs";
|
|
2
|
-
/** S3-compatible credentials for the warehouse (POC: MinIO). */
|
|
3
|
-
interface LocalIcebergS3Config {
|
|
4
|
-
/** S3 endpoint host (POC MinIO: `localhost:9100`). */
|
|
5
|
-
endpoint: string;
|
|
6
|
-
accessKeyId: string;
|
|
7
|
-
secretAccessKey: string;
|
|
8
|
-
region?: string;
|
|
9
|
-
}
|
|
1
|
+
import { IcebergS3Config, LocalIcebergSinkOptions, Sink } from "./_chunks/sink.mjs";
|
|
10
2
|
/** Full `LocalIcebergSink` options — extends the frozen contract options. */
|
|
11
3
|
interface LocalIcebergSinkFullOptions extends LocalIcebergSinkOptions {
|
|
12
4
|
/** S3 credentials for the warehouse. Defaults to the POC MinIO creds. */
|
|
13
|
-
s3?: LocalIcebergS3Config;
|
|
5
|
+
s3?: IcebergS3Config;
|
|
14
6
|
/** Python interpreter. Defaults to `$GSCDUMP_ICEBERG_PYTHON` then `python3`. */
|
|
15
7
|
python?: string;
|
|
16
8
|
/** Override the writer-script path. Defaults to `scripts/iceberg-writer.py`. */
|
|
@@ -28,4 +20,4 @@ interface LocalIcebergSink extends Sink {
|
|
|
28
20
|
* use this sink must skip when the stack is unreachable.
|
|
29
21
|
*/
|
|
30
22
|
declare function createLocalIcebergSink(options: LocalIcebergSinkFullOptions): LocalIcebergSink;
|
|
31
|
-
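export { type LocalIcebergS3Config, type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };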
|
|
23
|
+
export { type LocalIcebergSink, type LocalIcebergSinkFullOptions, createLocalIcebergSink };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.23.4",
|
|
4
|
+
"version": "0.24.1",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -177,11 +177,11 @@
|
|
|
177
177
|
}
|
|
178
178
|
},
|
|
179
179
|
"dependencies": {
|
|
180
|
-
"drizzle-orm": "
|
|
180
|
+
"drizzle-orm": "1.0.0-rc.3",
|
|
181
181
|
"icebird": "^0.8.6",
|
|
182
182
|
"proper-lockfile": "^4.1.2",
|
|
183
|
-
"@gscdump/contracts": "0.
|
|
184
|
-
"gscdump": "0.
|
|
183
|
+
"@gscdump/contracts": "0.24.1",
|
|
184
|
+
"gscdump": "0.24.1"
|
|
185
185
|
},
|
|
186
186
|
"devDependencies": {
|
|
187
187
|
"@duckdb/duckdb-wasm": "^1.32.0",
|