@gscdump/engine 0.27.2 → 0.28.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,441 @@
1
+ import { AsyncBuffer } from "hyparquet";
2
+ import { Writer } from "hyparquet-writer/src/types.js";
3
+ interface WriterOptions {
4
+ /**
5
+ * If `'*'`, the writer must create the object only if it does not already
6
+ * exist (S3 conditional write: `If-None-Match: *`). On collision the
7
+ * underlying `finish()` rejects with an error whose `status` is 412 (or 409
8
+ * on some providers). The HTTP/S3 `urlResolver` translates this to the
9
+ * `If-None-Match` header. Other resolvers may honor or ignore it.
10
+ */
11
+ ifNoneMatch?: '*';
12
+ }
13
+ interface Resolver { // Storage access hooks: `reader` is required; `writer` and `deleter` are optional.
14
+ reader: (path: string, byteLength?: number) => AsyncBuffer | Promise<AsyncBuffer>; // returns an `AsyncBuffer` for `path`, either directly or via a promise
15
+ writer?: (path: string, options?: WriterOptions) => Writer; // returns a `Writer` for `path`; `options` may carry `ifNoneMatch` (see `WriterOptions`)
16
+ deleter?: (path: string) => Promise<void>; // asynchronous delete of `path`
17
+ }
18
+ type Lister = (path: string) => Promise<string[]>;
19
+ /** Catalogs */
20
+ interface RestCatalogContext {
21
+ type: 'rest';
22
+ url: string;
23
+ prefix: string;
24
+ defaults: Record<string, string>;
25
+ overrides: Record<string, string>;
26
+ requestInit?: RequestInit;
27
+ }
28
+ interface FileCatalog {
29
+ type: 'file';
30
+ resolver: Resolver;
31
+ lister?: Lister;
32
+ /**
33
+ * Opt in to S3-safe metadata commits: every `vN.metadata.json` (the
34
+ * initial create and every subsequent commit) is written with
35
+ * `If-None-Match: *` and `version-hint.text` is best-effort. High-level
36
+ * write functions retry on 412/409 by reloading the latest metadata and
37
+ * re-staging. `icebergCreateTable` and `icebergTransaction` do not retry.
38
+ * Default false preserves backwards-compatible (overwrite) behavior.
39
+ */
40
+ conditionalCommits?: boolean;
41
+ }
42
+ type Catalog = RestCatalogContext | FileCatalog;
43
+ interface LoadTableResponse {
44
+ metadataLocation?: string;
45
+ metadata: TableMetadata;
46
+ config: Record<string, string>;
47
+ }
48
+ interface TableMetadata {
49
+ 'format-version': number;
50
+ 'table-uuid': string;
51
+ location: string;
52
+ 'last-sequence-number': number; // missing in V1, required in V2+
53
+ 'last-updated-ms': number;
54
+ 'last-column-id': number;
55
+ 'current-schema-id': number; // optional in V1, required in V2+
56
+ schemas: Schema[]; // optional in V1, required in V2+
57
+ 'default-spec-id': number; // optional in V1, required in V2+
58
+ 'partition-specs': PartitionSpec[]; // optional in V1, required in V2+
59
+ 'last-partition-id': number; // optional in V1, required in V2+
60
+ properties?: Record<string, string>;
61
+ 'current-snapshot-id'?: number | bigint;
62
+ snapshots?: Snapshot[];
63
+ 'snapshot-log'?: SnapshotLog[];
64
+ 'metadata-log'?: MetadataLog[];
65
+ 'sort-orders': SortOrder[]; // optional in V1, required in V2+
66
+ 'default-sort-order-id': number; // optional in V1, required in V2+
67
+ refs?: Record<string, SnapshotRef>;
68
+ statistics?: TableStatistics[];
69
+ 'partition-statistics'?: PartitionStatistics[];
70
+ 'next-row-id'?: number | bigint; // required in V3
71
+ 'encryption-keys'?: EncryptionKey[]; // V3
72
+ }
73
+ interface Schema {
74
+ type: 'struct';
75
+ 'schema-id': number;
76
+ 'identifier-field-ids'?: number[];
77
+ fields: Field[];
78
+ }
79
+ interface Field {
80
+ id: number;
81
+ name: string;
82
+ required: boolean;
83
+ type: IcebergType;
84
+ doc?: string;
85
+ 'initial-default'?: any;
86
+ 'write-default'?: any;
87
+ }
88
+ type IcebergType = 'unknown' | 'boolean' | 'int' | 'long' | 'float' | 'double' | 'date' | 'time' | 'timestamp' | 'timestamptz' | 'timestamp_ns' | 'timestamptz_ns' | 'string' | 'uuid' | `fixed[${number}]` | 'binary' | `decimal(${number},${number})` | `decimal(${number}, ${number})` | 'variant' | 'geometry' | `geometry(${string})` | 'geography' | `geography(${string})` | IcebergNestedType;
89
+ type IcebergNestedType = Schema | {
90
+ type: 'list';
91
+ 'element-id': number;
92
+ 'element-required': boolean;
93
+ element: IcebergType;
94
+ } | {
95
+ type: 'map';
96
+ 'key-id': number;
97
+ key: IcebergType;
98
+ 'value-id': number;
99
+ 'value-required': boolean;
100
+ value: IcebergType;
101
+ };
102
+ interface PartitionSpec {
103
+ 'spec-id': number;
104
+ fields: PartitionField[];
105
+ }
106
+ interface PartitionField {
107
+ 'source-id'?: number;
108
+ 'source-ids'?: number[];
109
+ 'field-id': number;
110
+ name: string;
111
+ transform: PartitionTransform;
112
+ }
113
+ type PartitionTransform = 'identity' | `bucket[${number}]` | `truncate[${number}]` | 'year' | 'month' | 'day' | 'hour' | 'void' | string;
114
+ interface PartitionStatistics {
115
+ 'snapshot-id': bigint;
116
+ 'statistics-path': string;
117
+ 'file-size-in-bytes': bigint;
118
+ }
119
+ interface SortOrder {
120
+ 'order-id': number;
121
+ 'fields': SortField[];
122
+ }
123
+ interface SortField {
124
+ transform: string;
125
+ 'source-id'?: number;
126
+ 'source-ids'?: number[]; // V3
127
+ 'direction': 'asc' | 'desc';
128
+ 'null-order': 'nulls-first' | 'nulls-last';
129
+ }
130
+ interface Snapshot {
131
+ 'snapshot-id': number | bigint;
132
+ 'parent-snapshot-id'?: number | bigint;
133
+ 'sequence-number': number;
134
+ 'timestamp-ms': number;
135
+ 'manifest-list': string;
136
+ manifests?: Manifest$1[];
137
+ summary: {
138
+ // spec: "value of these fields should be of string type"
139
+ operation: string; // 'spark.app.id'?: string
140
+ 'added-data-files'?: string;
141
+ 'added-records'?: string;
142
+ 'deleted-data-files'?: string;
143
+ 'deleted-records'?: string;
144
+ 'removed-files-size'?: string;
145
+ 'added-delete-files'?: string;
146
+ 'removed-delete-files'?: string;
147
+ 'added-position-deletes'?: string;
148
+ 'removed-position-deletes'?: string;
149
+ 'added-equality-deletes'?: string;
150
+ 'removed-equality-deletes'?: string;
151
+ 'added-dvs'?: string;
152
+ 'removed-dvs'?: string;
153
+ 'added-files-size'?: string;
154
+ 'changed-partition-count'?: string;
155
+ 'total-records'?: string;
156
+ 'total-files-size'?: string;
157
+ 'total-data-files'?: string;
158
+ 'total-delete-files'?: string;
159
+ 'total-position-deletes'?: string;
160
+ 'total-equality-deletes'?: string;
161
+ };
162
+ 'schema-id'?: number;
163
+ 'first-row-id'?: number; // V3
164
+ 'added-rows'?: number; // V3
165
+ 'key-id'?: string; // V3
166
+ }
167
+ interface TableStatistics {
168
+ 'snapshot-id': number | bigint;
169
+ 'statistics-path': string;
170
+ 'file-size-in-bytes': bigint;
171
+ 'file-footer-size-in-bytes': bigint;
172
+ }
173
+ interface SnapshotLog {
174
+ 'timestamp-ms': number;
175
+ 'snapshot-id': number | bigint;
176
+ }
177
+ interface SnapshotRef {
178
+ 'snapshot-id': number | bigint;
179
+ type: 'branch' | 'tag';
180
+ 'min-snapshots-to-keep'?: number;
181
+ 'max-snapshot-age-ms'?: number;
182
+ 'max-ref-age-ms'?: number;
183
+ }
184
+ interface EncryptionKey {
185
+ 'key-id': string;
186
+ 'key-metadata': string;
187
+ }
188
+ /**
189
+ * Element type of the table metadata's `metadata-log` array: a `metadata-file` string and its `timestamp-ms`.
190
+ */
191
+ interface MetadataLog {
192
+ 'timestamp-ms': number;
193
+ 'metadata-file': string;
194
+ }
195
+ interface Manifest$1 {
196
+ manifest_path: string;
197
+ manifest_length: bigint;
198
+ partition_spec_id: number;
199
+ content: 0 | 1; // 0=data, 1=deletes
200
+ sequence_number?: bigint;
201
+ min_sequence_number?: bigint;
202
+ added_snapshot_id: bigint;
203
+ added_files_count: number;
204
+ existing_files_count: number;
205
+ deleted_files_count: number;
206
+ added_rows_count: bigint;
207
+ existing_rows_count: bigint;
208
+ deleted_rows_count: bigint;
209
+ partitions?: FieldSummary[]; // key_metadata?: unknown
210
+ first_row_id?: bigint | number;
211
+ }
212
+ interface ManifestEntry {
213
+ status: 0 | 1 | 2; // 0=existing, 1=added, 2=deleted
214
+ snapshot_id?: bigint;
215
+ sequence_number?: bigint;
216
+ file_sequence_number?: bigint;
217
+ partition_spec_id?: number;
218
+ data_file: DataFile;
219
+ }
220
+ interface FieldSummary {
221
+ contains_null: boolean;
222
+ contains_nan?: boolean | null;
223
+ lower_bound?: Uint8Array | null;
224
+ upper_bound?: Uint8Array | null;
225
+ }
226
+ interface DataFile {
227
+ content: 0 | 1 | 2; // 0=data, 1=position_delete, 2=equality_delete
228
+ file_path: string;
229
+ file_format: 'avro' | 'orc' | 'parquet' | 'puffin';
230
+ partition: Record<string, unknown>; // keyed by partition-field name in Avro
231
+ record_count: bigint;
232
+ file_size_in_bytes: bigint;
233
+ column_sizes?: Record<number, bigint>;
234
+ value_counts?: Record<number, bigint>;
235
+ null_value_counts?: Record<number, bigint>;
236
+ nan_value_counts?: Record<number, bigint>;
237
+ lower_bounds?: Record<number, unknown>;
238
+ upper_bounds?: Record<number, unknown>; // key_metadata?: string
239
+ split_offsets?: bigint[];
240
+ equality_ids?: number[];
241
+ sort_order_id?: number;
242
+ first_row_id?: bigint | number;
243
+ referenced_data_file?: string;
244
+ content_offset?: bigint;
245
+ content_size_in_bytes?: bigint;
246
+ }
247
+ /**
248
+ * Returns manifest entries for a snapshot (default: the current snapshot; pass
249
+ * `snapshotId` to time-travel to a prior snapshot in the metadata's snapshot log).
250
+ * Manifests for which the optional `partitionFilter` returns `false` have their entry fetch skipped.
251
+ *
252
+ * @import {Resolver, TableMetadata, Manifest, ManifestEntry} from '../src/types.js'
253
+ * @typedef {{ url: string, entries: ManifestEntry[] }[]} ManifestList
254
+ * @param {object} options
255
+ * @param {TableMetadata} options.metadata
256
+ * @param {Resolver} [options.resolver]
257
+ * @param {number | bigint} [options.snapshotId] - Optional snapshot id; defaults to `current-snapshot-id`.
258
+ * @returns {Promise<ManifestList>}
259
+ */
260
+ declare function icebergManifests({
261
+ metadata,
262
+ resolver,
263
+ snapshotId,
264
+ partitionFilter
265
+ }: {
266
+ metadata: TableMetadata;
267
+ resolver?: Resolver | undefined;
268
+ snapshotId?: number | bigint | undefined;
269
+ partitionFilter?: ManifestPartitionFilter | undefined;
270
+ }): Promise<ManifestList>;
271
+ /**
272
+ * Predicate applied to each manifest's manifest-list `partitions` field-summary
273
+ * array before entry fetch. Return `false` to skip the manifest's entry fetch.
274
+ */
275
+ type ManifestPartitionFilter = (partitions: Manifest$1['partitions'], partitionSpecId: number, manifest: Manifest$1) => boolean;
276
+ /**
277
+ * Resolved value of `icebergManifests`: a list of `{ url, entries }` items,
278
+ * each pairing a `url` string with the `ManifestEntry` records returned
279
+ * for it.
280
+ */
281
+ type ManifestList = {
282
+ url: string;
283
+ entries: ManifestEntry[];
284
+ }[];
285
+ /**
286
+ * Append rows to a table in one call: load metadata, stage the parquet writes
287
+ * + manifest + new snapshot, commit through the catalog.
288
+ *
289
+ * @param {object} options
290
+ * @param {Catalog} options.catalog
291
+ * @param {string | string[]} [options.namespace] - REST catalog only.
292
+ * @param {string} [options.table] - REST catalog only.
293
+ * @param {string} [options.tableUrl] - File catalog only.
294
+ * @param {Resolver} [options.resolver]
295
+ * @param {Record<string, any>[]} options.records
296
+ * @param {number} [options.sortOrderId] - Sort order id to apply; defaults to the table default.
297
+ * @returns {Promise<TableMetadata>}
298
+ */
299
+ declare function icebergAppend({
300
+ catalog,
301
+ namespace,
302
+ table,
303
+ tableUrl,
304
+ resolver,
305
+ records,
306
+ sortOrderId
307
+ }: {
308
+ catalog: Catalog;
309
+ namespace?: string | string[] | undefined;
310
+ table?: string | undefined;
311
+ tableUrl?: string | undefined;
312
+ resolver?: Resolver | undefined;
313
+ records: Record<string, any>[];
314
+ sortOrderId?: number | undefined;
315
+ }): Promise<TableMetadata>;
316
+ /**
317
+ * Create a new table. REST: delegates to the catalog's create endpoint.
318
+ * File: writes the initial `v1.metadata.json` and `version-hint.text` under
319
+ * `tableUrl` via `catalog.resolver`.
320
+ *
321
+ * @param {object} options
322
+ * @param {Catalog} options.catalog
323
+ * @param {string | string[]} [options.namespace] - REST catalog only.
324
+ * @param {string} [options.table] - REST catalog only.
325
+ * @param {string} [options.tableUrl] - File catalog only; also passed as `location` for REST.
326
+ * @param {Schema} [options.schema]
327
+ * @param {PartitionSpec} [options.partitionSpec]
328
+ * @param {SortOrder} [options.sortOrder]
329
+ * @param {Record<string, string>} [options.properties]
330
+ * @param {2 | 3} [options.formatVersion] - File catalog only.
331
+ * @param {boolean} [options.stageCreate] - REST catalog only.
332
+ * @returns {Promise<TableMetadata>}
333
+ */
334
+ declare function icebergCreateTable({
335
+ catalog,
336
+ namespace,
337
+ table,
338
+ tableUrl,
339
+ schema,
340
+ partitionSpec,
341
+ sortOrder,
342
+ properties,
343
+ formatVersion,
344
+ stageCreate
345
+ }: {
346
+ catalog: Catalog;
347
+ namespace?: string | string[] | undefined;
348
+ table?: string | undefined;
349
+ tableUrl?: string | undefined;
350
+ schema?: Schema | undefined;
351
+ partitionSpec?: PartitionSpec | undefined;
352
+ sortOrder?: SortOrder | undefined;
353
+ properties?: Record<string, string> | undefined;
354
+ formatVersion?: 2 | 3 | undefined;
355
+ stageCreate?: boolean | undefined;
356
+ }): Promise<TableMetadata>;
357
+ /**
358
+ * Iceberg REST Catalog client.
359
+ *
360
+ * Plain async functions over a stateless context object — no classes.
361
+ * The catalog client never imports from the read path; callers glue the
362
+ * two together by passing `metadata` and `metadata.location` from
363
+ * `restCatalogLoadTable` into `icebergRead`.
364
+ *
365
+ * @import {LoadTableResponse, PartitionSpec, RestCatalogContext, Schema, SortOrder, StorageCredential, TableIdentifier, TableMetadata, TableRequirement, TableUpdate} from '../../src/types.js'
366
+ */
367
+ /**
368
+ * Connect to a REST catalog by fetching `/v1/config`.
369
+ * Returns a frozen context object that holds the prefix, defaults, overrides
370
+ * and the user-supplied requestInit (for auth) for use in subsequent calls.
371
+ *
372
+ * @param {object} options
373
+ * @param {string} options.url - catalog base URL, with or without trailing slash
374
+ * @param {string} [options.warehouse] - optional warehouse query param sent to /v1/config
375
+ * @param {RequestInit} [options.requestInit] - fetch options (e.g. Authorization header)
376
+ * @returns {Promise<RestCatalogContext>}
377
+ */
378
+ declare function restCatalogConnect({
379
+ url,
380
+ warehouse,
381
+ requestInit
382
+ }: {
383
+ url: string;
384
+ warehouse?: string | undefined;
385
+ requestInit?: RequestInit | undefined;
386
+ }): Promise<RestCatalogContext>;
387
+ /**
388
+ * Load a single table. Returns the inline TableMetadata, the metadata
389
+ * file location, and any per-table config the server returned.
390
+ *
391
+ * The returned `metadata` and `metadata.location` can be passed directly
392
+ * into `icebergRead({ tableUrl: metadata.location, metadata })`.
393
+ *
394
+ * @param {RestCatalogContext} ctx
395
+ * @param {object} options
396
+ * @param {string | string[]} options.namespace
397
+ * @param {string} options.table
398
+ * @returns {Promise<LoadTableResponse>}
399
+ */
400
+ declare function restCatalogLoadTable(ctx: RestCatalogContext, {
401
+ namespace,
402
+ table
403
+ }: {
404
+ namespace: string | string[];
405
+ table: string;
406
+ }): Promise<LoadTableResponse>;
407
+ /**
408
+ * Wrap a `Resolver` so reads of the same path share one HTTP fetch and
409
+ * subsequent range reads share one in-memory buffer. Writes and deletes
410
+ * through the same resolver invalidate the cache entry for their path on
411
+ * success, so a commit pipeline (`icebergAppend` → reload metadata) sees the
412
+ * new bytes without manual invalidation.
413
+ *
414
+ * - `reader(path)` is memoized by path. The returned buffer is passed through
415
+ * `cachedAsyncBuffer` so range reads within a single file are also deduped.
416
+ * `byteLength` is a fetch hint and not part of the cache key — the bytes
417
+ * are the same either way.
418
+ * - `writer(path).finish()` invalidates the cache entry only when the
419
+ * underlying finish resolves; a rejected finish (e.g. an `If-None-Match: *`
420
+ * collision returning 412/409) leaves the cached bytes intact, since the
421
+ * server-side object hasn't changed.
422
+ * - `deleter(path)` invalidates the cache entry on success.
423
+ * - `writer` and `deleter` are omitted when the base resolver omits them, so
424
+ * wrapping a read-only resolver still yields a read-only resolver.
425
+ *
426
+ * Iceberg writes are almost entirely create-new-path (data parquets, manifest
427
+ * avros, snapshot avros, vN.metadata.json all live at fresh paths per
428
+ * commit). Only `version-hint.text` and equivalent catalog-pointer files are
429
+ * truly mutated in place, so path-level invalidation is sufficient to keep
430
+ * single-process write-then-read pipelines consistent.
431
+ *
432
+ * Cross-process freshness is out of scope: a reader in process B does not
433
+ * see writes committed by process A through a different cache. Use a
434
+ * short-lived resolver, an external coordination layer, or wrap with TTL /
435
+ * conditional-GET revalidation if you need that.
436
+ *
437
+ * @param {Resolver} base
438
+ * @returns {Resolver}
439
+ */
440
+ declare function cachingResolver(base: Resolver): Resolver;
441
+ export { cachingResolver, icebergAppend, icebergCreateTable, icebergManifests, restCatalogConnect, restCatalogLoadTable };