node-s3tables 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1 +1,396 @@
1
1
  # node-s3tables
2
+
3
+ A Node.js library for interacting with AWS S3 Tables using the Iceberg REST API.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install node-s3tables
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```javascript
14
+ import {
15
+ getMetadata,
16
+ addSchema,
17
+ addPartitionSpec,
18
+ addManifest,
19
+ addDataFiles,
20
+ setCurrentCommit,
21
+ } from 'node-s3tables';
22
+
23
+ // Get table metadata
24
+ const metadata = await getMetadata({
25
+ tableArn:
26
+ 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket/table/my-table-id',
27
+ });
28
+
29
+ // Add a new schema
30
+ await addSchema({
31
+ tableBucketARN: 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket',
32
+ namespace: 'my_namespace',
33
+ name: 'my_table',
34
+ schemaId: 2,
35
+ fields: [
36
+ { id: 1, name: 'id', required: true, type: 'long' },
37
+ { id: 2, name: 'name', required: false, type: 'string' },
38
+ ],
39
+ });
40
+ ```
41
+
42
+ ## API Reference
43
+
44
+ ### getMetadata(params)
45
+
46
+ Retrieves Iceberg metadata for an S3 table.
47
+
48
+ **Parameters:**
49
+
50
+ - `params.tableArn` (string) - The ARN of the table
51
+ - OR `params.tableBucketARN` (string) + `params.namespace` (string) + `params.name` (string)
52
+ - `params.region` (string, optional) and `params.credentials` (AwsCredentialIdentity, optional) - AWS region and credentials
53
+
54
+ **Returns:** `Promise<IcebergMetadata>`
55
+
56
+ ```javascript
57
+ // Using table ARN
58
+ const metadata = await getMetadata({
59
+ tableArn:
60
+ 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket/table/my-table-id',
61
+ });
62
+
63
+ // Using bucket ARN + namespace + name
64
+ const metadata = await getMetadata({
65
+ tableBucketARN: 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket',
66
+ namespace: 'my_namespace',
67
+ name: 'my_table',
68
+ });
69
+ ```
70
+
71
+ ### addSchema(params)
72
+
73
+ Adds a new schema to an S3 table and sets it as current.
74
+
75
+ **Parameters:**
76
+
77
+ - `params.tableBucketARN` (string) - The ARN of the table bucket
78
+ - `params.namespace` (string) - The namespace name
79
+ - `params.name` (string) - The table name
80
+ - `params.schemaId` (number) - The new schema ID
81
+ - `params.fields` (IcebergSchemaField[]) - Array of schema fields
82
+ - `params.credentials` (AwsCredentialIdentity, optional) - AWS credentials
83
+
84
+ **Returns:** `Promise<JSONObject>`
85
+
86
+ ```javascript
87
+ await addSchema({
88
+ tableBucketARN: 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket',
89
+ namespace: 'sales',
90
+ name: 'daily_sales',
91
+ schemaId: 2,
92
+ fields: [
93
+ { id: 1, name: 'sale_date', required: false, type: 'date' },
94
+ { id: 2, name: 'product_category', required: false, type: 'string' },
95
+ { id: 3, name: 'sales_amount', required: false, type: 'double' },
96
+ ],
97
+ });
98
+ ```
99
+
100
+ ### addPartitionSpec(params)
101
+
102
+ Adds a new partition specification to an S3 table and sets it as default.
103
+
104
+ **Parameters:**
105
+
106
+ - `params.tableBucketARN` (string) - The ARN of the table bucket
107
+ - `params.namespace` (string) - The namespace name
108
+ - `params.name` (string) - The table name
109
+ - `params.specId` (number) - The new partition spec ID
110
+ - `params.fields` (IcebergPartitionField[]) - Array of partition fields
111
+ - `params.credentials` (AwsCredentialIdentity, optional) - AWS credentials
112
+
113
+ **Returns:** `Promise<JSONObject>`
114
+
115
+ ```javascript
116
+ await addPartitionSpec({
117
+ tableBucketARN: 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket',
118
+ namespace: 'sales',
119
+ name: 'daily_sales',
120
+ specId: 1,
121
+ fields: [
122
+ {
123
+ 'field-id': 1000,
124
+ name: 'sale_date_day',
125
+ 'source-id': 1,
126
+ transform: 'day',
127
+ },
128
+ {
129
+ 'field-id': 1001,
130
+ name: 'product_category',
131
+ 'source-id': 2,
132
+ transform: 'identity',
133
+ },
134
+ ],
135
+ });
136
+ ```
137
+
138
+ ### addManifest(params)
139
+
140
+ Creates a manifest file for data files and returns a manifest list record.
141
+
142
+ **Parameters:**
143
+
144
+ - `params.credentials` (AwsCredentialIdentity, optional) - AWS credentials
145
+ - `params.region` (string) - AWS region
146
+ - `params.metadata` (IcebergMetadata) - Table metadata
147
+ - `params.schemaId` (number) - Schema ID to use
148
+ - `params.specId` (number) - Partition spec ID to use
149
+ - `params.snapshotId` (bigint) - Snapshot ID
150
+ - `params.sequenceNumber` (bigint) - Sequence number
151
+ - `params.files` (AddFile[]) - Array of data files
152
+
153
+ **Returns:** `Promise<ManifestListRecord>`
154
+
155
+ ```javascript
156
+ const manifestRecord = await addManifest({
157
+ region: 'us-west-2',
158
+ metadata: tableMetadata,
159
+ schemaId: 2,
160
+ specId: 1,
161
+ snapshotId: 4183020680887155442n,
162
+ sequenceNumber: 1n,
163
+ files: [
164
+ {
165
+ file: 's3://my-bucket/data/sales-2024-01-01.parquet',
166
+ partitions: { sale_date_day: '2024-01-01' },
167
+ recordCount: 1000n,
168
+ fileSize: 52428n,
169
+ },
170
+ ],
171
+ });
172
+ ```
173
+
174
+ ### addDataFiles(params)
175
+
176
+ Adds data files to an S3 table by creating a new snapshot.
177
+
178
+ **Parameters:**
179
+
180
+ - `params.tableBucketARN` (string) - The ARN of the table bucket
181
+ - `params.namespace` (string) - The namespace name
182
+ - `params.name` (string) - The table name
183
+ - `params.lists` (AddFileList[]) - Array of file lists to add
184
+ - `params.credentials` (AwsCredentialIdentity, optional) - AWS credentials
185
+
186
+ **Returns:** `Promise<JSONObject>`
187
+
188
+ ```javascript
189
+ await addDataFiles({
190
+ tableBucketARN: 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket',
191
+ namespace: 'sales',
192
+ name: 'daily_sales',
193
+ lists: [
194
+ {
195
+ specId: 1,
196
+ schemaId: 2,
197
+ files: [
198
+ {
199
+ file: 's3://my-bucket/data/sales-2024-01-01.parquet',
200
+ partitions: { sale_date_day: '2024-01-01' },
201
+ recordCount: 1000n,
202
+ fileSize: 52428n,
203
+ },
204
+ ],
205
+ },
206
+ ],
207
+ });
208
+ ```
209
+
210
+ ### setCurrentCommit(params)
211
+
212
+ Sets the current commit/snapshot for an S3 table.
213
+
214
+ **Parameters:**
215
+
216
+ - `params.tableBucketARN` (string) - The ARN of the table bucket
217
+ - `params.namespace` (string) - The namespace name
218
+ - `params.name` (string) - The table name
219
+ - `params.snapshotId` (bigint) - The snapshot ID to set as current
220
+ - `params.credentials` (AwsCredentialIdentity, optional) - AWS credentials
221
+
222
+ **Returns:** `Promise<JSONObject>`
223
+
224
+ ```javascript
225
+ await setCurrentCommit({
226
+ tableBucketARN: 'arn:aws:s3tables:us-west-2:123456789012:bucket/my-bucket',
227
+ namespace: 'sales',
228
+ name: 'daily_sales',
229
+ snapshotId: 4183020680887155442n,
230
+ });
231
+ ```
232
+
233
+ ## Type Definitions
234
+
235
+ ### AddFileList
236
+
237
+ ```typescript
238
+ interface AddFileList {
239
+ specId: number;
240
+ schemaId: number;
241
+ files: AddFile[];
242
+ }
243
+ ```
244
+
245
+ ### AddFile
246
+
247
+ ```typescript
248
+ interface AddFile {
249
+ file: string;
250
+ partitions: PartitionRecord;
251
+ fileSize: bigint;
252
+ recordCount: bigint;
253
+ columnSizes?: Record<string, bigint> | null;
254
+ valueCounts?: Record<string, bigint> | null;
255
+ nullValueCounts?: Record<string, bigint> | null;
256
+ nanValueCounts?: Record<string, bigint> | null;
257
+ lowerBounds?: Record<string, Buffer> | null;
258
+ upperBounds?: Record<string, Buffer> | null;
259
+ keyMetadata?: Buffer | null;
260
+ splitOffsets?: bigint[] | null;
261
+ equalityIds?: number[] | null;
262
+ sortOrderId?: number | null;
263
+ }
264
+ ```
265
+
266
+ ### IcebergSchemaField
267
+
268
+ ```typescript
269
+ interface IcebergSchemaField {
270
+ id: number;
271
+ name: string;
272
+ type: IcebergType;
273
+ required: boolean;
274
+ doc?: string;
275
+ }
276
+ ```
277
+
278
+ ### IcebergPartitionField
279
+
280
+ ```typescript
281
+ interface IcebergPartitionField {
282
+ 'field-id': number;
283
+ name: string;
284
+ 'source-id': number;
285
+ transform: IcebergTransform;
286
+ }
287
+ ```
288
+
289
+ ### IcebergType
290
+
291
+ Supported primitive types:
292
+
293
+ - `'boolean'`, `'int'`, `'long'`, `'float'`, `'double'`
294
+ - `'date'`, `'time'`, `'timestamp'`, `'timestamptz'`
295
+ - `'string'`, `'uuid'`, `'binary'`
296
+ - `'decimal(precision,scale)'`, `'fixed[length]'`
297
+
298
+ Complex types:
299
+
300
+ - List: `{ type: 'list', element: IcebergType, 'element-required': boolean }`
301
+ - Map: `{ type: 'map', key: IcebergType, value: IcebergType, 'value-required': boolean }`
302
+ - Struct: `{ type: 'struct', fields: IcebergSchemaField[] }`
303
+
304
+ ### IcebergTransform
305
+
306
+ Supported partition transforms:
307
+
308
+ - `'identity'` - Use the field value as-is
309
+ - `'year'`, `'month'`, `'day'`, `'hour'` - Date/time transforms
310
+ - `'bucket[N]'` - Hash bucket with N buckets
311
+ - `'truncate[N]'` - Truncate strings to N characters
312
+
313
+ ## Testing
314
+
315
+ ### Prerequisites
316
+
317
+ The tests require AWS credentials and S3 Tables resources. Set up the following environment variables in a `.env` file:
318
+
319
+ ```bash
320
+ TABLE_BUCKET_ARN=arn:aws:s3tables:us-west-2:123456789012:bucket/your-test-bucket
321
+ CATALOG_ID=123456789012:s3tablescatalog/your-test-bucket
322
+ OUTPUT_BUCKET=your-output-bucket
323
+ ```
324
+
325
+ ### AWS Service Calls and Permissions
326
+
327
+ The tests make calls to multiple AWS services and require the following permissions:
328
+
329
+ **S3 Tables:**
330
+
331
+ - `s3tables:CreateNamespace`
332
+ - `s3tables:DeleteNamespace`
333
+ - `s3tables:CreateTable`
334
+ - `s3tables:DeleteTable`
335
+ - `s3tables:GetTableMetadata`
336
+ - `s3tables:UpdateTableMetadata`
337
+
338
+ **S3:**
339
+
340
+ - `s3:PutObject` (for uploading test Parquet files)
341
+ - `s3:GetObject` (for reading manifest files)
342
+
343
+ **Lake Formation:**
344
+
345
+ - `lakeformation:AddLFTagsToResource` (adds `AccessLevel: Public` tag to namespaces)
346
+
347
+ **Athena:**
348
+
349
+ - `athena:StartQueryExecution`
350
+ - `athena:GetQueryExecution`
351
+ - `athena:GetQueryResults`
352
+
353
+ **Lake Formation Setup:**
354
+ The tests expect a Lake Formation tag with key `AccessLevel` and value `Public` to exist in your account. This tag is automatically applied to test namespaces to allow Athena query permissions.
355
+
356
+ ### Test Dependencies
357
+
358
+ The test suite uses additional dependencies for creating test data:
359
+
360
+ - `@aws-sdk/client-athena` - For running Athena queries in tests
361
+ - `@aws-sdk/client-lakeformation` - For Lake Formation permissions
362
+ - `parquetjs` - For creating test Parquet files
363
+ - `dotenv-cli` - For loading environment variables
364
+
365
+ ### Running Tests
366
+
367
+ Run the test suite:
368
+
369
+ ```bash
370
+ npm test
371
+ ```
372
+
373
+ Run tests with coverage:
374
+
375
+ ```bash
376
+ npm run test:cover
377
+ ```
378
+
379
+ Run a single test file:
380
+
381
+ ```bash
382
+ npm run test:single test/create.test.ts
383
+ ```
384
+
385
+ ## Configuration
386
+
387
+ The library uses the AWS SDK for authentication. Configure credentials using:
388
+
389
+ - Environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`)
390
+ - AWS credentials file (`~/.aws/credentials`)
391
+ - IAM roles (when running on EC2/Lambda)
392
+ - Or pass credentials directly to functions
393
+
394
+ ## License
395
+
396
+ MIT
package/dist/index.d.ts CHANGED
@@ -1,5 +1,33 @@
1
1
  import { AwsCredentialIdentity } from '@aws-sdk/types';
2
- import { S3TablesClientConfig } from '@aws-sdk/client-s3tables';
2
+
3
+ type RawValue = string | number | bigint | Buffer | null;
4
+ type PartitionRecord = Record<string, RawValue>;
5
+ interface PartitionSummary {
6
+ contains_null: boolean;
7
+ contains_nan?: boolean | null;
8
+ lower_bound?: Buffer | null;
9
+ upper_bound?: Buffer | null;
10
+ }
11
+ declare enum ListContent {
12
+ DATA = 0,
13
+ DELETES = 1
14
+ }
15
+ interface ManifestListRecord {
16
+ manifest_path: string;
17
+ manifest_length: bigint;
18
+ partition_spec_id: number;
19
+ content: ListContent;
20
+ sequence_number: bigint;
21
+ min_sequence_number: bigint;
22
+ added_snapshot_id: bigint;
23
+ added_data_files_count: number;
24
+ existing_data_files_count: number;
25
+ deleted_data_files_count: number;
26
+ added_rows_count: bigint;
27
+ existing_rows_count: bigint;
28
+ deleted_rows_count: bigint;
29
+ partitions?: PartitionSummary[] | null;
30
+ }
3
31
 
4
32
  type IcebergTransform = 'identity' | 'year' | 'month' | 'day' | 'hour' | `bucket[${number}]` | `truncate[${number}]`;
5
33
  interface IcebergPartitionField {
@@ -39,15 +67,61 @@ interface IcebergPartitionSpec {
39
67
  'spec-id': number;
40
68
  fields: IcebergPartitionField[];
41
69
  }
70
+ interface IcebergSnapshot {
71
+ 'snapshot-id': bigint | number;
72
+ 'parent-snapshot-id'?: bigint | number;
73
+ 'sequence-number': number;
74
+ 'timestamp-ms': number;
75
+ 'manifest-list': string;
76
+ summary: Record<string, string>;
77
+ 'schema-id'?: number;
78
+ }
42
79
  interface IcebergMetadata {
43
80
  'last-column-id': number;
44
81
  'current-schema-id': number;
45
82
  schemas: IcebergSchema[];
83
+ snapshots: IcebergSnapshot[];
46
84
  'default-spec-id': number;
47
85
  'partition-specs': IcebergPartitionSpec[];
48
86
  'last-partition-id': number;
49
- 'current-snapshot-id': number;
87
+ 'current-snapshot-id': bigint | number;
88
+ location: string;
89
+ }
90
+
91
+ interface AddFile {
92
+ file: string;
93
+ partitions: PartitionRecord;
94
+ fileSize: bigint;
95
+ recordCount: bigint;
96
+ columnSizes?: Record<string, bigint> | null | undefined;
97
+ valueCounts?: Record<string, bigint> | null | undefined;
98
+ nullValueCounts?: Record<string, bigint> | null | undefined;
99
+ nanValueCounts?: Record<string, bigint> | null | undefined;
100
+ lowerBounds?: Record<string, Buffer> | null | undefined;
101
+ upperBounds?: Record<string, Buffer> | null | undefined;
102
+ keyMetadata?: Buffer | null | undefined;
103
+ splitOffsets?: bigint[] | null | undefined;
104
+ equalityIds?: number[] | null | undefined;
105
+ sortOrderId?: number | null | undefined;
50
106
  }
107
+ interface AddManifestParams {
108
+ credentials?: AwsCredentialIdentity | undefined;
109
+ region: string;
110
+ metadata: IcebergMetadata;
111
+ schemaId: number;
112
+ specId: number;
113
+ snapshotId: bigint;
114
+ sequenceNumber: bigint;
115
+ files: AddFile[];
116
+ }
117
+ declare function addManifest(params: AddManifestParams): Promise<ManifestListRecord>;
118
+
119
+ type JSONPrimitive = string | number | boolean | null | bigint | undefined;
120
+ type JSONValue = JSONPrimitive | JSONObject | JSONArray;
121
+ interface JSONObject {
122
+ [key: string]: JSONValue;
123
+ }
124
+ type JSONArray = JSONValue[];
51
125
 
52
126
  type TableLocation = {
53
127
  tableArn: string;
@@ -57,7 +131,8 @@ type TableLocation = {
57
131
  name: string;
58
132
  };
59
133
  type GetMetadataParams = TableLocation & {
60
- config?: S3TablesClientConfig;
134
+ region?: string;
135
+ credentials?: AwsCredentialIdentity;
61
136
  };
62
137
  declare function getMetadata(params: GetMetadataParams): Promise<IcebergMetadata>;
63
138
  interface AddSchemaParams {
@@ -68,7 +143,7 @@ interface AddSchemaParams {
68
143
  schemaId: number;
69
144
  fields: IcebergSchemaField[];
70
145
  }
71
- declare function addSchema(params: AddSchemaParams): Promise<string>;
146
+ declare function addSchema(params: AddSchemaParams): Promise<JSONObject>;
72
147
  interface AddPartitionSpecParams {
73
148
  credentials?: AwsCredentialIdentity;
74
149
  tableBucketARN: string;
@@ -77,13 +152,38 @@ interface AddPartitionSpecParams {
77
152
  specId: number;
78
153
  fields: IcebergPartitionField[];
79
154
  }
80
- declare function addPartitionSpec(params: AddPartitionSpecParams): Promise<string>;
155
+ declare function addPartitionSpec(params: AddPartitionSpecParams): Promise<JSONObject>;
156
+
157
+ interface AddFileList {
158
+ specId: number;
159
+ schemaId: number;
160
+ files: AddFile[];
161
+ }
162
+ interface AddDataFilesParams {
163
+ credentials?: AwsCredentialIdentity;
164
+ tableBucketARN: string;
165
+ namespace: string;
166
+ name: string;
167
+ lists: AddFileList[];
168
+ }
169
+ declare function addDataFiles(params: AddDataFilesParams): Promise<JSONObject>;
170
+ interface SetCurrentCommitParams {
171
+ credentials?: AwsCredentialIdentity;
172
+ tableBucketARN: string;
173
+ namespace: string;
174
+ name: string;
175
+ snapshotId: bigint;
176
+ }
177
+ declare function setCurrentCommit(params: SetCurrentCommitParams): Promise<JSONObject>;
81
178
 
82
179
  declare const _default: {
83
180
  getMetadata: typeof getMetadata;
84
181
  addSchema: typeof addSchema;
85
182
  addPartitionSpec: typeof addPartitionSpec;
183
+ addManifest: typeof addManifest;
184
+ addDataFiles: typeof addDataFiles;
185
+ setCurrentCommit: typeof setCurrentCommit;
86
186
  };
87
187
 
88
- export { addPartitionSpec, addSchema, _default as default, getMetadata };
89
- export type { AddPartitionSpecParams, AddSchemaParams, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergTransform, IcebergType, TableLocation };
188
+ export { addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, setCurrentCommit };
189
+ export type { AddDataFilesParams, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergTransform, IcebergType, SetCurrentCommitParams, TableLocation };