node-es-transformer 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -12
- package/dist/node-es-transformer.cjs.js +567 -127
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +548 -127
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +24 -2
- package/package.json +12 -7
package/README.md
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
# node-es-transformer
|
|
10
10
|
|
|
11
|
-
Stream-based library for ingesting and transforming large data files (CSV/
|
|
11
|
+
Stream-based library for ingesting and transforming large data files (NDJSON/CSV/Parquet/Arrow IPC) into Elasticsearch indices.
|
|
12
12
|
|
|
13
13
|
## Quick Start
|
|
14
14
|
|
|
@@ -36,7 +36,7 @@ See [Usage](#usage) for more examples.
|
|
|
36
36
|
|
|
37
37
|
## Why Use This?
|
|
38
38
|
|
|
39
|
-
If you need to ingest large CSV/
|
|
39
|
+
If you need to ingest large NDJSON/CSV/Parquet/Arrow IPC files (gigabytes) into Elasticsearch without running out of memory, this is the tool for you. Other solutions often run out of JS heap, hammer ES with too many requests, time out, or try to do everything in a single bulk request.
|
|
40
40
|
|
|
41
41
|
**When to use this:**
|
|
42
42
|
- Large file ingestion (20-30 GB tested)
|
|
@@ -156,6 +156,58 @@ transformer({
|
|
|
156
156
|
});
|
|
157
157
|
```
|
|
158
158
|
|
|
159
|
+
### Read Parquet from a file
|
|
160
|
+
|
|
161
|
+
```javascript
|
|
162
|
+
const transformer = require('node-es-transformer');
|
|
163
|
+
|
|
164
|
+
transformer({
|
|
165
|
+
fileName: 'users.parquet',
|
|
166
|
+
sourceFormat: 'parquet',
|
|
167
|
+
targetIndexName: 'users-index',
|
|
168
|
+
mappings: {
|
|
169
|
+
properties: {
|
|
170
|
+
id: { type: 'integer' },
|
|
171
|
+
first_name: { type: 'keyword' },
|
|
172
|
+
last_name: { type: 'keyword' },
|
|
173
|
+
full_name: { type: 'keyword' },
|
|
174
|
+
},
|
|
175
|
+
},
|
|
176
|
+
transform(row) {
|
|
177
|
+
return {
|
|
178
|
+
...row,
|
|
179
|
+
id: Number(row.id),
|
|
180
|
+
full_name: `${row.first_name} ${row.last_name}`,
|
|
181
|
+
};
|
|
182
|
+
},
|
|
183
|
+
});
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Read Arrow IPC from a file
|
|
187
|
+
|
|
188
|
+
```javascript
|
|
189
|
+
const transformer = require('node-es-transformer');
|
|
190
|
+
|
|
191
|
+
transformer({
|
|
192
|
+
fileName: 'users.arrow',
|
|
193
|
+
sourceFormat: 'arrow',
|
|
194
|
+
targetIndexName: 'users-index',
|
|
195
|
+
mappings: {
|
|
196
|
+
properties: {
|
|
197
|
+
id: { type: 'integer' },
|
|
198
|
+
first_name: { type: 'keyword' },
|
|
199
|
+
last_name: { type: 'keyword' },
|
|
200
|
+
},
|
|
201
|
+
},
|
|
202
|
+
transform(row) {
|
|
203
|
+
return {
|
|
204
|
+
...row,
|
|
205
|
+
id: Number(row.id),
|
|
206
|
+
};
|
|
207
|
+
},
|
|
208
|
+
});
|
|
209
|
+
```
|
|
210
|
+
|
|
159
211
|
### Infer mappings from CSV sample
|
|
160
212
|
|
|
161
213
|
```javascript
|
|
@@ -286,10 +338,14 @@ All options are passed to the main `transformer()` function.
|
|
|
286
338
|
|
|
287
339
|
Choose **one** of these sources:
|
|
288
340
|
|
|
289
|
-
- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json`
|
|
341
|
+
- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json`, `data/*.csv`, `data/*.parquet`, `data/*.arrow`).
|
|
290
342
|
- **`sourceIndexName`** (string): Source Elasticsearch index to reindex from.
|
|
291
343
|
- **`stream`** (Readable): Node.js readable stream to ingest from.
|
|
292
|
-
- **`sourceFormat`** (`'ndjson' | 'csv'`): Format for file/stream sources. Default: `'ndjson'`.
|
|
344
|
+
- **`sourceFormat`** (`'ndjson' | 'csv' | 'parquet' | 'arrow'`): Format for file/stream sources. Default: `'ndjson'`.
|
|
345
|
+
- `arrow` expects Arrow IPC file/stream payloads.
|
|
346
|
+
- `parquet` stream sources are currently buffered in memory before row iteration (file sources remain streaming by row cursor).
|
|
347
|
+
- `parquet` supports ZSTD-compressed files when running on Node.js 22+ (uses the built-in zlib zstd implementation).
|
|
348
|
+
- `parquet` INT64 values are normalized for JSON: safe-range values become numbers, larger values become strings.
|
|
293
349
|
- **`csvOptions`** (object): CSV parser options (delimiter, quote, columns, etc.) used when `sourceFormat: 'csv'`.
|
|
294
350
|
|
|
295
351
|
#### Client Configuration
|
|
@@ -305,7 +361,7 @@ Choose **one** of these sources:
|
|
|
305
361
|
|
|
306
362
|
- **`mappings`** (object): Elasticsearch document mappings for target index. If reindexing and not provided, mappings are copied from source index.
|
|
307
363
|
- **`mappingsOverride`** (boolean): When reindexing, apply `mappings` on top of source index mappings. Default: `false`.
|
|
308
|
-
- **`inferMappings`** (boolean): Infer mappings for `fileName` sources via `/_text_structure/find_structure`. Ignored when `mappings` is provided. If inference returns `ingest_pipeline`, it is created as `<targetIndexName>-inferred-pipeline` and applied as the index default pipeline (unless `pipeline` is explicitly set). Default: `false`.
|
|
364
|
+
- **`inferMappings`** (boolean): Infer mappings for `fileName` sources via `/_text_structure/find_structure`. Supported for `sourceFormat: 'ndjson'` and `sourceFormat: 'csv'` only. Ignored when `mappings` is provided. If inference returns `ingest_pipeline`, it is created as `<targetIndexName>-inferred-pipeline` and applied as the index default pipeline (unless `pipeline` is explicitly set). Default: `false`.
|
|
309
365
|
- **`inferMappingsOptions`** (object): Options for `/_text_structure/find_structure` (for example `sampleBytes`, `lines_to_sample`, `delimiter`, `quote`, `has_header_row`, `timeout`).
|
|
310
366
|
- **`deleteIndex`** (boolean): Delete target index if it exists before starting. Default: `false`.
|
|
311
367
|
- **`indexMappingTotalFieldsLimit`** (number): Field limit for target index (`index.mapping.total_fields.limit` setting).
|
|
@@ -330,9 +386,11 @@ When `inferMappings` is enabled, the target cluster must allow `/_text_structure
|
|
|
330
386
|
- **`skipHeader`** (boolean): Header skipping for file/stream sources.
|
|
331
387
|
- NDJSON: skips the first non-empty line
|
|
332
388
|
- CSV: skips the first data line only when `csvOptions.columns` does not consume headers
|
|
389
|
+
- Parquet/Arrow: ignored
|
|
333
390
|
- Default: `false`
|
|
334
391
|
- Applies only to `fileName`/`stream` sources
|
|
335
|
-
- **`verbose`** (boolean): Enable logging and progress bars. Default: `true`.
|
|
392
|
+
- **`verbose`** (boolean): Enable verbose logging and progress bars when using the built-in logger. Default: `true`.
|
|
393
|
+
- **`logger`** (object): Optional custom Pino-compatible logger. If omitted, the library creates an internal Pino logger (`name: node-es-transformer`) and uses `LOG_LEVEL` (if set) or `info`/`error` based on `verbose`.
|
|
336
394
|
|
|
337
395
|
### Return Value
|
|
338
396
|
|
|
@@ -345,16 +403,19 @@ The `transformer()` function returns a Promise that resolves to an object with:
|
|
|
345
403
|
- `'error'`: Error occurred
|
|
346
404
|
|
|
347
405
|
```javascript
|
|
406
|
+
const pino = require('pino');
|
|
407
|
+
const logger = pino({ name: 'my-app', level: process.env.LOG_LEVEL || 'info' });
|
|
408
|
+
|
|
348
409
|
const result = await transformer({
|
|
349
410
|
/* options */
|
|
350
411
|
});
|
|
351
412
|
|
|
352
413
|
result.events.on('complete', () => {
|
|
353
|
-
|
|
414
|
+
logger.info('Ingestion complete');
|
|
354
415
|
});
|
|
355
416
|
|
|
356
417
|
result.events.on('error', err => {
|
|
357
|
-
|
|
418
|
+
logger.error({ err }, 'Ingestion failed');
|
|
358
419
|
});
|
|
359
420
|
```
|
|
360
421
|
|
|
@@ -392,20 +453,23 @@ See [examples/typescript-example.ts](examples/typescript-example.ts) for more ex
|
|
|
392
453
|
Always handle errors when using the library:
|
|
393
454
|
|
|
394
455
|
```javascript
|
|
456
|
+
const pino = require('pino');
|
|
457
|
+
const logger = pino({ name: 'my-app', level: process.env.LOG_LEVEL || 'info' });
|
|
458
|
+
|
|
395
459
|
transformer({
|
|
396
460
|
/* options */
|
|
397
461
|
})
|
|
398
|
-
.then(() =>
|
|
399
|
-
.catch(err =>
|
|
462
|
+
.then(() => logger.info('Success'))
|
|
463
|
+
.catch(err => logger.error({ err }, 'Transformer failed'));
|
|
400
464
|
|
|
401
465
|
// Or with async/await
|
|
402
466
|
try {
|
|
403
467
|
await transformer({
|
|
404
468
|
/* options */
|
|
405
469
|
});
|
|
406
|
-
|
|
470
|
+
logger.info('Success');
|
|
407
471
|
} catch (err) {
|
|
408
|
-
|
|
472
|
+
logger.error({ err }, 'Transformer failed');
|
|
409
473
|
}
|
|
410
474
|
```
|
|
411
475
|
|