node-es-transformer 1.0.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -13
- package/dist/node-es-transformer.cjs.js +722 -128
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +704 -129
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +81 -2
- package/package.json +12 -6
package/README.md
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
# node-es-transformer
|
|
10
10
|
|
|
11
|
-
Stream-based library for ingesting and transforming large data files (CSV/
|
|
11
|
+
Stream-based library for ingesting and transforming large data files (NDJSON/CSV/Parquet/Arrow IPC) into Elasticsearch indices.
|
|
12
12
|
|
|
13
13
|
## Quick Start
|
|
14
14
|
|
|
@@ -36,7 +36,7 @@ See [Usage](#usage) for more examples.
|
|
|
36
36
|
|
|
37
37
|
## Why Use This?
|
|
38
38
|
|
|
39
|
-
If you need to ingest large CSV/
|
|
39
|
+
If you need to ingest large NDJSON/CSV/Parquet/Arrow IPC files (gigabytes) into Elasticsearch without running out of memory, this is the tool for you. Other solutions often run out of JS heap, hammer ES with too many requests, time out, or try to do everything in a single bulk request.
|
|
40
40
|
|
|
41
41
|
**When to use this:**
|
|
42
42
|
- Large file ingestion (20-30 GB tested)
|
|
@@ -96,7 +96,7 @@ yarn add node-es-transformer
|
|
|
96
96
|
|
|
97
97
|
## Usage
|
|
98
98
|
|
|
99
|
-
### Read from a file
|
|
99
|
+
### Read NDJSON from a file
|
|
100
100
|
|
|
101
101
|
```javascript
|
|
102
102
|
const transformer = require('node-es-transformer');
|
|
@@ -129,6 +129,102 @@ transformer({
|
|
|
129
129
|
});
|
|
130
130
|
```
|
|
131
131
|
|
|
132
|
+
### Read CSV from a file
|
|
133
|
+
|
|
134
|
+
```javascript
|
|
135
|
+
const transformer = require('node-es-transformer');
|
|
136
|
+
|
|
137
|
+
transformer({
|
|
138
|
+
fileName: 'users.csv',
|
|
139
|
+
sourceFormat: 'csv',
|
|
140
|
+
targetIndexName: 'users-index',
|
|
141
|
+
mappings: {
|
|
142
|
+
properties: {
|
|
143
|
+
id: { type: 'integer' },
|
|
144
|
+
first_name: { type: 'keyword' },
|
|
145
|
+
last_name: { type: 'keyword' },
|
|
146
|
+
full_name: { type: 'keyword' },
|
|
147
|
+
},
|
|
148
|
+
},
|
|
149
|
+
transform(row) {
|
|
150
|
+
return {
|
|
151
|
+
...row,
|
|
152
|
+
id: Number(row.id),
|
|
153
|
+
full_name: `${row.first_name} ${row.last_name}`,
|
|
154
|
+
};
|
|
155
|
+
},
|
|
156
|
+
});
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Read Parquet from a file
|
|
160
|
+
|
|
161
|
+
```javascript
|
|
162
|
+
const transformer = require('node-es-transformer');
|
|
163
|
+
|
|
164
|
+
transformer({
|
|
165
|
+
fileName: 'users.parquet',
|
|
166
|
+
sourceFormat: 'parquet',
|
|
167
|
+
targetIndexName: 'users-index',
|
|
168
|
+
mappings: {
|
|
169
|
+
properties: {
|
|
170
|
+
id: { type: 'integer' },
|
|
171
|
+
first_name: { type: 'keyword' },
|
|
172
|
+
last_name: { type: 'keyword' },
|
|
173
|
+
full_name: { type: 'keyword' },
|
|
174
|
+
},
|
|
175
|
+
},
|
|
176
|
+
transform(row) {
|
|
177
|
+
return {
|
|
178
|
+
...row,
|
|
179
|
+
id: Number(row.id),
|
|
180
|
+
full_name: `${row.first_name} ${row.last_name}`,
|
|
181
|
+
};
|
|
182
|
+
},
|
|
183
|
+
});
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Read Arrow IPC from a file
|
|
187
|
+
|
|
188
|
+
```javascript
|
|
189
|
+
const transformer = require('node-es-transformer');
|
|
190
|
+
|
|
191
|
+
transformer({
|
|
192
|
+
fileName: 'users.arrow',
|
|
193
|
+
sourceFormat: 'arrow',
|
|
194
|
+
targetIndexName: 'users-index',
|
|
195
|
+
mappings: {
|
|
196
|
+
properties: {
|
|
197
|
+
id: { type: 'integer' },
|
|
198
|
+
first_name: { type: 'keyword' },
|
|
199
|
+
last_name: { type: 'keyword' },
|
|
200
|
+
},
|
|
201
|
+
},
|
|
202
|
+
transform(row) {
|
|
203
|
+
return {
|
|
204
|
+
...row,
|
|
205
|
+
id: Number(row.id),
|
|
206
|
+
};
|
|
207
|
+
},
|
|
208
|
+
});
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Infer mappings from CSV sample
|
|
212
|
+
|
|
213
|
+
```javascript
|
|
214
|
+
const transformer = require('node-es-transformer');
|
|
215
|
+
|
|
216
|
+
transformer({
|
|
217
|
+
fileName: 'users.csv',
|
|
218
|
+
sourceFormat: 'csv',
|
|
219
|
+
targetIndexName: 'users-index',
|
|
220
|
+
inferMappings: true,
|
|
221
|
+
inferMappingsOptions: {
|
|
222
|
+
sampleBytes: 200000,
|
|
223
|
+
lines_to_sample: 2000,
|
|
224
|
+
},
|
|
225
|
+
});
|
|
226
|
+
```
|
|
227
|
+
|
|
132
228
|
### Read from another index
|
|
133
229
|
|
|
134
230
|
```javascript
|
|
@@ -242,9 +338,13 @@ All options are passed to the main `transformer()` function.
|
|
|
242
338
|
|
|
243
339
|
Choose **one** of these sources:
|
|
244
340
|
|
|
245
|
-
- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json`).
|
|
341
|
+
- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json`, `data/*.csv`, `data/*.parquet`, `data/*.arrow`).
|
|
246
342
|
- **`sourceIndexName`** (string): Source Elasticsearch index to reindex from.
|
|
247
343
|
- **`stream`** (Readable): Node.js readable stream to ingest from.
|
|
344
|
+
- **`sourceFormat`** (`'ndjson' | 'csv' | 'parquet' | 'arrow'`): Format for file/stream sources. Default: `'ndjson'`.
|
|
345
|
+
- `arrow` expects Arrow IPC file/stream payloads.
|
|
346
|
+
- `parquet` stream sources are currently buffered in memory before row iteration (file sources remain streaming by row cursor).
|
|
347
|
+
- **`csvOptions`** (object): CSV parser options (delimiter, quote, columns, etc.) used when `sourceFormat: 'csv'`.
|
|
248
348
|
|
|
249
349
|
#### Client Configuration
|
|
250
350
|
|
|
@@ -259,10 +359,14 @@ Choose **one** of these sources:
|
|
|
259
359
|
|
|
260
360
|
- **`mappings`** (object): Elasticsearch document mappings for target index. If reindexing and not provided, mappings are copied from source index.
|
|
261
361
|
- **`mappingsOverride`** (boolean): When reindexing, apply `mappings` on top of source index mappings. Default: `false`.
|
|
362
|
+
- **`inferMappings`** (boolean): Infer mappings for `fileName` sources via `/_text_structure/find_structure`. Supported for `sourceFormat: 'ndjson'` and `sourceFormat: 'csv'` only. Ignored when `mappings` is provided. If inference returns `ingest_pipeline`, it is created as `<targetIndexName>-inferred-pipeline` and applied as the index default pipeline (unless `pipeline` is explicitly set). Default: `false`.
|
|
363
|
+
- **`inferMappingsOptions`** (object): Options for `/_text_structure/find_structure` (for example `sampleBytes`, `lines_to_sample`, `delimiter`, `quote`, `has_header_row`, `timeout`).
|
|
262
364
|
- **`deleteIndex`** (boolean): Delete target index if it exists before starting. Default: `false`.
|
|
263
365
|
- **`indexMappingTotalFieldsLimit`** (number): Field limit for target index (`index.mapping.total_fields.limit` setting).
|
|
264
366
|
- **`pipeline`** (string): Elasticsearch ingest pipeline name to use during indexing.
|
|
265
367
|
|
|
368
|
+
When `inferMappings` is enabled, the target cluster must allow `/_text_structure/find_structure` (cluster privilege: `monitor_text_structure`). If inferred ingest pipelines are used, the target cluster must also allow creating ingest pipelines (`_ingest/pipeline`).
|
|
369
|
+
|
|
266
370
|
#### Performance Options
|
|
267
371
|
|
|
268
372
|
- **`bufferSize`** (number): Buffer size threshold in KBytes for bulk indexing. Default: `5120` (5 MB).
|
|
@@ -276,9 +380,15 @@ Choose **one** of these sources:
|
|
|
276
380
|
- Return array of documents to split one source into multiple targets
|
|
277
381
|
- Return `null`/`undefined` to skip document
|
|
278
382
|
- **`query`** (object): Elasticsearch [DSL query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) to filter source documents.
|
|
279
|
-
- **`splitRegex`** (RegExp): Line split regex for file/stream sources
|
|
280
|
-
- **`skipHeader`** (boolean):
|
|
281
|
-
-
|
|
383
|
+
- **`splitRegex`** (RegExp): Line split regex for file/stream sources when `sourceFormat` is `'ndjson'`. Default: `/\n/`.
|
|
384
|
+
- **`skipHeader`** (boolean): Header skipping for file/stream sources.
|
|
385
|
+
- NDJSON: skips the first non-empty line
|
|
386
|
+
- CSV: skips the first data line only when `csvOptions.columns` does not consume headers
|
|
387
|
+
- Parquet/Arrow: ignored
|
|
388
|
+
- Default: `false`
|
|
389
|
+
- Applies only to `fileName`/`stream` sources
|
|
390
|
+
- **`verbose`** (boolean): Enable verbose logging and progress bars when using the built-in logger. Default: `true`.
|
|
391
|
+
- **`logger`** (object): Optional custom Pino-compatible logger. If omitted, the library creates an internal Pino logger (`name: node-es-transformer`) and uses `LOG_LEVEL` (if set) or `info`/`error` based on `verbose`.
|
|
282
392
|
|
|
283
393
|
### Return Value
|
|
284
394
|
|
|
@@ -291,16 +401,19 @@ The `transformer()` function returns a Promise that resolves to an object with:
|
|
|
291
401
|
- `'error'`: Error occurred
|
|
292
402
|
|
|
293
403
|
```javascript
|
|
404
|
+
const pino = require('pino');
|
|
405
|
+
const logger = pino({ name: 'my-app', level: process.env.LOG_LEVEL || 'info' });
|
|
406
|
+
|
|
294
407
|
const result = await transformer({
|
|
295
408
|
/* options */
|
|
296
409
|
});
|
|
297
410
|
|
|
298
411
|
result.events.on('complete', () => {
|
|
299
|
-
|
|
412
|
+
logger.info('Ingestion complete');
|
|
300
413
|
});
|
|
301
414
|
|
|
302
415
|
result.events.on('error', err => {
|
|
303
|
-
|
|
416
|
+
logger.error({ err }, 'Ingestion failed');
|
|
304
417
|
});
|
|
305
418
|
```
|
|
306
419
|
|
|
@@ -338,20 +451,23 @@ See [examples/typescript-example.ts](examples/typescript-example.ts) for more ex
|
|
|
338
451
|
Always handle errors when using the library:
|
|
339
452
|
|
|
340
453
|
```javascript
|
|
454
|
+
const pino = require('pino');
|
|
455
|
+
const logger = pino({ name: 'my-app', level: process.env.LOG_LEVEL || 'info' });
|
|
456
|
+
|
|
341
457
|
transformer({
|
|
342
458
|
/* options */
|
|
343
459
|
})
|
|
344
|
-
.then(() =>
|
|
345
|
-
.catch(err =>
|
|
460
|
+
.then(() => logger.info('Success'))
|
|
461
|
+
.catch(err => logger.error({ err }, 'Transformer failed'));
|
|
346
462
|
|
|
347
463
|
// Or with async/await
|
|
348
464
|
try {
|
|
349
465
|
await transformer({
|
|
350
466
|
/* options */
|
|
351
467
|
});
|
|
352
|
-
|
|
468
|
+
logger.info('Success');
|
|
353
469
|
} catch (err) {
|
|
354
|
-
|
|
470
|
+
logger.error({ err }, 'Transformer failed');
|
|
355
471
|
}
|
|
356
472
|
```
|
|
357
473
|
|