node-es-transformer 1.0.0-beta3 → 1.0.0-beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -16
- package/dist/node-es-transformer.cjs.js +11 -20
- package/dist/node-es-transformer.esm.js +11 -20
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -14,23 +14,12 @@ If you're looking for a nodejs based tool which allows you to ingest large CSV/J
|
|
|
14
14
|
|
|
15
15
|
While I'd generally recommend using [Logstash](https://www.elastic.co/products/logstash), [filebeat](https://www.elastic.co/products/beats/filebeat), [Ingest Nodes](https://www.elastic.co/guide/en/elasticsearch/reference/master/ingest.html), [Elastic Agent](https://www.elastic.co/guide/en/fleet/current/fleet-overview.html) or [Elasticsearch Transforms](https://www.elastic.co/guide/en/elasticsearch/reference/current/transforms.html) for established use cases, this tool may be of help especially if you feel more at home in the JavaScript/nodejs universe and have use cases with customized ingestion and data transformation needs.
|
|
16
16
|
|
|
17
|
-
**This is experimental code, use at your own risk. Nonetheless, I encourage you to give it a try so I can gather some feedback.**
|
|
18
|
-
|
|
19
|
-
### So why is this still _alpha_?
|
|
20
|
-
|
|
21
|
-
- The API is not quite final and might change from release to release.
|
|
22
|
-
- The code needs some more safety measures to avoid some possible accidental data loss scenarios.
|
|
23
|
-
- No test coverage yet.
|
|
24
|
-
|
|
25
|
-
---
|
|
26
|
-
|
|
27
|
-
Now that we've talked about the caveats, let's have a look what you actually get with this tool:
|
|
28
|
-
|
|
29
17
|
## Features
|
|
30
18
|
|
|
31
19
|
- Buffering/Streaming for both reading and indexing. Files are read using streaming and Elasticsearch ingestion is done using buffered bulk indexing. This is tailored towards ingestion of large files. Successfully tested so far with JSON and CSV files in the range of 20-30 GBytes. On a single machine running both `node-es-transformer` and Elasticsearch, ingestion rates of up to 20k documents/second were achieved (2.9 GHz Intel Core i7, 16 GByte RAM, SSD), depending on document size.
|
|
32
20
|
- Supports wildcards to ingest/transform a range of files in one go.
|
|
33
21
|
- Supports fetching documents from existing indices using search/scroll. This allows you to reindex with custom data transformations just using JavaScript in the `transform` callback.
|
|
22
|
+
- Supports ingesting docs based on a nodejs stream.
|
|
34
23
|
- The `transform` callback gives you each source document, but you can split it up into multiple documents and return them as an array. An example use case for this: Each source document is a Tweet and you want to transform that into an entity centric index based on Hashtags.
|
|
35
24
|
|
|
36
25
|
## Getting started
|
|
@@ -112,9 +101,10 @@ transformer({
|
|
|
112
101
|
- `sourceClientConfig`/`targetClientConfig`: Optional Elasticsearch client options, defaults to `{ node: 'http://localhost:9200' }`.
|
|
113
102
|
- `bufferSize`: The threshold to flush bulk index request in KBytes, defaults to `5120`.
|
|
114
103
|
- `searchSize`: The number of documents to be fetched with each search request when reindexing from another source index.
|
|
115
|
-
- `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName`
|
|
104
|
+
- `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` and `stream` are not allowed.
|
|
105
|
+
- `stream`: Source nodejs stream to ingest. If this is set, `sourceIndexName` and `fileName` are not allowed.
|
|
116
106
|
- `splitRegex`: Custom line split regex, defaults to `/\n/`.
|
|
117
|
-
- `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName`
|
|
107
|
+
- `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` and `stream` are not allowed.
|
|
118
108
|
- `targetIndexName`: The target Elasticsearch index where documents will be indexed.
|
|
119
109
|
- `mappings`: Optional Elasticsearch document mappings. If not set and you're reindexing from another index, the mappings from the existing index will be used.
|
|
120
110
|
- `mappingsOverride`: If you're reindexing and this is set to `true`, `mappings` will be applied on top of the source index's mappings. Defaults to `false`.
|
|
@@ -148,10 +138,10 @@ yarn
|
|
|
148
138
|
|
|
149
139
|
```bash
|
|
150
140
|
# Download the docker image
|
|
151
|
-
docker pull docker.elastic.co/elasticsearch/elasticsearch:8.
|
|
141
|
+
docker pull docker.elastic.co/elasticsearch/elasticsearch:8.17.0
|
|
152
142
|
|
|
153
143
|
# Run the container
|
|
154
|
-
docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.
|
|
144
|
+
docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.17.0
|
|
155
145
|
```
|
|
156
146
|
|
|
157
147
|
To commit, use `cz`. To prepare a release, use e.g. `yarn release -- --release-as 1.0.0-beta2`.
|
package/dist/node-es-transformer.cjs.js
CHANGED
|
@@ -230,7 +230,6 @@ function indexQueueFactory(ref) {
|
|
|
230
230
|
|
|
231
231
|
// Async IIFE to start bulk indexing
|
|
232
232
|
(async function () {
|
|
233
|
-
console.log('START BULK INDEXING');
|
|
234
233
|
await client.helpers.bulk({
|
|
235
234
|
concurrency: parallelCalls,
|
|
236
235
|
flushBytes: flushBytes,
|
|
@@ -243,7 +242,6 @@ function indexQueueFactory(ref) {
|
|
|
243
242
|
};
|
|
244
243
|
},
|
|
245
244
|
});
|
|
246
|
-
console.log('FINISHED BULK INDEXING');
|
|
247
245
|
|
|
248
246
|
queueEmitter.emit('finish');
|
|
249
247
|
})();
|
|
@@ -294,20 +292,18 @@ function indexReaderFactory(
|
|
|
294
292
|
|
|
295
293
|
async function fetchPopulatedFields() {
|
|
296
294
|
try {
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
},
|
|
295
|
+
// Get all populated fields from the index
|
|
296
|
+
var response = await client.fieldCaps(
|
|
297
|
+
{
|
|
298
|
+
index: sourceIndexName,
|
|
299
|
+
fields: '*',
|
|
300
|
+
include_empty_fields: false,
|
|
301
|
+
filters: '-metadata',
|
|
305
302
|
},
|
|
306
|
-
|
|
303
|
+
{ maxRetries: 0 }
|
|
304
|
+
);
|
|
307
305
|
|
|
308
|
-
|
|
309
|
-
// to a list of unique field names used across all docs.
|
|
310
|
-
return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
|
|
306
|
+
return Object.keys(response.fields);
|
|
311
307
|
} catch (e) {
|
|
312
308
|
console.log('error', e);
|
|
313
309
|
}
|
|
@@ -415,7 +411,6 @@ function indexReaderFactory(
|
|
|
415
411
|
|
|
416
412
|
function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose) {
|
|
417
413
|
function startIndex() {
|
|
418
|
-
console.log('START INDEX', splitRegex);
|
|
419
414
|
var finished = false;
|
|
420
415
|
|
|
421
416
|
var s = stream$$1.pipe(split(splitRegex)).pipe(
|
|
@@ -449,7 +444,7 @@ function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose)
|
|
|
449
444
|
}
|
|
450
445
|
})
|
|
451
446
|
.on('error', function (err) {
|
|
452
|
-
console.log('Error while reading
|
|
447
|
+
console.log('Error while reading stream.', err);
|
|
453
448
|
})
|
|
454
449
|
.on('end', function () {
|
|
455
450
|
if (verbose) { console.log('Read entire stream.'); }
|
|
@@ -494,7 +489,6 @@ async function transformer(ref) {
|
|
|
494
489
|
var transform = ref.transform;
|
|
495
490
|
var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
|
|
496
491
|
|
|
497
|
-
console.log('TRANSFORMER');
|
|
498
492
|
if (typeof targetIndexName === 'undefined') {
|
|
499
493
|
throw Error('targetIndexName must be specified.');
|
|
500
494
|
}
|
|
@@ -557,7 +551,6 @@ async function transformer(ref) {
|
|
|
557
551
|
}
|
|
558
552
|
|
|
559
553
|
if (typeof stream$$1 !== 'undefined') {
|
|
560
|
-
console.log('STREAM READER');
|
|
561
554
|
return streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose);
|
|
562
555
|
}
|
|
563
556
|
|
|
@@ -565,11 +558,9 @@ async function transformer(ref) {
|
|
|
565
558
|
}
|
|
566
559
|
|
|
567
560
|
var reader = getReader();
|
|
568
|
-
console.log('READER INITIALIZED');
|
|
569
561
|
|
|
570
562
|
try {
|
|
571
563
|
var indexExists = await targetClient.indices.exists({ index: targetIndexName });
|
|
572
|
-
console.log('INDEX EXISTS', indexExists);
|
|
573
564
|
|
|
574
565
|
if (indexExists === false) {
|
|
575
566
|
await createMapping();
|
package/dist/node-es-transformer.esm.js
CHANGED
|
@@ -226,7 +226,6 @@ function indexQueueFactory(ref) {
|
|
|
226
226
|
|
|
227
227
|
// Async IIFE to start bulk indexing
|
|
228
228
|
(async function () {
|
|
229
|
-
console.log('START BULK INDEXING');
|
|
230
229
|
await client.helpers.bulk({
|
|
231
230
|
concurrency: parallelCalls,
|
|
232
231
|
flushBytes: flushBytes,
|
|
@@ -239,7 +238,6 @@ function indexQueueFactory(ref) {
|
|
|
239
238
|
};
|
|
240
239
|
},
|
|
241
240
|
});
|
|
242
|
-
console.log('FINISHED BULK INDEXING');
|
|
243
241
|
|
|
244
242
|
queueEmitter.emit('finish');
|
|
245
243
|
})();
|
|
@@ -290,20 +288,18 @@ function indexReaderFactory(
|
|
|
290
288
|
|
|
291
289
|
async function fetchPopulatedFields() {
|
|
292
290
|
try {
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
},
|
|
291
|
+
// Get all populated fields from the index
|
|
292
|
+
var response = await client.fieldCaps(
|
|
293
|
+
{
|
|
294
|
+
index: sourceIndexName,
|
|
295
|
+
fields: '*',
|
|
296
|
+
include_empty_fields: false,
|
|
297
|
+
filters: '-metadata',
|
|
301
298
|
},
|
|
302
|
-
|
|
299
|
+
{ maxRetries: 0 }
|
|
300
|
+
);
|
|
303
301
|
|
|
304
|
-
|
|
305
|
-
// to a list of unique field names used across all docs.
|
|
306
|
-
return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
|
|
302
|
+
return Object.keys(response.fields);
|
|
307
303
|
} catch (e) {
|
|
308
304
|
console.log('error', e);
|
|
309
305
|
}
|
|
@@ -411,7 +407,6 @@ function indexReaderFactory(
|
|
|
411
407
|
|
|
412
408
|
function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
|
|
413
409
|
function startIndex() {
|
|
414
|
-
console.log('START INDEX', splitRegex);
|
|
415
410
|
var finished = false;
|
|
416
411
|
|
|
417
412
|
var s = stream.pipe(split(splitRegex)).pipe(
|
|
@@ -445,7 +440,7 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
|
|
|
445
440
|
}
|
|
446
441
|
})
|
|
447
442
|
.on('error', function (err) {
|
|
448
|
-
console.log('Error while reading
|
|
443
|
+
console.log('Error while reading stream.', err);
|
|
449
444
|
})
|
|
450
445
|
.on('end', function () {
|
|
451
446
|
if (verbose) { console.log('Read entire stream.'); }
|
|
@@ -490,7 +485,6 @@ async function transformer(ref) {
|
|
|
490
485
|
var transform = ref.transform;
|
|
491
486
|
var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
|
|
492
487
|
|
|
493
|
-
console.log('TRANSFORMER');
|
|
494
488
|
if (typeof targetIndexName === 'undefined') {
|
|
495
489
|
throw Error('targetIndexName must be specified.');
|
|
496
490
|
}
|
|
@@ -553,7 +547,6 @@ async function transformer(ref) {
|
|
|
553
547
|
}
|
|
554
548
|
|
|
555
549
|
if (typeof stream !== 'undefined') {
|
|
556
|
-
console.log('STREAM READER');
|
|
557
550
|
return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
|
|
558
551
|
}
|
|
559
552
|
|
|
@@ -561,11 +554,9 @@ async function transformer(ref) {
|
|
|
561
554
|
}
|
|
562
555
|
|
|
563
556
|
var reader = getReader();
|
|
564
|
-
console.log('READER INITIALIZED');
|
|
565
557
|
|
|
566
558
|
try {
|
|
567
559
|
var indexExists = await targetClient.indices.exists({ index: targetIndexName });
|
|
568
|
-
console.log('INDEX EXISTS', indexExists);
|
|
569
560
|
|
|
570
561
|
if (indexExists === false) {
|
|
571
562
|
await createMapping();
|
package/package.json
CHANGED
|
@@ -14,11 +14,11 @@
|
|
|
14
14
|
"license": "Apache-2.0",
|
|
15
15
|
"author": "Walter Rafelsberger <walter@rafelsberger.at>",
|
|
16
16
|
"contributors": [],
|
|
17
|
-
"version": "1.0.0-
|
|
17
|
+
"version": "1.0.0-beta4",
|
|
18
18
|
"main": "dist/node-es-transformer.cjs.js",
|
|
19
19
|
"module": "dist/node-es-transformer.esm.js",
|
|
20
20
|
"dependencies": {
|
|
21
|
-
"@elastic/elasticsearch": "^8.
|
|
21
|
+
"@elastic/elasticsearch": "^8.17.0",
|
|
22
22
|
"cli-progress": "^3.12.0",
|
|
23
23
|
"event-stream": "3.3.4",
|
|
24
24
|
"git-cz": "^4.9.0",
|