node-es-transformer 1.0.0-beta3 → 1.0.0-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,23 +14,12 @@ If you're looking for a nodejs based tool which allows you to ingest large CSV/J
14
14
 
15
15
  While I'd generally recommend using [Logstash](https://www.elastic.co/products/logstash), [filebeat](https://www.elastic.co/products/beats/filebeat), [Ingest Nodes](https://www.elastic.co/guide/en/elasticsearch/reference/master/ingest.html), [Elastic Agent](https://www.elastic.co/guide/en/fleet/current/fleet-overview.html) or [Elasticsearch Transforms](https://www.elastic.co/guide/en/elasticsearch/reference/current/transforms.html) for established use cases, this tool may be of help especially if you feel more at home in the JavaScript/nodejs universe and have use cases with customized ingestion and data transformation needs.
16
16
 
17
- **This is experimental code, use at your own risk. Nonetheless, I encourage you to give it a try so I can gather some feedback.**
18
-
19
- ### So why is this still _alpha_?
20
-
21
- - The API is not quite final and might change from release to release.
22
- - The code needs some more safety measures to avoid some possible accidental data loss scenarios.
23
- - No test coverage yet.
24
-
25
- ---
26
-
27
- Now that we've talked about the caveats, let's have a look what you actually get with this tool:
28
-
29
17
  ## Features
30
18
 
31
19
  - Buffering/Streaming for both reading and indexing. Files are read using streaming and Elasticsearch ingestion is done using buffered bulk indexing. This is tailored towards ingestion of large files. Successfully tested so far with JSON and CSV files in the range of 20-30 GBytes. On a single machine running both `node-es-transformer` and Elasticsearch, ingestion rates of up to 20k documents/second were achieved (2.9 GHz Intel Core i7, 16 GByte RAM, SSD), depending on document size.
32
20
  - Supports wildcards to ingest/transform a range of files in one go.
33
21
  - Supports fetching documents from existing indices using search/scroll. This allows you to reindex with custom data transformations just using JavaScript in the `transform` callback.
22
+ - Supports ingesting docs based on a nodejs stream.
34
23
  - The `transform` callback gives you each source document, but you can split it up into multiple ones and return an array of documents. An example use case for this: Each source document is a Tweet and you want to transform that into an entity-centric index based on Hashtags.
35
24
 
36
25
  ## Getting started
@@ -112,9 +101,10 @@ transformer({
112
101
  - `sourceClientConfig`/`targetClientConfig`: Optional Elasticsearch client options, defaults to `{ node: 'http://localhost:9200' }`.
113
102
  - `bufferSize`: The threshold in KBytes at which a bulk index request is flushed, defaults to `5120`.
114
103
  - `searchSize`: The number of documents to be fetched with each search request when reindexing from another source index.
115
- - `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` is not allowed.
104
+ - `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` and `stream` are not allowed.
105
+ - `stream`: Source nodejs stream to ingest. If this is set, `sourceIndexName` and `fileName` are not allowed.
116
106
  - `splitRegex`: Custom line split regex, defaults to `/\n/`.
117
- - `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` is not allowed.
107
+ - `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` and `stream` are not allowed.
118
108
  - `targetIndexName`: The target Elasticsearch index where documents will be indexed.
119
109
  - `mappings`: Optional Elasticsearch document mappings. If not set and you're reindexing from another index, the mappings from the existing index will be used.
120
110
  - `mappingsOverride`: If you're reindexing and this is set to `true`, `mappings` will be applied on top of the source index's mappings. Defaults to `false`.
@@ -148,10 +138,10 @@ yarn
148
138
 
149
139
  ```bash
150
140
  # Download the docker image
151
- docker pull docker.elastic.co/elasticsearch/elasticsearch:8.15.0
141
+ docker pull docker.elastic.co/elasticsearch/elasticsearch:8.17.0
152
142
 
153
143
  # Run the container
154
- docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.15.0
144
+ docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.17.0
155
145
  ```
156
146
 
157
147
  To commit, use `cz`. To prepare a release, use e.g. `yarn release -- --release-as 1.0.0-beta2`.
@@ -230,7 +230,6 @@ function indexQueueFactory(ref) {
230
230
 
231
231
  // Async IIFE to start bulk indexing
232
232
  (async function () {
233
- console.log('START BULK INDEXING');
234
233
  await client.helpers.bulk({
235
234
  concurrency: parallelCalls,
236
235
  flushBytes: flushBytes,
@@ -243,7 +242,6 @@ function indexQueueFactory(ref) {
243
242
  };
244
243
  },
245
244
  });
246
- console.log('FINISHED BULK INDEXING');
247
245
 
248
246
  queueEmitter.emit('finish');
249
247
  })();
@@ -294,20 +292,18 @@ function indexReaderFactory(
294
292
 
295
293
  async function fetchPopulatedFields() {
296
294
  try {
297
- var response = await client.search({
298
- index: sourceIndexName,
299
- size: searchSize,
300
- query: {
301
- function_score: {
302
- query: query,
303
- random_score: {},
304
- },
295
+ // Get all populated fields from the index
296
+ var response = await client.fieldCaps(
297
+ {
298
+ index: sourceIndexName,
299
+ fields: '*',
300
+ include_empty_fields: false,
301
+ filters: '-metadata',
305
302
  },
306
- });
303
+ { maxRetries: 0 }
304
+ );
307
305
 
308
- // Get all field names for each returned doc and flatten it
309
- // to a list of unique field names used across all docs.
310
- return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
306
+ return Object.keys(response.fields);
311
307
  } catch (e) {
312
308
  console.log('error', e);
313
309
  }
@@ -415,7 +411,6 @@ function indexReaderFactory(
415
411
 
416
412
  function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose) {
417
413
  function startIndex() {
418
- console.log('START INDEX', splitRegex);
419
414
  var finished = false;
420
415
 
421
416
  var s = stream$$1.pipe(split(splitRegex)).pipe(
@@ -449,7 +444,7 @@ function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose)
449
444
  }
450
445
  })
451
446
  .on('error', function (err) {
452
- console.log('Error while reading file.', err);
447
+ console.log('Error while reading stream.', err);
453
448
  })
454
449
  .on('end', function () {
455
450
  if (verbose) { console.log('Read entire stream.'); }
@@ -494,7 +489,6 @@ async function transformer(ref) {
494
489
  var transform = ref.transform;
495
490
  var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
496
491
 
497
- console.log('TRANSFORMER');
498
492
  if (typeof targetIndexName === 'undefined') {
499
493
  throw Error('targetIndexName must be specified.');
500
494
  }
@@ -557,7 +551,6 @@ async function transformer(ref) {
557
551
  }
558
552
 
559
553
  if (typeof stream$$1 !== 'undefined') {
560
- console.log('STREAM READER');
561
554
  return streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose);
562
555
  }
563
556
 
@@ -565,11 +558,9 @@ async function transformer(ref) {
565
558
  }
566
559
 
567
560
  var reader = getReader();
568
- console.log('READER INITIALIZED');
569
561
 
570
562
  try {
571
563
  var indexExists = await targetClient.indices.exists({ index: targetIndexName });
572
- console.log('INDEX EXISTS', indexExists);
573
564
 
574
565
  if (indexExists === false) {
575
566
  await createMapping();
@@ -226,7 +226,6 @@ function indexQueueFactory(ref) {
226
226
 
227
227
  // Async IIFE to start bulk indexing
228
228
  (async function () {
229
- console.log('START BULK INDEXING');
230
229
  await client.helpers.bulk({
231
230
  concurrency: parallelCalls,
232
231
  flushBytes: flushBytes,
@@ -239,7 +238,6 @@ function indexQueueFactory(ref) {
239
238
  };
240
239
  },
241
240
  });
242
- console.log('FINISHED BULK INDEXING');
243
241
 
244
242
  queueEmitter.emit('finish');
245
243
  })();
@@ -290,20 +288,18 @@ function indexReaderFactory(
290
288
 
291
289
  async function fetchPopulatedFields() {
292
290
  try {
293
- var response = await client.search({
294
- index: sourceIndexName,
295
- size: searchSize,
296
- query: {
297
- function_score: {
298
- query: query,
299
- random_score: {},
300
- },
291
+ // Get all populated fields from the index
292
+ var response = await client.fieldCaps(
293
+ {
294
+ index: sourceIndexName,
295
+ fields: '*',
296
+ include_empty_fields: false,
297
+ filters: '-metadata',
301
298
  },
302
- });
299
+ { maxRetries: 0 }
300
+ );
303
301
 
304
- // Get all field names for each returned doc and flatten it
305
- // to a list of unique field names used across all docs.
306
- return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
302
+ return Object.keys(response.fields);
307
303
  } catch (e) {
308
304
  console.log('error', e);
309
305
  }
@@ -411,7 +407,6 @@ function indexReaderFactory(
411
407
 
412
408
  function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
413
409
  function startIndex() {
414
- console.log('START INDEX', splitRegex);
415
410
  var finished = false;
416
411
 
417
412
  var s = stream.pipe(split(splitRegex)).pipe(
@@ -445,7 +440,7 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
445
440
  }
446
441
  })
447
442
  .on('error', function (err) {
448
- console.log('Error while reading file.', err);
443
+ console.log('Error while reading stream.', err);
449
444
  })
450
445
  .on('end', function () {
451
446
  if (verbose) { console.log('Read entire stream.'); }
@@ -490,7 +485,6 @@ async function transformer(ref) {
490
485
  var transform = ref.transform;
491
486
  var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
492
487
 
493
- console.log('TRANSFORMER');
494
488
  if (typeof targetIndexName === 'undefined') {
495
489
  throw Error('targetIndexName must be specified.');
496
490
  }
@@ -553,7 +547,6 @@ async function transformer(ref) {
553
547
  }
554
548
 
555
549
  if (typeof stream !== 'undefined') {
556
- console.log('STREAM READER');
557
550
  return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
558
551
  }
559
552
 
@@ -561,11 +554,9 @@ async function transformer(ref) {
561
554
  }
562
555
 
563
556
  var reader = getReader();
564
- console.log('READER INITIALIZED');
565
557
 
566
558
  try {
567
559
  var indexExists = await targetClient.indices.exists({ index: targetIndexName });
568
- console.log('INDEX EXISTS', indexExists);
569
560
 
570
561
  if (indexExists === false) {
571
562
  await createMapping();
package/package.json CHANGED
@@ -14,11 +14,11 @@
14
14
  "license": "Apache-2.0",
15
15
  "author": "Walter Rafelsberger <walter@rafelsberger.at>",
16
16
  "contributors": [],
17
- "version": "1.0.0-beta3",
17
+ "version": "1.0.0-beta4",
18
18
  "main": "dist/node-es-transformer.cjs.js",
19
19
  "module": "dist/node-es-transformer.esm.js",
20
20
  "dependencies": {
21
- "@elastic/elasticsearch": "^8.15.0",
21
+ "@elastic/elasticsearch": "^8.17.0",
22
22
  "cli-progress": "^3.12.0",
23
23
  "event-stream": "3.3.4",
24
24
  "git-cz": "^4.9.0",