npm - node-es-transformer - Versions diffs - 1.0.0-beta3 → 1.0.0-beta4 - Mend

node-es-transformer 1.0.0-beta3 → 1.0.0-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md +6 -16
package/dist/node-es-transformer.cjs.js +11 -20
package/dist/node-es-transformer.esm.js +11 -20
package/package.json +2 -2

package/README.md CHANGED Viewed

@@ -14,23 +14,12 @@ If you're looking for a nodejs based tool which allows you to ingest large CSV/J
 While I'd generally recommend using [Logstash](https://www.elastic.co/products/logstash), [filebeat](https://www.elastic.co/products/beats/filebeat), [Ingest Nodes](https://www.elastic.co/guide/en/elasticsearch/reference/master/ingest.html), [Elastic Agent](https://www.elastic.co/guide/en/fleet/current/fleet-overview.html) or [Elasticsearch Transforms](https://www.elastic.co/guide/en/elasticsearch/reference/current/transforms.html) for established use cases, this tool may be of help especially if you feel more at home in the JavaScript/nodejs universe and have use cases with customized ingestion and data transformation needs.
-**This is experimental code, use at your own risk. Nonetheless, I encourage you to give it a try so I can gather some feedback.**
-### So why is this still _alpha_?
-- The API is not quite final and might change from release to release.
-- The code needs some more safety measures to avoid some possible accidental data loss scenarios.
-- No test coverage yet.
----
-Now that we've talked about the caveats, let's have a look what you actually get with this tool:
 ## Features
 - Buffering/Streaming for both reading and indexing. Files are read using streaming and Elasticsearch ingestion is done using buffered bulk indexing. This is tailored towards ingestion of large files. Successfully tested so far with JSON and CSV files in the range of 20-30 GBytes. On a single machine running both `node-es-transformer` and Elasticsearch, ingestion rates of up to 20k documents/second were achieved (2.9 GHz Intel Core i7, 16 GByte RAM, SSD), depending on document size.
 - Supports wildcards to ingest/transform a range of files in one go.
 - Supports fetching documents from existing indices using search/scroll. This allows you to reindex with custom data transformations just using JavaScript in the `transform` callback.
+- Supports ingesting docs based on a nodejs stream.
 - The `transform` callback gives you each source document, but you can split it up in multiple ones and return an array of documents. An example use case for this: Each source document is a Tweet and you want to transform that into an entity centric index based on Hashtags.
 ## Getting started
@@ -112,9 +101,10 @@ transformer({
 - `sourceClientConfig`/`targetClientConfig`: Optional Elasticsearch client options, defaults to `{ node: 'http://localhost:9200' }`.
 - `bufferSize`: The threshold to flush bulk index request in KBytes, defaults to `5120`.
 - `searchSize`: The number of documents to be fetched with each search request when reindexing from another source index.
-- `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` is not allowed.
+- `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` and `stream` are not allowed.
+- `stream`: Source nodejs stream to ingest. If this is set, `sourceIndexName` and `fileName` are not allowed.
 - `splitRegex`: Custom line split regex, defaults to `/\n/`.
-- `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` is not allowed.
+- `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` and `stream` are not allowed.
 - `targetIndexName`: The target Elasticsearch index where documents will be indexed.
 - `mappings`: Optional Elasticsearch document mappings. If not set and you're reindexing from another index, the mappings from the existing index will be used.
 - `mappingsOverride`: If you're reindexing and this is set to `true`, `mappings` will be applied on top of the source index's mappings. Defaults to `false`.
@@ -148,10 +138,10 @@ yarn
 ```bash
 # Download the docker image
-docker pull docker.elastic.co/elasticsearch/elasticsearch:8.15.0
+docker pull docker.elastic.co/elasticsearch/elasticsearch:8.17.0
 # Run the container
-docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.15.0
+docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.17.0
 ```
 To commit, use `cz`. To prepare a release, use e.g. `yarn release -- --release-as 1.0.0-beta2`.

package/dist/node-es-transformer.cjs.js CHANGED Viewed

@@ -230,7 +230,6 @@ function indexQueueFactory(ref) {
   // Async IIFE to start bulk indexing
   (async function () {
-    console.log('START BULK INDEXING');
     await client.helpers.bulk({
       concurrency: parallelCalls,
       flushBytes: flushBytes,
@@ -243,7 +242,6 @@ function indexQueueFactory(ref) {
         };
       },
     });
-    console.log('FINISHED BULK INDEXING');
     queueEmitter.emit('finish');
   })();
@@ -294,20 +292,18 @@ function indexReaderFactory(
     async function fetchPopulatedFields() {
       try {
-        var response = await client.search({
-          index: sourceIndexName,
-          size: searchSize,
-          query: {
-            function_score: {
-              query: query,
-              random_score: {},
-            },
+        // Get all populated fields from the index
+        var response = await client.fieldCaps(
+          {
+            index: sourceIndexName,
+            fields: '*',
+            include_empty_fields: false,
+            filters: '-metadata',
           },
-        });
+          { maxRetries: 0 }
+        );
-        // Get all field names for each returned doc and flatten it
-        // to a list of unique field names used across all docs.
-        return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
+        return Object.keys(response.fields);
       } catch (e) {
         console.log('error', e);
       }
@@ -415,7 +411,6 @@ function indexReaderFactory(
 function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose) {
   function startIndex() {
-    console.log('START INDEX', splitRegex);
     var finished = false;
     var s = stream$$1.pipe(split(splitRegex)).pipe(
@@ -449,7 +444,7 @@ function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose)
           }
         })
         .on('error', function (err) {
-          console.log('Error while reading file.', err);
+          console.log('Error while reading stream.', err);
         })
         .on('end', function () {
           if (verbose) { console.log('Read entire stream.'); }
@@ -494,7 +489,6 @@ async function transformer(ref) {
   var transform = ref.transform;
   var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
-  console.log('TRANSFORMER');
   if (typeof targetIndexName === 'undefined') {
     throw Error('targetIndexName must be specified.');
   }
@@ -557,7 +551,6 @@ async function transformer(ref) {
     }
     if (typeof stream$$1 !== 'undefined') {
-      console.log('STREAM READER');
       return streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose);
     }
@@ -565,11 +558,9 @@ async function transformer(ref) {
   }
   var reader = getReader();
-  console.log('READER INITIALIZED');
   try {
     var indexExists = await targetClient.indices.exists({ index: targetIndexName });
-    console.log('INDEX EXISTS', indexExists);
     if (indexExists === false) {
       await createMapping();

package/dist/node-es-transformer.esm.js CHANGED Viewed

@@ -226,7 +226,6 @@ function indexQueueFactory(ref) {
   // Async IIFE to start bulk indexing
   (async function () {
-    console.log('START BULK INDEXING');
     await client.helpers.bulk({
       concurrency: parallelCalls,
       flushBytes: flushBytes,
@@ -239,7 +238,6 @@ function indexQueueFactory(ref) {
         };
       },
     });
-    console.log('FINISHED BULK INDEXING');
     queueEmitter.emit('finish');
   })();
@@ -290,20 +288,18 @@ function indexReaderFactory(
     async function fetchPopulatedFields() {
       try {
-        var response = await client.search({
-          index: sourceIndexName,
-          size: searchSize,
-          query: {
-            function_score: {
-              query: query,
-              random_score: {},
-            },
+        // Get all populated fields from the index
+        var response = await client.fieldCaps(
+          {
+            index: sourceIndexName,
+            fields: '*',
+            include_empty_fields: false,
+            filters: '-metadata',
           },
-        });
+          { maxRetries: 0 }
+        );
-        // Get all field names for each returned doc and flatten it
-        // to a list of unique field names used across all docs.
-        return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
+        return Object.keys(response.fields);
       } catch (e) {
         console.log('error', e);
       }
@@ -411,7 +407,6 @@ function indexReaderFactory(
 function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
   function startIndex() {
-    console.log('START INDEX', splitRegex);
     var finished = false;
     var s = stream.pipe(split(splitRegex)).pipe(
@@ -445,7 +440,7 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
           }
         })
         .on('error', function (err) {
-          console.log('Error while reading file.', err);
+          console.log('Error while reading stream.', err);
         })
         .on('end', function () {
           if (verbose) { console.log('Read entire stream.'); }
@@ -490,7 +485,6 @@ async function transformer(ref) {
   var transform = ref.transform;
   var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
-  console.log('TRANSFORMER');
   if (typeof targetIndexName === 'undefined') {
     throw Error('targetIndexName must be specified.');
   }
@@ -553,7 +547,6 @@ async function transformer(ref) {
     }
     if (typeof stream !== 'undefined') {
-      console.log('STREAM READER');
       return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
     }
@@ -561,11 +554,9 @@ async function transformer(ref) {
   }
   var reader = getReader();
-  console.log('READER INITIALIZED');
   try {
     var indexExists = await targetClient.indices.exists({ index: targetIndexName });
-    console.log('INDEX EXISTS', indexExists);
     if (indexExists === false) {
       await createMapping();

package/package.json CHANGED Viewed

@@ -14,11 +14,11 @@
   "license": "Apache-2.0",
   "author": "Walter Rafelsberger <walter@rafelsberger.at>",
   "contributors": [],
-  "version": "1.0.0-beta3",
+  "version": "1.0.0-beta4",
   "main": "dist/node-es-transformer.cjs.js",
   "module": "dist/node-es-transformer.esm.js",
   "dependencies": {
-    "@elastic/elasticsearch": "^8.15.0",
+    "@elastic/elasticsearch": "^8.17.0",
     "cli-progress": "^3.12.0",
     "event-stream": "3.3.4",
     "git-cz": "^4.9.0",