node-es-transformer 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -79,11 +79,10 @@ If you need to ingest large CSV/JSON files (GigaBytes) into Elasticsearch withou
 
  | node-es-transformer | Elasticsearch Client | Elasticsearch Server | Node.js |
  | ----------------------- | -------------------- | -------------------- | ------- |
- | 1.0.0-beta8+ | 8.x and 9.x | 8.x and 9.x | 22+ |
- | 1.0.0-beta7 | 9.x only | 9.x only | 22+ |
- | 1.0.0-beta6 and earlier | 8.x | 8.x | 22+ |
+ | 1.0.0+ | 8.x and 9.x | 8.x and 9.x | 22+ |
+ | 1.0.0-beta7 and earlier | 8.x | 8.x | 18-20 |
 
- **Multi-Version Support**: Starting with v1.0.0-beta8, the library supports both Elasticsearch 8.x and 9.x through automatic version detection and client aliasing. This enables seamless reindexing between major versions (e.g., migrating from ES 8.x to 9.x). All functionality is tested in CI against multiple ES versions including cross-version reindexing scenarios.
+ **Multi-Version Support**: Starting with v1.0.0, the library supports both Elasticsearch 8.x and 9.x through automatic version detection and client aliasing. This enables seamless reindexing between major versions (e.g., migrating from ES 8.x to 9.x). All functionality is tested in CI against multiple ES versions including cross-version reindexing scenarios.
 
  **Upgrading?** See [MIGRATION.md](MIGRATION.md) for upgrade guidance from beta versions to v1.0.0.
 
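For illustration (not part of the package diff): based on the `transformer()` options visible in the bundled source added below, a cross-version reindex using the multi-version support described above might look like the following sketch. The node URLs and index names are placeholders.

```js
const transformer = require('node-es-transformer');

async function reindex() {
  const { events } = await transformer({
    // Plain config objects are passed through to the ES client; existing
    // client instances can be supplied instead via sourceClient/targetClient.
    sourceClientConfig: { node: 'http://localhost:9200' }, // e.g. an 8.x cluster
    targetClientConfig: { node: 'http://localhost:9201' }, // e.g. a 9.x cluster
    // Optional: skip auto-detection and force a client major version.
    sourceClientVersion: 8,
    targetClientVersion: 9,
    sourceIndexName: 'my-index',
    targetIndexName: 'my-index-reindexed'
  });

  // The returned emitter reports indexing progress (see indexQueueFactory below).
  events.on('docsPerSecond', rate => console.log(`${rate} docs/s`));
  events.on('finish', () => console.log('Reindex complete.'));
}

reindex().catch(console.error);
```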
package/node-es-transformer.cjs.js ADDED
@@ -0,0 +1,619 @@
+ 'use strict';
+
+ var elasticsearch9 = require('es9');
+ var elasticsearch8 = require('es8');
+ var fs = require('fs');
+ var es = require('event-stream');
+ var glob = require('glob');
+ var split = require('split2');
+ var stream = require('stream');
+ var cliProgress = require('cli-progress');
+
+ // In earlier versions this was used to set the number of docs to index in a
+ // single bulk request. Since we switched to use the helpers.bulk() method from
+ // the ES client, this now translates to the `flushBytes` option of the helper.
+ // However, to keep rough backwards compatibility with the old values, this uses
+ // KBytes instead of Bytes. It will be multiplied by 1024 in the index queue.
+ const DEFAULT_BUFFER_SIZE = 5120;
+
+ // The default number of docs to fetch in a single search request when reindexing.
+ const DEFAULT_SEARCH_SIZE = 1000;
+
+ function createMappingFactory({
+   sourceClient,
+   sourceIndexName,
+   targetClient,
+   targetIndexName,
+   mappings,
+   mappingsOverride,
+   indexMappingTotalFieldsLimit,
+   verbose,
+   deleteIndex,
+   pipeline
+ }) {
+   return async () => {
+     let targetMappings = mappingsOverride ? undefined : mappings;
+     if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
+       try {
+         const mapping = await sourceClient.indices.getMapping({
+           index: sourceIndexName
+         });
+         if (mapping[sourceIndexName]) {
+           targetMappings = mapping[sourceIndexName].mappings;
+         } else {
+           const allMappings = Object.values(mapping);
+           if (allMappings.length > 0) {
+             targetMappings = Object.values(mapping)[0].mappings;
+           }
+         }
+       } catch (err) {
+         console.log('Error reading source mapping', err);
+         return;
+       }
+     }
+     if (typeof targetMappings === 'object' && targetMappings !== null) {
+       if (mappingsOverride) {
+         targetMappings = {
+           ...targetMappings,
+           properties: {
+             ...targetMappings.properties,
+             ...mappings
+           }
+         };
+       }
+       try {
+         const indexExists = await targetClient.indices.exists({
+           index: targetIndexName
+         });
+         if (indexExists === true && deleteIndex === true) {
+           await targetClient.indices.delete({
+             index: targetIndexName
+           });
+         }
+         if (indexExists === false || deleteIndex === true) {
+           const resp = await targetClient.indices.create({
+             index: targetIndexName,
+             mappings: targetMappings,
+             ...(pipeline !== undefined ? {
+               settings: {
+                 index: {
+                   default_pipeline: pipeline
+                 }
+               }
+             } : {}),
+             ...(indexMappingTotalFieldsLimit !== undefined ? {
+               settings: {
+                 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
+                 'index.number_of_shards': 1,
+                 'index.number_of_replicas': 0
+               }
+             } : {})
+           });
+           if (verbose) console.log('Created target mapping', resp);
+         }
+       } catch (err) {
+         console.log('Error creating target mapping', err);
+       }
+     }
+   };
+ }
+
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
+   function startIndex(files) {
+     let finished = false;
+     const file = files.shift();
+     const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+       try {
+         // skip empty lines
+         if (line === '') {
+           return;
+         }
+         const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
+
+         // if doc is undefined we'll skip indexing it
+         if (typeof doc === 'undefined') {
+           s.resume();
+           return;
+         }
+
+         // the transform callback may return an array of docs so we can emit
+         // multiple docs from a single line
+         if (Array.isArray(doc)) {
+           doc.forEach(d => indexer.add(d));
+           return;
+         }
+         indexer.add(doc);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }).on('error', err => {
+       console.log('Error while reading file.', err);
+     }).on('end', () => {
+       if (verbose) console.log('Read entire file: ', file);
+       if (files.length > 0) {
+         startIndex(files);
+         return;
+       }
+       indexer.finish();
+       finished = true;
+     }));
+     indexer.queueEmitter.on('pause', () => {
+       if (finished) return;
+       s.pause();
+     });
+     indexer.queueEmitter.on('resume', () => {
+       if (finished) return;
+       s.resume();
+     });
+   }
+   return () => {
+     try {
+       const files = glob.globSync(fileName);
+       startIndex(files);
+     } catch (error) {
+       console.log('Error matching files:', error);
+     }
+   };
+ }
+
+ const EventEmitter = require('events');
+ const queueEmitter = new EventEmitter();
+ const parallelCalls = 5;
+
+ // a simple helper queue to bulk index documents
+ function indexQueueFactory({
+   targetClient: client,
+   targetIndexName,
+   bufferSize = DEFAULT_BUFFER_SIZE,
+   skipHeader = false
+ }) {
+   let docsPerSecond = 0;
+   const flushBytes = bufferSize * 1024; // Convert KB to Bytes
+   const highWaterMark = flushBytes * parallelCalls;
+
+   // Create a Readable stream
+   const stream$1 = new stream.Readable({
+     read() {},
+     // Implement read but we manage pushing manually
+     highWaterMark // Buffer size for backpressure management
+   });
+   async function* ndjsonStreamIterator(readableStream) {
+     let buffer = ''; // To hold the incomplete data
+     let skippedHeader = false;
+     try {
+       // Iterate over the stream using async iteration
+       for await (const chunk of readableStream) {
+         buffer += chunk.toString(); // Accumulate the chunk data in the buffer
+
+         // Split the buffer into lines (NDJSON items)
+         const lines = buffer.split('\n');
+
+         // The last line might be incomplete, so hold it back in the buffer
+         buffer = lines.pop();
+
+         // Yield each complete JSON object
+         for (const line of lines) {
+           if (line.trim()) {
+             try {
+               if (!skipHeader || skipHeader && !skippedHeader) {
+                 yield JSON.parse(line); // Parse and yield the JSON object
+                 skippedHeader = true;
+               }
+             } catch (err) {
+               // Handle JSON parse errors if necessary
+               console.error('Failed to parse JSON:', err);
+             }
+           }
+         }
+       }
+
+       // Handle any remaining data in the buffer after the stream ends
+       if (buffer.trim()) {
+         try {
+           yield JSON.parse(buffer);
+         } catch (err) {
+           console.error('Failed to parse final JSON:', err);
+         }
+       }
+     } finally {
+       // Ensure the stream is properly cleaned up if the iterator is terminated early
+       if (!readableStream.destroyed) {
+         readableStream.destroy();
+       }
+     }
+   }
+   let finished = false;
+   let drainListener = null;
+
+   // Async IIFE to start bulk indexing
+   (async () => {
+     const interval = setInterval(() => {
+       queueEmitter.emit('docsPerSecond', docsPerSecond);
+       docsPerSecond = 0;
+     }, 1000);
+     try {
+       await client.helpers.bulk({
+         concurrency: parallelCalls,
+         flushBytes,
+         flushInterval: 1000,
+         refreshOnCompletion: true,
+         datasource: ndjsonStreamIterator(stream$1),
+         onDocument(doc) {
+           docsPerSecond++;
+           return {
+             index: {
+               _index: targetIndexName
+             }
+           };
+         }
+       });
+     } catch (error) {
+       console.error('Error during bulk indexing:', error);
+       throw error;
+     } finally {
+       // Clean up interval
+       clearInterval(interval);
+
+       // Remove drain listener if it exists
+       if (drainListener) {
+         stream$1.removeListener('drain', drainListener);
+         drainListener = null;
+       }
+
+       // Remove all listeners from stream
+       stream$1.removeAllListeners();
+
+       // Properly destroy the stream to prevent open handles
+       if (!stream$1.destroyed) {
+         stream$1.destroy();
+       }
+
+       // Emit finish and clean up queue emitter listeners
+       queueEmitter.emit('finish');
+       queueEmitter.removeAllListeners();
+     }
+   })();
+   return {
+     add: doc => {
+       if (finished) {
+         throw new Error('Unexpected doc added after indexer should finish.');
+       }
+       const canContinue = stream$1.push(`${JSON.stringify(doc)}\n`);
+       if (!canContinue) {
+         queueEmitter.emit('pause');
+
+         // Store the listener so we can clean it up later
+         drainListener = () => {
+           queueEmitter.emit('resume');
+         };
+         stream$1.once('drain', drainListener);
+       }
+     },
+     finish: () => {
+       finished = true;
+       stream$1.push(null);
+     },
+     queueEmitter
+   };
+ }
+
+ // create a new progress bar instance and use shades_classic theme
+ const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+   return async function indexReader() {
+     let docsNum = 0;
+     let scrollId;
+     let finished = false;
+     let readActive = false;
+     let backPressurePause = false;
+     async function fetchPopulatedFields() {
+       try {
+         // Get all populated fields from the index
+         const response = await client.fieldCaps({
+           index: sourceIndexName,
+           fields: '*',
+           include_empty_fields: false,
+           filters: '-metadata'
+         }, {
+           maxRetries: 0
+         });
+         return Object.keys(response.fields);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }
+     function search(fields) {
+       return client.search({
+         index: sourceIndexName,
+         scroll: '600s',
+         size: searchSize,
+         query,
+         ...(fields ? {
+           _source: fields
+         } : {})
+       });
+     }
+     function scroll(id) {
+       return client.scroll({
+         scroll_id: id,
+         scroll: '600s'
+       });
+     }
+     let fieldsWithData;
+
+     // identify populated fields
+     if (populatedFields) {
+       fieldsWithData = await fetchPopulatedFields();
+     }
+     await fetchNextResponse();
+     function processHit(hit) {
+       docsNum += 1;
+       try {
+         const doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
+
+         // if doc is undefined we'll skip indexing it
+         if (typeof doc === 'undefined') {
+           return;
+         }
+
+         // the transform callback may return an array of docs so we can emit
+         // multiple docs from a single line
+         if (Array.isArray(doc)) {
+           doc.forEach(d => indexer.add(d));
+           return;
+         }
+         indexer.add(doc);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }
+     async function fetchNextResponse() {
+       readActive = true;
+       const sc = scrollId ? await scroll(scrollId) : await search(fieldsWithData);
+       if (!scrollId) {
+         progressBar.start(sc.hits.total.value, 0);
+       }
+       scrollId = sc._scroll_id;
+       readActive = false;
+       processResponse(sc);
+     }
+     async function processResponse(response) {
+       // collect the docs from this response
+       response.hits.hits.forEach(processHit);
+       progressBar.update(docsNum);
+
+       // check to see if we have collected all of the docs
+       if (response.hits.total.value === docsNum) {
+         indexer.finish();
+         return;
+       }
+       if (!backPressurePause) {
+         await fetchNextResponse();
+       }
+     }
+     indexer.queueEmitter.on('pause', async () => {
+       backPressurePause = true;
+     });
+     indexer.queueEmitter.on('resume', async () => {
+       backPressurePause = false;
+       if (readActive || finished) {
+         return;
+       }
+       await fetchNextResponse();
+     });
+     indexer.queueEmitter.on('finish', () => {
+       finished = true;
+       progressBar.stop();
+     });
+   };
+ }
+
+ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
+   function startIndex() {
+     let finished = false;
+     const s = stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+       try {
+         // skip empty lines
+         if (line === '') {
+           return;
+         }
+         const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
+
+         // if doc is undefined we'll skip indexing it
+         if (typeof doc === 'undefined') {
+           s.resume();
+           return;
+         }
+
+         // the transform callback may return an array of docs so we can emit
+         // multiple docs from a single line
+         if (Array.isArray(doc)) {
+           doc.forEach(d => indexer.add(d));
+           return;
+         }
+         indexer.add(doc);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }).on('error', err => {
+       console.log('Error while reading stream.', err);
+     }).on('end', () => {
+       if (verbose) console.log('Read entire stream.');
+       indexer.finish();
+       finished = true;
+     }));
+     indexer.queueEmitter.on('pause', () => {
+       if (finished) return;
+       s.pause();
+     });
+     indexer.queueEmitter.on('resume', () => {
+       if (finished) return;
+       s.resume();
+     });
+   }
+   return () => {
+     startIndex();
+   };
+ }
+
+ /**
+  * Detect Elasticsearch version by querying the cluster
+  */
+ async function detectElasticsearchVersion(config) {
+   try {
+     // Try with v9 client first (most common for new setups)
+     const testClient = new elasticsearch9.Client(config);
+     const info = await testClient.info();
+     const version = info.version?.number;
+     await testClient.close();
+     if (version) {
+       const majorVersion = parseInt(version.split('.')[0], 10);
+       return majorVersion;
+     }
+   } catch (e) {
+     // If v9 client fails, try v8 client
+     try {
+       const testClient = new elasticsearch8.Client(config);
+       const info = await testClient.info();
+       const version = info.version?.number;
+       await testClient.close();
+       if (version) {
+         const majorVersion = parseInt(version.split('.')[0], 10);
+         return majorVersion;
+       }
+     } catch (e2) {
+       // Could not detect version
+     }
+   }
+
+   // Default to v9 if detection fails
+   return 9;
+ }
+
+ /**
+  * Create or validate an Elasticsearch client
+  * @param {Object|Client} clientOrConfig - Either a client instance or config object
+  * @param {Object} defaultConfig - Default configuration to use if creating a new client
+  * @param {number} [forceVersion] - Force a specific ES client version (8 or 9)
+  */
+ async function getOrCreateClient(clientOrConfig, defaultConfig, forceVersion) {
+   // If already a client instance, return it
+   if (clientOrConfig && typeof clientOrConfig.info === 'function') {
+     return clientOrConfig;
+   }
+   const config = clientOrConfig || defaultConfig;
+
+   // If version is forced, use the specified client
+   if (forceVersion === 8) {
+     return new elasticsearch8.Client(config);
+   } else if (forceVersion === 9) {
+     return new elasticsearch9.Client(config);
+   }
+
+   // Auto-detect version
+   const majorVersion = await detectElasticsearchVersion(config);
+   if (majorVersion >= 9) {
+     return new elasticsearch9.Client(config);
+   } else {
+     return new elasticsearch8.Client(config);
+   }
+ }
+ async function transformer({
+   deleteIndex = false,
+   sourceClient: sourceClientInput,
+   targetClient: targetClientInput,
+   sourceClientConfig,
+   targetClientConfig,
+   sourceClientVersion,
+   targetClientVersion,
+   bufferSize = DEFAULT_BUFFER_SIZE,
+   searchSize = DEFAULT_SEARCH_SIZE,
+   stream,
+   fileName,
+   splitRegex = /\n/,
+   sourceIndexName,
+   targetIndexName,
+   mappings,
+   mappingsOverride = false,
+   indexMappingTotalFieldsLimit,
+   pipeline,
+   populatedFields = false,
+   query,
+   skipHeader = false,
+   transform,
+   verbose = true
+ }) {
+   if (typeof targetIndexName === 'undefined') {
+     throw Error('targetIndexName must be specified.');
+   }
+   const defaultClientConfig = {
+     node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
+   };
+
+   // Support both old (config) and new (client instance) patterns
+   const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
+   const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
+   const createMapping = createMappingFactory({
+     sourceClient,
+     sourceIndexName,
+     targetClient,
+     targetIndexName,
+     mappings,
+     mappingsOverride,
+     indexMappingTotalFieldsLimit,
+     verbose,
+     deleteIndex,
+     pipeline
+   });
+   const indexer = indexQueueFactory({
+     targetClient,
+     targetIndexName,
+     bufferSize,
+     skipHeader});
+   function getReader() {
+     if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
+       throw Error('Only either one of fileName or sourceIndexName can be specified.');
+     }
+     if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined' || typeof fileName !== 'undefined' && typeof stream !== 'undefined' || typeof sourceIndexName !== 'undefined' && typeof stream !== 'undefined') {
+       throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
+     }
+     if (typeof fileName !== 'undefined') {
+       return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose);
+     }
+     if (typeof sourceIndexName !== 'undefined') {
+       return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+     }
+     if (typeof stream !== 'undefined') {
+       return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
+     }
+     return null;
+   }
+   const reader = getReader();
+   try {
+     const indexExists = await targetClient.indices.exists({
+       index: targetIndexName
+     });
+     if (indexExists === false) {
+       await createMapping();
+       reader();
+     } else if (deleteIndex === true) {
+       await targetClient.indices.delete({
+         index: targetIndexName
+       });
+       await createMapping();
+       reader();
+     } else {
+       reader();
+     }
+   } catch (error) {
+     console.error('Error checking index existence:', error);
+   } finally {
+     // targetClient.close();
+   }
+   return {
+     events: indexer.queueEmitter
+   };
+ }
+
+ module.exports = transformer;
+ //# sourceMappingURL=node-es-transformer.cjs.js.map
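
For illustration (not part of the package diff): the `bufferSize` option above is interpreted in KBytes and multiplied by 1024 before being handed to `helpers.bulk()` as `flushBytes`. A file-ingestion sketch using the file reader path might look like this; the glob pattern, index name, and transform logic are placeholders.

```js
const transformer = require('node-es-transformer');

transformer({
  fileName: 'data/*.ndjson', // glob pattern, resolved via glob.globSync()
  targetIndexName: 'ingested-docs',
  bufferSize: 1024, // KBytes; becomes flushBytes = 1024 * 1024 bytes
  // The transform callback may return undefined to skip a line, a single
  // doc, or an array of docs to emit several documents per input line.
  transform(doc) {
    if (!doc.timestamp) return undefined;
    return { ...doc, ingestedAt: new Date().toISOString() };
  }
}).then(({ events }) => {
  events.on('finish', () => console.log('Ingest complete.'));
});
```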