node-es-transformer 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -4
- package/dist/node-es-transformer.cjs.js +279 -81
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +280 -82
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +58 -1
- package/package.json +2 -1
package/README.md
CHANGED
@@ -96,7 +96,7 @@ yarn add node-es-transformer
 
 ## Usage
 
-### Read from a file
+### Read NDJSON from a file
 
 ```javascript
 const transformer = require('node-es-transformer');
@@ -129,6 +129,50 @@ transformer({
 });
 ```
 
+### Read CSV from a file
+
+```javascript
+const transformer = require('node-es-transformer');
+
+transformer({
+  fileName: 'users.csv',
+  sourceFormat: 'csv',
+  targetIndexName: 'users-index',
+  mappings: {
+    properties: {
+      id: { type: 'integer' },
+      first_name: { type: 'keyword' },
+      last_name: { type: 'keyword' },
+      full_name: { type: 'keyword' },
+    },
+  },
+  transform(row) {
+    return {
+      ...row,
+      id: Number(row.id),
+      full_name: `${row.first_name} ${row.last_name}`,
+    };
+  },
+});
+```
+
+### Infer mappings from CSV sample
+
+```javascript
+const transformer = require('node-es-transformer');
+
+transformer({
+  fileName: 'users.csv',
+  sourceFormat: 'csv',
+  targetIndexName: 'users-index',
+  inferMappings: true,
+  inferMappingsOptions: {
+    sampleBytes: 200000,
+    lines_to_sample: 2000,
+  },
+});
+```
+
 ### Read from another index
 
 ```javascript
@@ -242,9 +286,11 @@ All options are passed to the main `transformer()` function.
 
 Choose **one** of these sources:
 
-- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json`).
+- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json` or `data/*.csv`).
 - **`sourceIndexName`** (string): Source Elasticsearch index to reindex from.
 - **`stream`** (Readable): Node.js readable stream to ingest from.
+- **`sourceFormat`** (`'ndjson' | 'csv'`): Format for file/stream sources. Default: `'ndjson'`.
+- **`csvOptions`** (object): CSV parser options (delimiter, quote, columns, etc.) used when `sourceFormat: 'csv'`.
 
 #### Client Configuration
 
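The new `csvOptions` object is handed straight to the `csv-parse` parser, so its standard options (delimiter, quote, columns, ...) apply. A minimal sketch, assuming a semicolon-delimited export; the delimiter, quote character and index name below are illustrative, not defaults:

```javascript
const transformer = require('node-es-transformer');

transformer({
  fileName: 'data/*.csv',
  sourceFormat: 'csv',
  targetIndexName: 'exports-index', // illustrative name
  csvOptions: {
    delimiter: ';', // csv-parse option; the parser default is ','
    quote: '"',
  },
});
```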
@@ -259,10 +305,14 @@ Choose **one** of these sources:
 
 - **`mappings`** (object): Elasticsearch document mappings for target index. If reindexing and not provided, mappings are copied from source index.
 - **`mappingsOverride`** (boolean): When reindexing, apply `mappings` on top of source index mappings. Default: `false`.
+- **`inferMappings`** (boolean): Infer mappings for `fileName` sources via `/_text_structure/find_structure`. Ignored when `mappings` is provided. If inference returns `ingest_pipeline`, it is created as `<targetIndexName>-inferred-pipeline` and applied as the index default pipeline (unless `pipeline` is explicitly set). Default: `false`.
+- **`inferMappingsOptions`** (object): Options for `/_text_structure/find_structure` (for example `sampleBytes`, `lines_to_sample`, `delimiter`, `quote`, `has_header_row`, `timeout`).
 - **`deleteIndex`** (boolean): Delete target index if it exists before starting. Default: `false`.
 - **`indexMappingTotalFieldsLimit`** (number): Field limit for target index (`index.mapping.total_fields.limit` setting).
 - **`pipeline`** (string): Elasticsearch ingest pipeline name to use during indexing.
 
+When `inferMappings` is enabled, the target cluster must allow `/_text_structure/find_structure` (cluster privilege: `monitor_text_structure`). If inferred ingest pipelines are used, the target cluster must also allow creating ingest pipelines (`_ingest/pipeline`).
+
 #### Performance Options
 
 - **`bufferSize`** (number): Buffer size threshold in KBytes for bulk indexing. Default: `5120` (5 MB).
@@ -276,8 +326,12 @@ Choose **one** of these sources:
   - Return array of documents to split one source into multiple targets
   - Return `null`/`undefined` to skip document
 - **`query`** (object): Elasticsearch [DSL query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) to filter source documents.
-- **`splitRegex`** (RegExp): Line split regex for file/stream sources
-- **`skipHeader`** (boolean):
+- **`splitRegex`** (RegExp): Line split regex for file/stream sources when `sourceFormat` is `'ndjson'`. Default: `/\n/`.
+- **`skipHeader`** (boolean): Header skipping for file/stream sources.
+  - NDJSON: skips the first non-empty line
+  - CSV: skips the first data line only when `csvOptions.columns` does not consume headers
+  - Default: `false`
+  - Applies only to `fileName`/`stream` sources
 - **`verbose`** (boolean): Enable logging and progress bars. Default: `true`.
 
 ### Return Value
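The CSV branch of `skipHeader` is easiest to see with explicit column names: when `csvOptions.columns` is an array, `csv-parse` does not consume the header row itself, so `skipHeader: true` is what drops it. A minimal sketch (file and column names are illustrative):

```javascript
const transformer = require('node-es-transformer');

transformer({
  fileName: 'users.csv', // first line: "id,first_name,last_name"
  sourceFormat: 'csv',
  targetIndexName: 'users-index',
  // With an explicit columns array the header row would otherwise be parsed as data,
  // so skipHeader makes the reader start at the second line.
  csvOptions: { columns: ['id', 'first_name', 'last_name'] },
  skipHeader: true,
});
```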
package/dist/node-es-transformer.cjs.js
CHANGED
@@ -3,6 +3,7 @@
 var elasticsearch9 = require('es9');
 var elasticsearch8 = require('es8');
 var fs = require('fs');
+var csvParse = require('csv-parse');
 var es = require('event-stream');
 var glob = require('glob');
 var split = require('split2');
@@ -25,6 +26,7 @@ function createMappingFactory({
   targetClient,
   targetIndexName,
   mappings,
+  inferredIngestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
   verbose,
@@ -33,6 +35,7 @@ function createMappingFactory({
 }) {
   return async () => {
     let targetMappings = mappingsOverride ? undefined : mappings;
+    let defaultPipeline = pipeline;
     if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
       try {
         const mapping = await sourceClient.indices.getMapping({
@@ -71,22 +74,34 @@ function createMappingFactory({
       });
     }
     if (indexExists === false || deleteIndex === true) {
+      if (typeof defaultPipeline === 'undefined' && typeof inferredIngestPipeline === 'object' && inferredIngestPipeline !== null && typeof targetClient?.ingest?.putPipeline === 'function') {
+        const inferredPipelineName = `${targetIndexName}-inferred-pipeline`;
+        try {
+          await targetClient.ingest.putPipeline({
+            id: inferredPipelineName,
+            ...inferredIngestPipeline
+          });
+          defaultPipeline = inferredPipelineName;
+          if (verbose) console.log(`Created inferred ingest pipeline ${inferredPipelineName}`);
+        } catch (err) {
+          console.log('Error creating inferred ingest pipeline', err);
+        }
+      }
+      const settings = {
+        ...(defaultPipeline !== undefined ? {
+          'index.default_pipeline': defaultPipeline
+        } : {}),
+        ...(indexMappingTotalFieldsLimit !== undefined ? {
+          'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
+          'index.number_of_shards': 1,
+          'index.number_of_replicas': 0
+        } : {})
+      };
       const resp = await targetClient.indices.create({
         index: targetIndexName,
         mappings: targetMappings,
-        ...(pipeline !== undefined ? {
-          settings: {
-            index: {
-              default_pipeline: pipeline
-            }
-          }
-        } : {}),
-        ...(indexMappingTotalFieldsLimit !== undefined ? {
-          settings: {
-            'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
-            'index.number_of_shards': 1,
-            'index.number_of_replicas': 0
-          }
+        ...(Object.keys(settings).length > 0 ? {
+          settings
         } : {})
       });
       if (verbose) console.log('Created target mapping', resp);
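For reference, when no explicit `pipeline` option is set and inference produced an `ingest_pipeline`, the index created above ends up with a default pipeline named after the target index. A rough sketch of the resulting create-index request, assuming a target index called `users-index` (the name is illustrative):

```javascript
// Approximate shape of the targetClient.indices.create() call in that case:
const createRequest = {
  index: 'users-index',
  mappings: { properties: { /* inferred or user-provided field mappings */ } },
  settings: {
    'index.default_pipeline': 'users-index-inferred-pipeline',
  },
};
```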
@@ -98,37 +113,89 @@ function createMappingFactory({
   };
 }
 
-function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
-  function startIndex(files) {
-    let finished = false;
-    const file = files.shift();
+function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
+  const options = {
+    bom: true,
+    columns: true,
+    trim: true,
+    skip_empty_lines: true,
+    ...csvOptions
+  };
+  const consumesHeader = options.columns === true || typeof options.columns === 'function';
+  if (skipHeader && !consumesHeader && typeof options.from_line === 'undefined') {
+    options.from_line = 2;
+  }
+  return options;
+}
+
+function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
+  function addParsedDoc(parsed, file, streamRef) {
+    const context = {
+      fileName: file
+    };
+    const doc = typeof transform === 'function' ? transform(parsed, context) : parsed;
+
+    // if doc is null/undefined we'll skip indexing it
+    if (doc === null || typeof doc === 'undefined') {
+      streamRef.resume();
+      return;
+    }
+
+    // the transform callback may return an array of docs so we can emit
+    // multiple docs from a single line
+    if (Array.isArray(doc)) {
+      doc.forEach(d => {
+        if (d === null || typeof d === 'undefined') return;
+        indexer.add(d);
+      });
+      return;
+    }
+    indexer.add(doc);
+  }
+  function createNdjsonReader(file) {
+    let skippedHeader = false;
     const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
       try {
         // skip empty lines
         if (line === '') {
           return;
         }
-
-
-        // if doc is undefined we'll skip indexing it
-        if (typeof doc === 'undefined') {
-          s.resume();
-          return;
-        }
-
-        // the transform callback may return an array of docs so we can emit
-        // multiple docs from a single line
-        if (Array.isArray(doc)) {
-          doc.forEach(d => indexer.add(d));
+        if (skipHeader && !skippedHeader) {
+          skippedHeader = true;
           return;
         }
-
+        const parsed = JSON.parse(line);
+        addParsedDoc(parsed, file, s);
       } catch (e) {
         console.log('error', e);
       }
     }).on('error', err => {
       console.log('Error while reading file.', err);
-    })
+    }));
+    return s;
+  }
+  function createCsvReader(file) {
+    const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
+    const s = fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
+      try {
+        addParsedDoc(record, file, s);
+      } catch (e) {
+        console.log('error', e);
+      }
+    }).on('error', err => {
+      console.log('Error while reading CSV file.', err);
+    }));
+    return s;
+  }
+  function startIndex(files) {
+    let finished = false;
+    if (files.length === 0) {
+      indexer.finish();
+      return;
+    }
+    const file = files.shift();
+    const s = sourceFormat === 'csv' ? createCsvReader(file) : createNdjsonReader(file);
+    s.on('end', () => {
       if (verbose) console.log('Read entire file: ', file);
       if (files.length > 0) {
         startIndex(files);
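In short, user-supplied `csvOptions` are layered over opinionated defaults, and `skipHeader` only adds `from_line: 2` when the configured `columns` value will not consume the header row itself. `getCsvParserOptions` is internal to the bundle; the calls below are only a sketch of the values it computes:

```javascript
// Defaults only: the header row becomes the column names, so nothing extra is skipped.
getCsvParserOptions({}, true);
// => { bom: true, columns: true, trim: true, skip_empty_lines: true }

// Explicit column names: the header row is plain data to csv-parse,
// so skipHeader translates into from_line: 2.
getCsvParserOptions({ columns: ['id', 'name'] }, true);
// => { bom: true, columns: ['id', 'name'], trim: true, skip_empty_lines: true, from_line: 2 }
```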
@@ -136,7 +203,7 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
       }
       indexer.finish();
       finished = true;
-    })
+    });
     indexer.queueEmitter.on('pause', () => {
       if (finished) return;
       s.pause();
@@ -157,29 +224,26 @@
 }
 
 const EventEmitter = require('events');
-const queueEmitter = new EventEmitter();
 const parallelCalls = 5;
 
 // a simple helper queue to bulk index documents
 function indexQueueFactory({
   targetClient: client,
   targetIndexName,
-  bufferSize = DEFAULT_BUFFER_SIZE
-  skipHeader = false
+  bufferSize = DEFAULT_BUFFER_SIZE
 }) {
+  const queueEmitter = new EventEmitter();
   let docsPerSecond = 0;
   const flushBytes = bufferSize * 1024; // Convert KB to Bytes
   const highWaterMark = flushBytes * parallelCalls;
 
-  // Create a Readable stream
-  const stream$1 = new stream.Readable({
-    read() {},
-    // Implement read but we manage pushing manually
+  // Create a PassThrough stream (readable + writable) for proper backpressure
+  const stream$1 = new stream.PassThrough({
     highWaterMark // Buffer size for backpressure management
   });
   async function* ndjsonStreamIterator(readableStream) {
     let buffer = ''; // To hold the incomplete data
-
+
     try {
       // Iterate over the stream using async iteration
       for await (const chunk of readableStream) {
@@ -193,16 +257,14 @@ function indexQueueFactory({
 
       // Yield each complete JSON object
       for (const line of lines) {
-        if (line.trim()) {
-
-
-
-
-
-
-
-          console.error('Failed to parse JSON:', err);
-        }
+        if (!line.trim()) {
+          continue;
+        }
+        try {
+          yield JSON.parse(line); // Parse and yield the JSON object
+        } catch (err) {
+          // Handle JSON parse errors if necessary
+          console.error('Failed to parse JSON:', err);
         }
       }
     }
@@ -278,7 +340,7 @@ function indexQueueFactory({
       if (finished) {
         throw new Error('Unexpected doc added after indexer should finish.');
       }
-      const canContinue = stream$1.push(`${JSON.stringify(doc)}\n`);
+      const canContinue = stream$1.write(`${JSON.stringify(doc)}\n`);
      if (!canContinue) {
        queueEmitter.emit('pause');
 
@@ -291,7 +353,7 @@ function indexQueueFactory({
     },
     finish: () => {
       finished = true;
-      stream$1.push(null);
+      stream$1.end();
     },
     queueEmitter
   };
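The switch from a manually pushed `Readable` to a `PassThrough` lets the queue lean on Node's standard backpressure contract: `write()` returns `false` once the internal buffer passes `highWaterMark`, which is what triggers the `pause` event the readers listen for, and `end()` signals that no more documents will arrive. A standalone sketch of that contract (not the package's own code):

```javascript
const { PassThrough } = require('stream');

// A deliberately tiny highWaterMark forces backpressure on the first write.
const pass = new PassThrough({ highWaterMark: 16 });

const canContinue = pass.write(JSON.stringify({ hello: 'world' }) + '\n');
if (!canContinue) {
  // Mirrors the diff: the indexer emits 'pause' so readers stop feeding docs
  // until the bulk consumer has drained the stream.
  console.log('backpressure: pause the readers');
}
pass.end(); // equivalent of indexer.finish()
```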
@@ -408,40 +470,154 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
   };
 }
 
-
+const DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES = 100000;
+const DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE = 1000;
+function readSample(filePath, sampleBytes) {
+  const fd = fs.openSync(filePath, 'r');
+  try {
+    const buffer = Buffer.alloc(sampleBytes);
+    const bytesRead = fs.readSync(fd, buffer, 0, sampleBytes, 0);
+    return buffer.subarray(0, bytesRead).toString('utf8');
+  } finally {
+    fs.closeSync(fd);
+  }
+}
+function emptyInferenceResult(mappings) {
+  return {
+    mappings,
+    ingestPipeline: undefined
+  };
+}
+async function inferMappingsFromSource({
+  targetClient,
+  fileName,
+  sourceFormat,
+  csvOptions,
+  skipHeader,
+  mappings,
+  inferMappings,
+  inferMappingsOptions,
+  verbose
+}) {
+  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
+    return emptyInferenceResult(mappings);
+  }
+  if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
+    return emptyInferenceResult(mappings);
+  }
+  const files = glob.globSync(fileName);
+  if (files.length === 0) {
+    if (verbose) console.log(`No files matched for mapping inference: ${fileName}`);
+    return emptyInferenceResult(mappings);
+  }
+  const {
+    sampleBytes = DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES,
+    ...requestParams
+  } = inferMappingsOptions || {};
+  const sampleText = readSample(files[0], sampleBytes);
+  if (!sampleText || sampleText.trim() === '') {
+    if (verbose) console.log('Skipping mapping inference because the sample text is empty.');
+    return emptyInferenceResult(mappings);
+  }
+  const params = {
+    body: sampleText,
+    lines_to_sample: DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE,
+    ...requestParams
+  };
+  if (typeof params.format === 'undefined') {
+    params.format = sourceFormat === 'csv' ? 'delimited' : 'ndjson';
+  }
+  if (sourceFormat === 'csv') {
+    if (typeof params.delimiter === 'undefined' && typeof csvOptions?.delimiter === 'string') {
+      params.delimiter = csvOptions.delimiter;
+    }
+    if (typeof params.quote === 'undefined' && typeof csvOptions?.quote === 'string') {
+      params.quote = csvOptions.quote;
+    }
+    if (typeof params.has_header_row === 'undefined' && typeof csvOptions?.columns === 'boolean') {
+      params.has_header_row = csvOptions.columns;
+    }
+    if (typeof params.has_header_row === 'undefined' && skipHeader) {
+      params.has_header_row = true;
+    }
+  }
+  try {
+    const response = await targetClient.textStructure.findStructure(params);
+    if (response?.mappings && verbose) {
+      console.log(`Inferred mappings via _text_structure/find_structure from ${files[0]}`);
+    }
+    if (response?.ingest_pipeline && verbose) {
+      console.log('Inferred ingest pipeline via _text_structure/find_structure');
+    }
+    return {
+      mappings: response?.mappings || mappings,
+      ingestPipeline: response?.ingest_pipeline
+    };
+  } catch (error) {
+    if (verbose) {
+      console.log('Could not infer mappings via _text_structure/find_structure:', error.message);
+    }
+    return emptyInferenceResult(mappings);
+  }
+}
+
+function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
+  function addParsedDoc(parsed, streamRef) {
+    const doc = typeof transform === 'function' ? transform(parsed) : parsed;
+
+    // if doc is null/undefined we'll skip indexing it
+    if (doc === null || typeof doc === 'undefined') {
+      streamRef.resume();
+      return;
+    }
+
+    // the transform callback may return an array of docs so we can emit
+    // multiple docs from a single line
+    if (Array.isArray(doc)) {
+      doc.forEach(d => {
+        if (d === null || typeof d === 'undefined') return;
+        indexer.add(d);
+      });
+      return;
+    }
+    indexer.add(doc);
+  }
   function startIndex() {
     let finished = false;
-    const s = stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+    const s = sourceFormat === 'csv' ? stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
       try {
-
-        if (line === '') {
-          return;
-        }
-        const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
-
-        // if doc is undefined we'll skip indexing it
-        if (typeof doc === 'undefined') {
-          s.resume();
-          return;
-        }
-
-        // the transform callback may return an array of docs so we can emit
-        // multiple docs from a single line
-        if (Array.isArray(doc)) {
-          doc.forEach(d => indexer.add(d));
-          return;
-        }
-        indexer.add(doc);
+        addParsedDoc(record, s);
       } catch (e) {
         console.log('error', e);
       }
     }).on('error', err => {
-      console.log('Error while reading stream.', err);
-    })
+      console.log('Error while reading CSV stream.', err);
+    })) : (() => {
+      let skippedHeader = false;
+      return stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+        try {
+          // skip empty lines
+          if (line === '') {
+            return;
+          }
+          if (skipHeader && !skippedHeader) {
+            skippedHeader = true;
+            return;
+          }
+          const parsed = JSON.parse(line);
+          addParsedDoc(parsed, s);
+        } catch (e) {
+          console.log('error', e);
+        }
+      }).on('error', err => {
+        console.log('Error while reading stream.', err);
+      }));
+    })();
+    s.on('end', () => {
       if (verbose) console.log('Read entire stream.');
       indexer.finish();
       finished = true;
-    })
+    });
     indexer.queueEmitter.on('pause', () => {
       if (finished) return;
       s.pause();
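Because `streamReaderFactory` now takes `sourceFormat` and `csvOptions`, CSV data can come from any readable stream, not only from `fileName`. A minimal sketch; the file stream here is just a stand-in for whatever produces the data:

```javascript
const fs = require('fs');
const transformer = require('node-es-transformer');

transformer({
  stream: fs.createReadStream('users.csv'), // any Readable works
  sourceFormat: 'csv',
  targetIndexName: 'users-index',
  csvOptions: { columns: true },
});
```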
@@ -530,11 +706,15 @@ async function transformer({
   searchSize = DEFAULT_SEARCH_SIZE,
   stream,
   fileName,
+  sourceFormat = 'ndjson',
+  csvOptions = {},
   splitRegex = /\n/,
   sourceIndexName,
   targetIndexName,
   mappings,
   mappingsOverride = false,
+  inferMappings = false,
+  inferMappingsOptions = {},
   indexMappingTotalFieldsLimit,
   pipeline,
   populatedFields = false,
@@ -553,12 +733,24 @@ async function transformer({
   // Support both old (config) and new (client instance) patterns
   const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
   const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
+  const inferenceResult = await inferMappingsFromSource({
+    targetClient,
+    fileName,
+    sourceFormat,
+    csvOptions,
+    skipHeader,
+    mappings,
+    inferMappings,
+    inferMappingsOptions,
+    verbose
+  });
   const createMapping = createMappingFactory({
     sourceClient,
     sourceIndexName,
     targetClient,
     targetIndexName,
-    mappings,
+    mappings: inferenceResult.mappings,
+    inferredIngestPipeline: inferenceResult.ingestPipeline,
     mappingsOverride,
     indexMappingTotalFieldsLimit,
     verbose,
@@ -568,8 +760,12 @@ async function transformer({
   const indexer = indexQueueFactory({
     targetClient,
     targetIndexName,
-    bufferSize
-
+    bufferSize});
+  function validateSourceFormat() {
+    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "csv".`);
+    }
+  }
   function getReader() {
     if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
       throw Error('Only either one of fileName or sourceIndexName can be specified.');
@@ -578,13 +774,15 @@ async function transformer({
       throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
     }
     if (typeof fileName !== 'undefined') {
-
+      validateSourceFormat();
+      return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
     }
     if (typeof sourceIndexName !== 'undefined') {
       return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
     }
     if (typeof stream !== 'undefined') {
-
+      validateSourceFormat();
+      return streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
     }
     return null;
   }