node-es-transformer 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,15 @@
  import elasticsearch9 from 'es9';
  import elasticsearch8 from 'es8';
+ import parquet from '@dsnp/parquetjs';
+ import * as arrow from 'apache-arrow';
  import fs from 'fs';
+ import { parse } from 'csv-parse';
  import es from 'event-stream';
  import { globSync } from 'glob';
  import split from 'split2';
- import { Readable } from 'stream';
+ import { PassThrough } from 'stream';
  import cliProgress from 'cli-progress';
+ import pino from 'pino';

  // In earlier versions this was used to set the number of docs to index in a
  // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -23,14 +27,16 @@ function createMappingFactory({
  targetClient,
  targetIndexName,
  mappings,
+ inferredIngestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger
  }) {
  return async () => {
  let targetMappings = mappingsOverride ? undefined : mappings;
+ let defaultPipeline = pipeline;
  if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
  try {
  const mapping = await sourceClient.indices.getMapping({
@@ -45,7 +51,10 @@ function createMappingFactory({
  }
  }
  } catch (err) {
- console.log('Error reading source mapping', err);
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Error reading source mapping');
  return;
  }
  }
@@ -69,93 +78,312 @@ function createMappingFactory({
  });
  }
  if (indexExists === false || deleteIndex === true) {
- const resp = await targetClient.indices.create({
- index: targetIndexName,
- mappings: targetMappings,
- ...(pipeline !== undefined ? {
- settings: {
- index: {
- default_pipeline: pipeline
- }
- }
+ if (typeof defaultPipeline === 'undefined' && typeof inferredIngestPipeline === 'object' && inferredIngestPipeline !== null && typeof targetClient?.ingest?.putPipeline === 'function') {
+ const inferredPipelineName = `${targetIndexName}-inferred-pipeline`;
+ try {
+ await targetClient.ingest.putPipeline({
+ id: inferredPipelineName,
+ ...inferredIngestPipeline
+ });
+ defaultPipeline = inferredPipelineName;
+ logger.info({
+ inferredPipelineName
+ }, 'Created inferred ingest pipeline');
+ } catch (err) {
+ logger.error({
+ err,
+ inferredPipelineName
+ }, 'Error creating inferred ingest pipeline');
+ }
+ }
+ const settings = {
+ ...(defaultPipeline !== undefined ? {
+ 'index.default_pipeline': defaultPipeline
  } : {}),
  ...(indexMappingTotalFieldsLimit !== undefined ? {
- settings: {
- 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
- 'index.number_of_shards': 1,
- 'index.number_of_replicas': 0
- }
+ 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
+ 'index.number_of_shards': 1,
+ 'index.number_of_replicas': 0
+ } : {})
+ };
+ const response = await targetClient.indices.create({
+ index: targetIndexName,
+ mappings: targetMappings,
+ ...(Object.keys(settings).length > 0 ? {
+ settings
  } : {})
  });
- if (verbose) console.log('Created target mapping', resp);
+ logger.info({
+ targetIndexName,
+ response
+ }, 'Created target mapping');
  }
  } catch (err) {
- console.log('Error creating target mapping', err);
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error creating target mapping');
  }
  }
  };
  }

- function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
- function startIndex(files) {
- let finished = false;
- const file = files.shift();
- const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
+ const options = {
+ bom: true,
+ columns: true,
+ trim: true,
+ skip_empty_lines: true,
+ ...csvOptions
+ };
+ const consumesHeader = options.columns === true || typeof options.columns === 'function';
+ if (skipHeader && !consumesHeader && typeof options.from_line === 'undefined') {
+ options.from_line = 2;
+ }
+ return options;
+ }
+
+ function createPauseWaiter$1(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed, file) {
+ const context = {
+ fileName: file
+ };
+ const doc = typeof transform === 'function' ? transform(parsed, context) : parsed;
+
+ // if doc is null/undefined we'll skip indexing it
+ if (doc === null || typeof doc === 'undefined') {
+ return;
+ }
+
+ // the transform callback may return an array of docs so we can emit
+ // multiple docs from a single line
+ if (Array.isArray(doc)) {
+ doc.forEach(d => {
+ if (d === null || typeof d === 'undefined') return;
+ indexer.add(d);
+ });
+ return;
+ }
+ indexer.add(doc);
+ }
+ async function processParquetFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ const reader = await parquet.ParquetReader.openFile(file);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ try {
+ const reader = await arrow.RecordBatchReader.from(fs.createReadStream(file));
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ }
+ }
+ function processStreamFile(file, buildStream, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildStream();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info({
+ file
+ }, 'Read entire file');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err,
+ file
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processNdjsonFile(file) {
+ let skippedHeader = false;
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
  return;
  }
- const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
-
- // if doc is undefined we'll skip indexing it
- if (typeof doc === 'undefined') {
- s.resume();
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
  return;
  }
-
- // the transform callback may return an array of docs so we can emit
- // multiple docs from a single line
- if (Array.isArray(doc)) {
- doc.forEach(d => indexer.add(d));
- return;
- }
- indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process NDJSON line');
  }
  }).on('error', err => {
- console.log('Error while reading file.', err);
- }).on('end', () => {
- if (verbose) console.log('Read entire file: ', file);
- if (files.length > 0) {
- startIndex(files);
- return;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading file');
+ })), 'Error while reading file');
+ }
+ function processCsvFile(file) {
+ const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
+ try {
+ addParsedDoc(record, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process CSV record');
  }
+ }).on('error', err => {
+ logger.error({
+ err,
+ file
+ }, 'Error while reading CSV file');
+ })), 'Error while reading CSV file');
+ }
+ async function processFile(file) {
+ if (sourceFormat === 'csv') {
+ await processCsvFile(file);
+ return;
+ }
+ if (sourceFormat === 'ndjson') {
+ await processNdjsonFile(file);
+ return;
+ }
+ if (sourceFormat === 'parquet') {
+ await processParquetFile(file);
+ return;
+ }
+ if (sourceFormat === 'arrow') {
+ await processArrowFile(file);
+ return;
+ }
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ async function startIndex(files) {
+ if (files.length === 0) {
  indexer.finish();
- finished = true;
- }));
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ return;
+ }
+ try {
+ for (const file of files) {
+ // eslint-disable-next-line no-await-in-loop
+ await processFile(file);
+ }
+ } catch (err) {
+ logger.error({
+ err,
+ files
+ }, 'Error while processing files');
+ } finally {
+ indexer.finish();
+ }
  }
  return () => {
  try {
  const files = globSync(fileName);
  startIndex(files);
- } catch (error) {
- console.log('Error matching files:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ fileName
+ }, 'Error matching files');
+ indexer.finish();
  }
  };
  }

  const EventEmitter = require('events');
- const queueEmitter = new EventEmitter();
  const parallelCalls = 5;

  // a simple helper queue to bulk index documents
@@ -163,21 +391,20 @@ function indexQueueFactory({
  targetClient: client,
  targetIndexName,
  bufferSize = DEFAULT_BUFFER_SIZE,
- skipHeader = false
+ logger
  }) {
+ const queueEmitter = new EventEmitter();
  let docsPerSecond = 0;
  const flushBytes = bufferSize * 1024; // Convert KB to Bytes
  const highWaterMark = flushBytes * parallelCalls;

- // Create a Readable stream
- const stream = new Readable({
- read() {},
- // Implement read but we manage pushing manually
+ // Create a PassThrough stream (readable + writable) for proper backpressure
+ const stream = new PassThrough({
  highWaterMark // Buffer size for backpressure management
  });
  async function* ndjsonStreamIterator(readableStream) {
  let buffer = ''; // To hold the incomplete data
- let skippedHeader = false;
+
  try {
  // Iterate over the stream using async iteration
  for await (const chunk of readableStream) {
@@ -191,16 +418,15 @@ function indexQueueFactory({

  // Yield each complete JSON object
  for (const line of lines) {
- if (line.trim()) {
- try {
- if (!skipHeader || skipHeader && !skippedHeader) {
- yield JSON.parse(line); // Parse and yield the JSON object
- skippedHeader = true;
- }
- } catch (err) {
- // Handle JSON parse errors if necessary
- console.error('Failed to parse JSON:', err);
- }
+ if (!line.trim()) {
+ continue;
+ }
+ try {
+ yield JSON.parse(line); // Parse and yield the JSON object
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to parse JSON from NDJSON stream');
  }
  }
  }
@@ -210,7 +436,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(buffer);
  } catch (err) {
- console.error('Failed to parse final JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse final JSON from NDJSON stream');
  }
  }
  } finally {
@@ -236,7 +464,7 @@ function indexQueueFactory({
  flushInterval: 1000,
  refreshOnCompletion: true,
  datasource: ndjsonStreamIterator(stream),
- onDocument(doc) {
+ onDocument() {
  docsPerSecond++;
  return {
  index: {
@@ -245,9 +473,13 @@ function indexQueueFactory({
  };
  }
  });
- } catch (error) {
- console.error('Error during bulk indexing:', error);
- throw error;
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error during bulk indexing');
+ queueEmitter.emit('error', err);
+ throw err;
  } finally {
  // Clean up interval
  clearInterval(interval);
@@ -276,7 +508,7 @@ function indexQueueFactory({
  if (finished) {
  throw new Error('Unexpected doc added after indexer should finish.');
  }
- const canContinue = stream.push(`${JSON.stringify(doc)}\n`);
+ const canContinue = stream.write(`${JSON.stringify(doc)}\n`);
  if (!canContinue) {
  queueEmitter.emit('pause');

@@ -289,7 +521,7 @@ function indexQueueFactory({
  },
  finish: () => {
  finished = true;
- stream.push(null);
+ stream.end();
  },
  queueEmitter
  };
@@ -297,7 +529,7 @@ function indexQueueFactory({

  // create a new progress bar instance and use shades_classic theme
  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
- function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
  return async function indexReader() {
  let docsNum = 0;
  let scrollId;
@@ -316,8 +548,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  maxRetries: 0
  });
  return Object.keys(response.fields);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Failed to fetch populated fields');
  }
  }
  function search(fields) {
@@ -361,8 +596,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  return;
  }
  indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process source index document');
  }
  }
  async function fetchNextResponse() {
@@ -406,48 +643,339 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  };
  }

- function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
- function startIndex() {
- let finished = false;
- const s = stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ const DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES = 100000;
+ const DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE = 1000;
+ function readSample(filePath, sampleBytes) {
+ const fd = fs.openSync(filePath, 'r');
+ try {
+ const buffer = Buffer.alloc(sampleBytes);
+ const bytesRead = fs.readSync(fd, buffer, 0, sampleBytes, 0);
+ return buffer.subarray(0, bytesRead).toString('utf8');
+ } finally {
+ fs.closeSync(fd);
+ }
+ }
+ function emptyInferenceResult(mappings) {
+ return {
+ mappings,
+ ingestPipeline: undefined
+ };
+ }
+ async function inferMappingsFromSource({
+ targetClient,
+ fileName,
+ sourceFormat,
+ csvOptions,
+ skipHeader,
+ mappings,
+ inferMappings,
+ inferMappingsOptions,
+ logger
+ }) {
+ if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
+ return emptyInferenceResult(mappings);
+ }
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+ logger.info({
+ sourceFormat
+ }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+ return emptyInferenceResult(mappings);
+ }
+ if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
+ return emptyInferenceResult(mappings);
+ }
+ const files = globSync(fileName);
+ if (files.length === 0) {
+ logger.info({
+ fileName
+ }, 'No files matched for mapping inference');
+ return emptyInferenceResult(mappings);
+ }
+ const {
+ sampleBytes = DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES,
+ ...requestParams
+ } = inferMappingsOptions || {};
+ const sampleText = readSample(files[0], sampleBytes);
+ if (!sampleText || sampleText.trim() === '') {
+ logger.info('Skipping mapping inference because the sample text is empty');
+ return emptyInferenceResult(mappings);
+ }
+ const params = {
+ body: sampleText,
+ lines_to_sample: DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE,
+ ...requestParams
+ };
+ if (typeof params.format === 'undefined') {
+ params.format = sourceFormat === 'csv' ? 'delimited' : 'ndjson';
+ }
+ if (sourceFormat === 'csv') {
+ if (typeof params.delimiter === 'undefined' && typeof csvOptions?.delimiter === 'string') {
+ params.delimiter = csvOptions.delimiter;
+ }
+ if (typeof params.quote === 'undefined' && typeof csvOptions?.quote === 'string') {
+ params.quote = csvOptions.quote;
+ }
+ if (typeof params.has_header_row === 'undefined' && typeof csvOptions?.columns === 'boolean') {
+ params.has_header_row = csvOptions.columns;
+ }
+ if (typeof params.has_header_row === 'undefined' && skipHeader) {
+ params.has_header_row = true;
+ }
+ }
+ try {
+ const response = await targetClient.textStructure.findStructure(params);
+ if (response?.mappings) {
+ logger.info({
+ file: files[0]
+ }, 'Inferred mappings via _text_structure/find_structure');
+ }
+ if (response?.ingest_pipeline) {
+ logger.info('Inferred ingest pipeline via _text_structure/find_structure');
+ }
+ return {
+ mappings: response?.mappings || mappings,
+ ingestPipeline: response?.ingest_pipeline
+ };
+ } catch (err) {
+ logger.warn({
+ err
+ }, 'Could not infer mappings via _text_structure/find_structure');
+ return emptyInferenceResult(mappings);
+ }
+ }
+
+ const DEFAULT_LOG_LEVEL = 'info';
+ function resolveLogLevel(verbose = true) {
+ if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+ return process.env.LOG_LEVEL;
+ }
+ return verbose ? DEFAULT_LOG_LEVEL : 'error';
+ }
+ function createLogger({
+ logger,
+ verbose = true
+ } = {}) {
+ if (logger && typeof logger === 'object') {
+ return logger;
+ }
+ return pino({
+ name: 'node-es-transformer',
+ level: resolveLogLevel(verbose),
+ timestamp: pino.stdTimeFunctions.isoTime,
+ serializers: {
+ err: pino.stdSerializers.err,
+ error: pino.stdSerializers.err
+ }
+ });
+ }
+ function createChildLogger(logger, bindings) {
+ if (!logger || typeof logger.child !== 'function') {
+ return logger;
+ }
+ return logger.child(bindings);
+ }
+
+ function createPauseWaiter(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ async function readStreamToBuffer(stream) {
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+ }
+ return Buffer.concat(chunks);
+ }
+ function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed) {
+ const doc = typeof transform === 'function' ? transform(parsed) : parsed;
+
+ // if doc is null/undefined we'll skip indexing it
+ if (doc === null || typeof doc === 'undefined') {
+ return;
+ }
+
+ // the transform callback may return an array of docs so we can emit
+ // multiple docs from a single line
+ if (Array.isArray(doc)) {
+ doc.forEach(d => {
+ if (d === null || typeof d === 'undefined') return;
+ indexer.add(d);
+ });
+ return;
+ }
+ indexer.add(doc);
+ }
+ async function processParquetStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ const parquetBuffer = await readStreamToBuffer(stream);
+ const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ try {
+ const reader = await arrow.RecordBatchReader.from(stream);
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ }
+ }
+ function processPipeline(buildPipeline, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildPipeline();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info('Read entire stream');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processCsvStream() {
+ return processPipeline(() => stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
+ try {
+ addParsedDoc(record);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process CSV stream record');
+ }
+ }).on('error', err => {
+ logger.error({
+ err
+ }, 'Error while reading CSV stream');
+ })), 'Error while reading CSV stream');
+ }
+ function processNdjsonStream() {
+ let skippedHeader = false;
+ return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
  return;
  }
- const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
-
- // if doc is undefined we'll skip indexing it
- if (typeof doc === 'undefined') {
- s.resume();
- return;
- }
-
- // the transform callback may return an array of docs so we can emit
- // multiple docs from a single line
- if (Array.isArray(doc)) {
- doc.forEach(d => indexer.add(d));
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
  return;
  }
- indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process NDJSON stream line');
  }
  }).on('error', err => {
- console.log('Error while reading stream.', err);
- }).on('end', () => {
- if (verbose) console.log('Read entire stream.');
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ })), 'Error while reading stream');
+ }
+ async function startIndex() {
+ try {
+ if (sourceFormat === 'csv') {
+ await processCsvStream();
+ } else if (sourceFormat === 'ndjson') {
+ await processNdjsonStream();
+ } else if (sourceFormat === 'parquet') {
+ await processParquetStream();
+ } else if (sourceFormat === 'arrow') {
+ await processArrowStream();
+ } else {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ } finally {
  indexer.finish();
- finished = true;
- }));
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  startIndex();
@@ -528,22 +1056,31 @@ async function transformer({
  searchSize = DEFAULT_SEARCH_SIZE,
  stream,
  fileName,
+ sourceFormat = 'ndjson',
+ csvOptions = {},
  splitRegex = /\n/,
  sourceIndexName,
  targetIndexName,
  mappings,
  mappingsOverride = false,
+ inferMappings = false,
+ inferMappingsOptions = {},
  indexMappingTotalFieldsLimit,
  pipeline,
  populatedFields = false,
  query,
  skipHeader = false,
  transform,
- verbose = true
+ verbose = true,
+ logger: loggerInput
  }) {
  if (typeof targetIndexName === 'undefined') {
  throw Error('targetIndexName must be specified.');
  }
+ const logger = createLogger({
+ logger: loggerInput,
+ verbose
+ });
  const defaultClientConfig = {
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
  };
@@ -551,23 +1088,47 @@ async function transformer({
  // Support both old (config) and new (client instance) patterns
  const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
  const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
+ const inferenceResult = await inferMappingsFromSource({
+ targetClient,
+ fileName,
+ sourceFormat,
+ csvOptions,
+ skipHeader,
+ mappings,
+ inferMappings,
+ inferMappingsOptions,
+ logger: createChildLogger(logger, {
+ component: 'mapping-inference'
+ })
+ });
  const createMapping = createMappingFactory({
  sourceClient,
  sourceIndexName,
  targetClient,
  targetIndexName,
- mappings,
+ mappings: inferenceResult.mappings,
+ inferredIngestPipeline: inferenceResult.ingestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger: createChildLogger(logger, {
+ component: 'create-mapping'
+ })
  });
  const indexer = indexQueueFactory({
  targetClient,
  targetIndexName,
  bufferSize,
- skipHeader});
+ logger: createChildLogger(logger, {
+ component: 'index-queue'
+ })
+ });
+ function validateSourceFormat() {
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
+ }
+ }
  function getReader() {
  if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
  throw Error('Only either one of fileName or sourceIndexName can be specified.');
@@ -576,17 +1137,28 @@ async function transformer({
  throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
  }
  if (typeof fileName !== 'undefined') {
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose);
+ validateSourceFormat();
+ return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'file-reader'
+ }));
  }
  if (typeof sourceIndexName !== 'undefined') {
- return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+ return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+ component: 'index-reader'
+ }));
  }
  if (typeof stream !== 'undefined') {
- return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
+ validateSourceFormat();
+ return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'stream-reader'
+ }));
  }
  return null;
  }
  const reader = getReader();
+ if (typeof reader !== 'function') {
+ throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+ }
  try {
  const indexExists = await targetClient.indices.exists({
  index: targetIndexName
@@ -603,8 +1175,11 @@ async function transformer({
  } else {
  reader();
  }
- } catch (error) {
- console.error('Error checking index existence:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error checking index existence');
  } finally {
  // targetClient.close();
  }
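
Taken together, 1.2.0 adds CSV, Parquet and Arrow source formats, optional mapping inference via the Elasticsearch _text_structure/find_structure API, and a pino-based logger option that replaces the old verbose console logging. A minimal usage sketch based on the transformer() options visible in this diff, assuming the function is exposed as the package's default export; the file glob, index name and option values below are placeholders, not part of the release:

import transformer from 'node-es-transformer';

// Hypothetical example: 'logs-*.csv' and 'my-csv-index' are placeholders.
await transformer({
  fileName: 'logs-*.csv',
  targetIndexName: 'my-csv-index',
  sourceFormat: 'csv', // new in 1.2.0: 'ndjson' (default), 'csv', 'parquet' or 'arrow'
  csvOptions: { delimiter: ';' }, // merged over the csv-parse defaults shown above (bom, columns, trim, skip_empty_lines)
  inferMappings: true, // only applies when no explicit mappings are passed and a fileName is set; may also create an inferred ingest pipeline
  transform: doc => (doc.id ? doc : undefined), // return null/undefined to skip a document, or an array to emit several
  // logger: myPinoLogger, // optional: pass an existing pino-compatible logger instead of the built-in one
});

Per the createLogger/createChildLogger changes above, a custom logger object is used as-is, and per-component child loggers are only created when it exposes a pino-style child() method; when no logger is passed, the built-in pino instance takes its level from the LOG_LEVEL environment variable, falling back to 'info' (or 'error' when verbose is false).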