node-es-transformer 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,12 +2,35 @@
 
  var elasticsearch9 = require('es9');
  var elasticsearch8 = require('es8');
+ var parquet = require('@dsnp/parquetjs');
+ var arrow = require('apache-arrow');
  var fs = require('fs');
+ var csvParse = require('csv-parse');
  var es = require('event-stream');
  var glob = require('glob');
  var split = require('split2');
  var stream = require('stream');
  var cliProgress = require('cli-progress');
+ var pino = require('pino');
+
+ function _interopNamespaceDefault(e) {
+ var n = Object.create(null);
+ if (e) {
+ Object.keys(e).forEach(function (k) {
+ if (k !== 'default') {
+ var d = Object.getOwnPropertyDescriptor(e, k);
+ Object.defineProperty(n, k, d.get ? d : {
+ enumerable: true,
+ get: function () { return e[k]; }
+ });
+ }
+ });
+ }
+ n.default = e;
+ return Object.freeze(n);
+ }
+
+ var arrow__namespace = /*#__PURE__*/_interopNamespaceDefault(arrow);
 
  // In earlier versions this was used to set the number of docs to index in a
  // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -25,14 +48,16 @@ function createMappingFactory({
  targetClient,
  targetIndexName,
  mappings,
+ inferredIngestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger
  }) {
  return async () => {
  let targetMappings = mappingsOverride ? undefined : mappings;
+ let defaultPipeline = pipeline;
  if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
  try {
  const mapping = await sourceClient.indices.getMapping({
@@ -47,7 +72,10 @@ function createMappingFactory({
  }
  }
  } catch (err) {
- console.log('Error reading source mapping', err);
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Error reading source mapping');
  return;
  }
  }
@@ -71,93 +99,312 @@ function createMappingFactory({
  });
  }
  if (indexExists === false || deleteIndex === true) {
- const resp = await targetClient.indices.create({
- index: targetIndexName,
- mappings: targetMappings,
- ...(pipeline !== undefined ? {
- settings: {
- index: {
- default_pipeline: pipeline
- }
- }
+ if (typeof defaultPipeline === 'undefined' && typeof inferredIngestPipeline === 'object' && inferredIngestPipeline !== null && typeof targetClient?.ingest?.putPipeline === 'function') {
+ const inferredPipelineName = `${targetIndexName}-inferred-pipeline`;
+ try {
+ await targetClient.ingest.putPipeline({
+ id: inferredPipelineName,
+ ...inferredIngestPipeline
+ });
+ defaultPipeline = inferredPipelineName;
+ logger.info({
+ inferredPipelineName
+ }, 'Created inferred ingest pipeline');
+ } catch (err) {
+ logger.error({
+ err,
+ inferredPipelineName
+ }, 'Error creating inferred ingest pipeline');
+ }
+ }
+ const settings = {
+ ...(defaultPipeline !== undefined ? {
+ 'index.default_pipeline': defaultPipeline
  } : {}),
  ...(indexMappingTotalFieldsLimit !== undefined ? {
- settings: {
- 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
- 'index.number_of_shards': 1,
- 'index.number_of_replicas': 0
- }
+ 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
+ 'index.number_of_shards': 1,
+ 'index.number_of_replicas': 0
+ } : {})
+ };
+ const response = await targetClient.indices.create({
+ index: targetIndexName,
+ mappings: targetMappings,
+ ...(Object.keys(settings).length > 0 ? {
+ settings
  } : {})
  });
- if (verbose) console.log('Created target mapping', resp);
+ logger.info({
+ targetIndexName,
+ response
+ }, 'Created target mapping');
  }
  } catch (err) {
- console.log('Error creating target mapping', err);
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error creating target mapping');
  }
  }
  };
  }
 
- function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
- function startIndex(files) {
- let finished = false;
- const file = files.shift();
- const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
+ const options = {
+ bom: true,
+ columns: true,
+ trim: true,
+ skip_empty_lines: true,
+ ...csvOptions
+ };
+ const consumesHeader = options.columns === true || typeof options.columns === 'function';
+ if (skipHeader && !consumesHeader && typeof options.from_line === 'undefined') {
+ options.from_line = 2;
+ }
+ return options;
+ }
+
+ function createPauseWaiter$1(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed, file) {
+ const context = {
+ fileName: file
+ };
+ const doc = typeof transform === 'function' ? transform(parsed, context) : parsed;
+
+ // if doc is null/undefined we'll skip indexing it
+ if (doc === null || typeof doc === 'undefined') {
+ return;
+ }
+
+ // the transform callback may return an array of docs so we can emit
+ // multiple docs from a single line
+ if (Array.isArray(doc)) {
+ doc.forEach(d => {
+ if (d === null || typeof d === 'undefined') return;
+ indexer.add(d);
+ });
+ return;
+ }
+ indexer.add(doc);
+ }
+ async function processParquetFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ const reader = await parquet.ParquetReader.openFile(file);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ try {
+ const reader = await arrow__namespace.RecordBatchReader.from(fs.createReadStream(file));
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ }
+ }
+ function processStreamFile(file, buildStream, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildStream();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info({
+ file
+ }, 'Read entire file');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err,
+ file
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processNdjsonFile(file) {
+ let skippedHeader = false;
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
  return;
  }
- const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
-
- // if doc is undefined we'll skip indexing it
- if (typeof doc === 'undefined') {
- s.resume();
- return;
- }
-
- // the transform callback may return an array of docs so we can emit
- // multiple docs from a single line
- if (Array.isArray(doc)) {
- doc.forEach(d => indexer.add(d));
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
  return;
  }
- indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process NDJSON line');
  }
  }).on('error', err => {
- console.log('Error while reading file.', err);
- }).on('end', () => {
- if (verbose) console.log('Read entire file: ', file);
- if (files.length > 0) {
- startIndex(files);
- return;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading file');
+ })), 'Error while reading file');
+ }
+ function processCsvFile(file) {
+ const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
+ try {
+ addParsedDoc(record, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process CSV record');
+ }
+ }).on('error', err => {
+ logger.error({
+ err,
+ file
+ }, 'Error while reading CSV file');
+ })), 'Error while reading CSV file');
+ }
+ async function processFile(file) {
+ if (sourceFormat === 'csv') {
+ await processCsvFile(file);
+ return;
+ }
+ if (sourceFormat === 'ndjson') {
+ await processNdjsonFile(file);
+ return;
+ }
+ if (sourceFormat === 'parquet') {
+ await processParquetFile(file);
+ return;
+ }
+ if (sourceFormat === 'arrow') {
+ await processArrowFile(file);
+ return;
+ }
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ async function startIndex(files) {
+ if (files.length === 0) {
+ indexer.finish();
+ return;
+ }
+ try {
+ for (const file of files) {
+ // eslint-disable-next-line no-await-in-loop
+ await processFile(file);
  }
+ } catch (err) {
+ logger.error({
+ err,
+ files
+ }, 'Error while processing files');
+ } finally {
  indexer.finish();
- finished = true;
- }));
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  try {
  const files = glob.globSync(fileName);
  startIndex(files);
- } catch (error) {
- console.log('Error matching files:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ fileName
+ }, 'Error matching files');
+ indexer.finish();
  }
  };
  }
 
  const EventEmitter = require('events');
- const queueEmitter = new EventEmitter();
  const parallelCalls = 5;
 
  // a simple helper queue to bulk index documents
@@ -165,21 +412,20 @@ function indexQueueFactory({
  targetClient: client,
  targetIndexName,
  bufferSize = DEFAULT_BUFFER_SIZE,
- skipHeader = false
+ logger
  }) {
+ const queueEmitter = new EventEmitter();
  let docsPerSecond = 0;
  const flushBytes = bufferSize * 1024; // Convert KB to Bytes
  const highWaterMark = flushBytes * parallelCalls;
 
- // Create a Readable stream
- const stream$1 = new stream.Readable({
- read() {},
- // Implement read but we manage pushing manually
+ // Create a PassThrough stream (readable + writable) for proper backpressure
+ const stream$1 = new stream.PassThrough({
  highWaterMark // Buffer size for backpressure management
  });
  async function* ndjsonStreamIterator(readableStream) {
  let buffer = ''; // To hold the incomplete data
- let skippedHeader = false;
+
  try {
  // Iterate over the stream using async iteration
  for await (const chunk of readableStream) {
@@ -193,16 +439,15 @@ function indexQueueFactory({
 
  // Yield each complete JSON object
  for (const line of lines) {
- if (line.trim()) {
- try {
- if (!skipHeader || skipHeader && !skippedHeader) {
- yield JSON.parse(line); // Parse and yield the JSON object
- skippedHeader = true;
- }
- } catch (err) {
- // Handle JSON parse errors if necessary
- console.error('Failed to parse JSON:', err);
- }
+ if (!line.trim()) {
+ continue;
+ }
+ try {
+ yield JSON.parse(line); // Parse and yield the JSON object
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to parse JSON from NDJSON stream');
  }
  }
  }
@@ -212,7 +457,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(buffer);
  } catch (err) {
- console.error('Failed to parse final JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse final JSON from NDJSON stream');
  }
  }
  } finally {
@@ -238,7 +485,7 @@ function indexQueueFactory({
  flushInterval: 1000,
  refreshOnCompletion: true,
  datasource: ndjsonStreamIterator(stream$1),
- onDocument(doc) {
+ onDocument() {
  docsPerSecond++;
  return {
  index: {
@@ -247,9 +494,13 @@ function indexQueueFactory({
  };
  }
  });
- } catch (error) {
- console.error('Error during bulk indexing:', error);
- throw error;
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error during bulk indexing');
+ queueEmitter.emit('error', err);
+ throw err;
  } finally {
  // Clean up interval
  clearInterval(interval);
@@ -278,7 +529,7 @@ function indexQueueFactory({
  if (finished) {
  throw new Error('Unexpected doc added after indexer should finish.');
  }
- const canContinue = stream$1.push(`${JSON.stringify(doc)}\n`);
+ const canContinue = stream$1.write(`${JSON.stringify(doc)}\n`);
  if (!canContinue) {
  queueEmitter.emit('pause');
 
@@ -291,7 +542,7 @@ function indexQueueFactory({
  },
  finish: () => {
  finished = true;
- stream$1.push(null);
+ stream$1.end();
  },
  queueEmitter
  };
@@ -299,7 +550,7 @@ function indexQueueFactory({
 
  // create a new progress bar instance and use shades_classic theme
  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
- function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
  return async function indexReader() {
  let docsNum = 0;
  let scrollId;
@@ -318,8 +569,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  maxRetries: 0
  });
  return Object.keys(response.fields);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Failed to fetch populated fields');
  }
  }
  function search(fields) {
@@ -363,8 +617,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  return;
  }
  indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process source index document');
  }
  }
  async function fetchNextResponse() {
@@ -408,48 +664,339 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  };
  }
 
- function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
- function startIndex() {
- let finished = false;
- const s = stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ const DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES = 100000;
+ const DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE = 1000;
+ function readSample(filePath, sampleBytes) {
+ const fd = fs.openSync(filePath, 'r');
+ try {
+ const buffer = Buffer.alloc(sampleBytes);
+ const bytesRead = fs.readSync(fd, buffer, 0, sampleBytes, 0);
+ return buffer.subarray(0, bytesRead).toString('utf8');
+ } finally {
+ fs.closeSync(fd);
+ }
+ }
+ function emptyInferenceResult(mappings) {
+ return {
+ mappings,
+ ingestPipeline: undefined
+ };
+ }
+ async function inferMappingsFromSource({
+ targetClient,
+ fileName,
+ sourceFormat,
+ csvOptions,
+ skipHeader,
+ mappings,
+ inferMappings,
+ inferMappingsOptions,
+ logger
+ }) {
+ if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
+ return emptyInferenceResult(mappings);
+ }
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+ logger.info({
+ sourceFormat
+ }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+ return emptyInferenceResult(mappings);
+ }
+ if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
+ return emptyInferenceResult(mappings);
+ }
+ const files = glob.globSync(fileName);
+ if (files.length === 0) {
+ logger.info({
+ fileName
+ }, 'No files matched for mapping inference');
+ return emptyInferenceResult(mappings);
+ }
+ const {
+ sampleBytes = DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES,
+ ...requestParams
+ } = inferMappingsOptions || {};
+ const sampleText = readSample(files[0], sampleBytes);
+ if (!sampleText || sampleText.trim() === '') {
+ logger.info('Skipping mapping inference because the sample text is empty');
+ return emptyInferenceResult(mappings);
+ }
+ const params = {
+ body: sampleText,
+ lines_to_sample: DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE,
+ ...requestParams
+ };
+ if (typeof params.format === 'undefined') {
+ params.format = sourceFormat === 'csv' ? 'delimited' : 'ndjson';
+ }
+ if (sourceFormat === 'csv') {
+ if (typeof params.delimiter === 'undefined' && typeof csvOptions?.delimiter === 'string') {
+ params.delimiter = csvOptions.delimiter;
+ }
+ if (typeof params.quote === 'undefined' && typeof csvOptions?.quote === 'string') {
+ params.quote = csvOptions.quote;
+ }
+ if (typeof params.has_header_row === 'undefined' && typeof csvOptions?.columns === 'boolean') {
+ params.has_header_row = csvOptions.columns;
+ }
+ if (typeof params.has_header_row === 'undefined' && skipHeader) {
+ params.has_header_row = true;
+ }
+ }
+ try {
+ const response = await targetClient.textStructure.findStructure(params);
+ if (response?.mappings) {
+ logger.info({
+ file: files[0]
+ }, 'Inferred mappings via _text_structure/find_structure');
+ }
+ if (response?.ingest_pipeline) {
+ logger.info('Inferred ingest pipeline via _text_structure/find_structure');
+ }
+ return {
+ mappings: response?.mappings || mappings,
+ ingestPipeline: response?.ingest_pipeline
+ };
+ } catch (err) {
+ logger.warn({
+ err
+ }, 'Could not infer mappings via _text_structure/find_structure');
+ return emptyInferenceResult(mappings);
+ }
+ }
+
+ const DEFAULT_LOG_LEVEL = 'info';
+ function resolveLogLevel(verbose = true) {
+ if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+ return process.env.LOG_LEVEL;
+ }
+ return verbose ? DEFAULT_LOG_LEVEL : 'error';
+ }
+ function createLogger({
+ logger,
+ verbose = true
+ } = {}) {
+ if (logger && typeof logger === 'object') {
+ return logger;
+ }
+ return pino({
+ name: 'node-es-transformer',
+ level: resolveLogLevel(verbose),
+ timestamp: pino.stdTimeFunctions.isoTime,
+ serializers: {
+ err: pino.stdSerializers.err,
+ error: pino.stdSerializers.err
+ }
+ });
+ }
+ function createChildLogger(logger, bindings) {
+ if (!logger || typeof logger.child !== 'function') {
+ return logger;
+ }
+ return logger.child(bindings);
+ }
+
+ function createPauseWaiter(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ async function readStreamToBuffer(stream) {
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+ }
+ return Buffer.concat(chunks);
+ }
+ function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed) {
+ const doc = typeof transform === 'function' ? transform(parsed) : parsed;
+
+ // if doc is null/undefined we'll skip indexing it
+ if (doc === null || typeof doc === 'undefined') {
+ return;
+ }
+
+ // the transform callback may return an array of docs so we can emit
+ // multiple docs from a single line
+ if (Array.isArray(doc)) {
+ doc.forEach(d => {
+ if (d === null || typeof d === 'undefined') return;
+ indexer.add(d);
+ });
+ return;
+ }
+ indexer.add(doc);
+ }
+ async function processParquetStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ const parquetBuffer = await readStreamToBuffer(stream);
+ const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ try {
+ const reader = await arrow__namespace.RecordBatchReader.from(stream);
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ }
+ }
+ function processPipeline(buildPipeline, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildPipeline();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info('Read entire stream');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processCsvStream() {
+ return processPipeline(() => stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
+ try {
+ addParsedDoc(record);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process CSV stream record');
+ }
+ }).on('error', err => {
+ logger.error({
+ err
+ }, 'Error while reading CSV stream');
+ })), 'Error while reading CSV stream');
+ }
+ function processNdjsonStream() {
+ let skippedHeader = false;
+ return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
  return;
  }
- const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
-
- // if doc is undefined we'll skip indexing it
- if (typeof doc === 'undefined') {
- s.resume();
- return;
- }
-
- // the transform callback may return an array of docs so we can emit
- // multiple docs from a single line
- if (Array.isArray(doc)) {
- doc.forEach(d => indexer.add(d));
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
  return;
  }
- indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process NDJSON stream line');
  }
  }).on('error', err => {
- console.log('Error while reading stream.', err);
- }).on('end', () => {
- if (verbose) console.log('Read entire stream.');
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ })), 'Error while reading stream');
+ }
+ async function startIndex() {
+ try {
+ if (sourceFormat === 'csv') {
+ await processCsvStream();
+ } else if (sourceFormat === 'ndjson') {
+ await processNdjsonStream();
+ } else if (sourceFormat === 'parquet') {
+ await processParquetStream();
+ } else if (sourceFormat === 'arrow') {
+ await processArrowStream();
+ } else {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ } finally {
  indexer.finish();
- finished = true;
- }));
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  startIndex();
@@ -530,22 +1077,31 @@ async function transformer({
  searchSize = DEFAULT_SEARCH_SIZE,
  stream,
  fileName,
+ sourceFormat = 'ndjson',
+ csvOptions = {},
  splitRegex = /\n/,
  sourceIndexName,
  targetIndexName,
  mappings,
  mappingsOverride = false,
+ inferMappings = false,
+ inferMappingsOptions = {},
  indexMappingTotalFieldsLimit,
  pipeline,
  populatedFields = false,
  query,
  skipHeader = false,
  transform,
- verbose = true
+ verbose = true,
+ logger: loggerInput
  }) {
  if (typeof targetIndexName === 'undefined') {
  throw Error('targetIndexName must be specified.');
  }
+ const logger = createLogger({
+ logger: loggerInput,
+ verbose
+ });
  const defaultClientConfig = {
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
  };
@@ -553,23 +1109,47 @@ async function transformer({
  // Support both old (config) and new (client instance) patterns
  const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
  const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
+ const inferenceResult = await inferMappingsFromSource({
+ targetClient,
+ fileName,
+ sourceFormat,
+ csvOptions,
+ skipHeader,
+ mappings,
+ inferMappings,
+ inferMappingsOptions,
+ logger: createChildLogger(logger, {
+ component: 'mapping-inference'
+ })
+ });
  const createMapping = createMappingFactory({
  sourceClient,
  sourceIndexName,
  targetClient,
  targetIndexName,
- mappings,
+ mappings: inferenceResult.mappings,
+ inferredIngestPipeline: inferenceResult.ingestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger: createChildLogger(logger, {
+ component: 'create-mapping'
+ })
  });
  const indexer = indexQueueFactory({
  targetClient,
  targetIndexName,
  bufferSize,
- skipHeader});
+ logger: createChildLogger(logger, {
+ component: 'index-queue'
+ })
+ });
+ function validateSourceFormat() {
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
+ }
+ }
  function getReader() {
  if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
  throw Error('Only either one of fileName or sourceIndexName can be specified.');
@@ -578,17 +1158,28 @@ async function transformer({
  throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
  }
  if (typeof fileName !== 'undefined') {
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose);
+ validateSourceFormat();
+ return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'file-reader'
+ }));
  }
  if (typeof sourceIndexName !== 'undefined') {
- return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+ return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+ component: 'index-reader'
+ }));
  }
  if (typeof stream !== 'undefined') {
- return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
+ validateSourceFormat();
+ return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'stream-reader'
+ }));
  }
  return null;
  }
  const reader = getReader();
+ if (typeof reader !== 'function') {
+ throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+ }
  try {
  const indexExists = await targetClient.indices.exists({
  index: targetIndexName
@@ -605,8 +1196,11 @@ async function transformer({
  } else {
  reader();
  }
- } catch (error) {
- console.error('Error checking index existence:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error checking index existence');
  } finally {
  // targetClient.close();
  }
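
Based on the options visible in this diff (sourceFormat, csvOptions, inferMappings, skipHeader, transform, logger are the notable additions in 1.2.0), a minimal usage sketch might look like the following. This is an assumption drawn only from the parameter names above, not from the package README; the import form and defaults should be verified against the published documentation.

// Hypothetical usage sketch for node-es-transformer 1.2.0,
// assuming the package's default export is the transformer() function shown in this diff.
const transformer = require('node-es-transformer');

transformer({
  fileName: 'data/*.csv',           // glob pattern; files are resolved via glob.globSync
  sourceFormat: 'csv',              // 'ndjson' | 'csv' | 'parquet' | 'arrow' per validateSourceFormat()
  csvOptions: { delimiter: ';' },   // passed through to csv-parse (see getCsvParserOptions)
  targetIndexName: 'my-index',
  inferMappings: true,              // infer mappings via _text_structure/find_structure when none are given
  transform: doc => doc,            // may return a doc, an array of docs, or null/undefined to skip
  verbose: true,                    // maps to the pino log level when no logger is passed in
});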