node-es-transformer 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@
 
 var elasticsearch9 = require('es9');
 var elasticsearch8 = require('es8');
+var parquet = require('@dsnp/parquetjs');
+var arrow = require('apache-arrow');
 var fs = require('fs');
 var csvParse = require('csv-parse');
 var es = require('event-stream');
@@ -9,6 +11,26 @@ var glob = require('glob');
 var split = require('split2');
 var stream = require('stream');
 var cliProgress = require('cli-progress');
+var pino = require('pino');
+
+function _interopNamespaceDefault(e) {
+  var n = Object.create(null);
+  if (e) {
+    Object.keys(e).forEach(function (k) {
+      if (k !== 'default') {
+        var d = Object.getOwnPropertyDescriptor(e, k);
+        Object.defineProperty(n, k, d.get ? d : {
+          enumerable: true,
+          get: function () { return e[k]; }
+        });
+      }
+    });
+  }
+  n.default = e;
+  return Object.freeze(n);
+}
+
+var arrow__namespace = /*#__PURE__*/_interopNamespaceDefault(arrow);
 
 // In earlier versions this was used to set the number of docs to index in a
 // single bulk request. Since we switched to use the helpers.bulk() method from
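
The `_interopNamespaceDefault` block is standard bundler-generated interop rather than hand-written package logic: it rebuilds an ES-module-style namespace around the CommonJS export of `apache-arrow` so that lookups such as `arrow__namespace.RecordBatchReader` resolve uniformly. A minimal sketch of the observable behavior, assuming a CommonJS `arrow` object (the `ns` binding is illustrative):

    // Sketch: what _interopNamespaceDefault(arrow) yields, per the helper above.
    const ns = _interopNamespaceDefault(arrow);
    console.log(ns.RecordBatchReader === arrow.RecordBatchReader); // true: named exports are re-exposed
    console.log(ns.default === arrow);  // true: the original module is kept as `default`
    console.log(Object.isFrozen(ns));   // true: the namespace object is frozen
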
@@ -29,9 +51,9 @@ function createMappingFactory({
   inferredIngestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
-  verbose,
   deleteIndex,
-  pipeline
+  pipeline,
+  logger
 }) {
   return async () => {
     let targetMappings = mappingsOverride ? undefined : mappings;
@@ -50,7 +72,10 @@ function createMappingFactory({
         }
       }
     } catch (err) {
-      console.log('Error reading source mapping', err);
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Error reading source mapping');
       return;
     }
   }
@@ -82,9 +107,14 @@ function createMappingFactory({
           ...inferredIngestPipeline
         });
         defaultPipeline = inferredPipelineName;
-        if (verbose) console.log(`Created inferred ingest pipeline ${inferredPipelineName}`);
+        logger.info({
+          inferredPipelineName
+        }, 'Created inferred ingest pipeline');
       } catch (err) {
-        console.log('Error creating inferred ingest pipeline', err);
+        logger.error({
+          err,
+          inferredPipelineName
+        }, 'Error creating inferred ingest pipeline');
       }
     }
     const settings = {
@@ -97,17 +127,23 @@ function createMappingFactory({
          'index.number_of_replicas': 0
        } : {})
      };
-      const resp = await targetClient.indices.create({
+      const response = await targetClient.indices.create({
        index: targetIndexName,
        mappings: targetMappings,
        ...(Object.keys(settings).length > 0 ? {
          settings
        } : {})
      });
-      if (verbose) console.log('Created target mapping', resp);
+      logger.info({
+        targetIndexName,
+        response
+      }, 'Created target mapping');
      }
    } catch (err) {
-      console.log('Error creating target mapping', err);
+      logger.error({
+        err,
+        targetIndexName
+      }, 'Error creating target mapping');
    }
  }
 };
@@ -128,8 +164,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
   return options;
 }
 
-function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
-  function addParsedDoc(parsed, file, streamRef) {
+function createPauseWaiter$1(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed, file) {
     const context = {
       fileName: file
     };
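
`createPauseWaiter$1` (the `$1` suffix is just the bundler disambiguating it from the identical `createPauseWaiter` defined later in this file) converts the indexer's `pause`/`resume` events into an awaitable checkpoint, which is what lets the pull-based parquet and arrow readers below apply backpressure without a Node stream to call `pause()` on. A minimal usage sketch, assuming only an EventEmitter that emits `pause` and `resume` the way `indexer.queueEmitter` does; `pump` and `handleRow` are illustrative names:

    // Sketch: cooperative backpressure via the pause waiter.
    const { EventEmitter } = require('events');

    const queueEmitter = new EventEmitter();
    const { waitIfPaused, cleanup } = createPauseWaiter$1(queueEmitter);

    async function pump(rows, handleRow) {
      try {
        for (const row of rows) {
          handleRow(row);        // e.g. indexer.add(row)
          await waitIfPaused();  // parks here between 'pause' and 'resume' events
        }
      } finally {
        cleanup(); // detach listeners and release any still-parked waiters
      }
    }
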
@@ -137,7 +201,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
@@ -152,9 +215,101 @@
     }
     indexer.add(doc);
   }
-  function createNdjsonReader(file) {
+  async function processParquetFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    const reader = await parquet.ParquetReader.openFile(file);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row, file);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    try {
+      const reader = await arrow__namespace.RecordBatchReader.from(fs.createReadStream(file));
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row, file);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+    }
+  }
+  function processStreamFile(file, buildStream, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildStream();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info({
+          file
+        }, 'Read entire file');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err,
+          file
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processNdjsonFile(file) {
     let skippedHeader = false;
-    const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
       try {
         // skip empty lines
         if (line === '') {
@@ -165,60 +320,86 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
           return;
         }
         const parsed = JSON.parse(line);
-        addParsedDoc(parsed, file, s);
-      } catch (e) {
-        console.log('error', e);
+        addParsedDoc(parsed, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process NDJSON line');
       }
     }).on('error', err => {
-      console.log('Error while reading file.', err);
-    }));
-    return s;
+      logger.error({
+        err,
+        file
+      }, 'Error while reading file');
+    })), 'Error while reading file');
   }
-  function createCsvReader(file) {
+  function processCsvFile(file) {
     const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
-    const s = fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record, file, s);
-      } catch (e) {
-        console.log('error', e);
+        addParsedDoc(record, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process CSV record');
       }
     }).on('error', err => {
-      console.log('Error while reading CSV file.', err);
-    }));
-    return s;
+      logger.error({
+        err,
+        file
+      }, 'Error while reading CSV file');
+    })), 'Error while reading CSV file');
   }
-  function startIndex(files) {
-    let finished = false;
+  async function processFile(file) {
+    if (sourceFormat === 'csv') {
+      await processCsvFile(file);
+      return;
+    }
+    if (sourceFormat === 'ndjson') {
+      await processNdjsonFile(file);
+      return;
+    }
+    if (sourceFormat === 'parquet') {
+      await processParquetFile(file);
+      return;
+    }
+    if (sourceFormat === 'arrow') {
+      await processArrowFile(file);
+      return;
+    }
+    throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+  }
+  async function startIndex(files) {
     if (files.length === 0) {
       indexer.finish();
       return;
     }
-    const file = files.shift();
-    const s = sourceFormat === 'csv' ? createCsvReader(file) : createNdjsonReader(file);
-    s.on('end', () => {
-      if (verbose) console.log('Read entire file: ', file);
-      if (files.length > 0) {
-        startIndex(files);
-        return;
+    try {
+      for (const file of files) {
+        // eslint-disable-next-line no-await-in-loop
+        await processFile(file);
       }
+    } catch (err) {
+      logger.error({
+        err,
+        files
+      }, 'Error while processing files');
+    } finally {
       indexer.finish();
-      finished = true;
-    });
-    indexer.queueEmitter.on('pause', () => {
-      if (finished) return;
-      s.pause();
-    });
-    indexer.queueEmitter.on('resume', () => {
-      if (finished) return;
-      s.resume();
-    });
+    }
   }
   return () => {
     try {
       const files = glob.globSync(fileName);
       startIndex(files);
-    } catch (error) {
-      console.log('Error matching files:', error);
+    } catch (err) {
+      logger.error({
+        err,
+        fileName
+      }, 'Error matching files');
+      indexer.finish();
     }
   };
 }
230
411
  function indexQueueFactory({
231
412
  targetClient: client,
232
413
  targetIndexName,
233
- bufferSize = DEFAULT_BUFFER_SIZE
414
+ bufferSize = DEFAULT_BUFFER_SIZE,
415
+ logger
234
416
  }) {
235
417
  const queueEmitter = new EventEmitter();
236
418
  let docsPerSecond = 0;
@@ -263,8 +445,9 @@ function indexQueueFactory({
263
445
  try {
264
446
  yield JSON.parse(line); // Parse and yield the JSON object
265
447
  } catch (err) {
266
- // Handle JSON parse errors if necessary
267
- console.error('Failed to parse JSON:', err);
448
+ logger.error({
449
+ err
450
+ }, 'Failed to parse JSON from NDJSON stream');
268
451
  }
269
452
  }
270
453
  }
@@ -274,7 +457,9 @@ function indexQueueFactory({
274
457
  try {
275
458
  yield JSON.parse(buffer);
276
459
  } catch (err) {
277
- console.error('Failed to parse final JSON:', err);
460
+ logger.error({
461
+ err
462
+ }, 'Failed to parse final JSON from NDJSON stream');
278
463
  }
279
464
  }
280
465
  } finally {
@@ -300,7 +485,7 @@ function indexQueueFactory({
300
485
  flushInterval: 1000,
301
486
  refreshOnCompletion: true,
302
487
  datasource: ndjsonStreamIterator(stream$1),
303
- onDocument(doc) {
488
+ onDocument() {
304
489
  docsPerSecond++;
305
490
  return {
306
491
  index: {
@@ -309,9 +494,13 @@ function indexQueueFactory({
309
494
  };
310
495
  }
311
496
  });
312
- } catch (error) {
313
- console.error('Error during bulk indexing:', error);
314
- throw error;
497
+ } catch (err) {
498
+ logger.error({
499
+ err,
500
+ targetIndexName
501
+ }, 'Error during bulk indexing');
502
+ queueEmitter.emit('error', err);
503
+ throw err;
315
504
  } finally {
316
505
  // Clean up interval
317
506
  clearInterval(interval);
@@ -361,7 +550,7 @@ function indexQueueFactory({
361
550
 
362
551
  // create a new progress bar instance and use shades_classic theme
363
552
  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
364
- function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
553
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
365
554
  return async function indexReader() {
366
555
  let docsNum = 0;
367
556
  let scrollId;
@@ -380,8 +569,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
380
569
  maxRetries: 0
381
570
  });
382
571
  return Object.keys(response.fields);
383
- } catch (e) {
384
- console.log('error', e);
572
+ } catch (err) {
573
+ logger.error({
574
+ err,
575
+ sourceIndexName
576
+ }, 'Failed to fetch populated fields');
385
577
  }
386
578
  }
387
579
  function search(fields) {
@@ -425,8 +617,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
425
617
  return;
426
618
  }
427
619
  indexer.add(doc);
428
- } catch (e) {
429
- console.log('error', e);
620
+ } catch (err) {
621
+ logger.error({
622
+ err
623
+ }, 'Failed to process source index document');
430
624
  }
431
625
  }
432
626
  async function fetchNextResponse() {
@@ -497,17 +691,25 @@ async function inferMappingsFromSource({
497
691
  mappings,
498
692
  inferMappings,
499
693
  inferMappingsOptions,
500
- verbose
694
+ logger
501
695
  }) {
502
696
  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
503
697
  return emptyInferenceResult(mappings);
504
698
  }
699
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
700
+ logger.info({
701
+ sourceFormat
702
+ }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
703
+ return emptyInferenceResult(mappings);
704
+ }
505
705
  if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
506
706
  return emptyInferenceResult(mappings);
507
707
  }
508
708
  const files = glob.globSync(fileName);
509
709
  if (files.length === 0) {
510
- if (verbose) console.log(`No files matched for mapping inference: ${fileName}`);
710
+ logger.info({
711
+ fileName
712
+ }, 'No files matched for mapping inference');
511
713
  return emptyInferenceResult(mappings);
512
714
  }
513
715
  const {
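
Note the new guard above: mapping inference via `_text_structure/find_structure` now runs only for ndjson and csv, so parquet and arrow imports fall back to Elasticsearch's dynamic mappings unless an explicit `mappings` object is supplied. A hedged sketch using only options already present in this file; the field definitions are illustrative:

    // Sketch: supplying explicit mappings for a parquet source,
    // since inference is skipped for the binary formats.
    await transformer({
      fileName: 'data/events.parquet',
      sourceFormat: 'parquet',
      targetIndexName: 'events',
      mappings: {
        properties: {
          '@timestamp': { type: 'date' },
          message: { type: 'text' }
        }
      }
    });
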
@@ -516,7 +718,7 @@
   } = inferMappingsOptions || {};
   const sampleText = readSample(files[0], sampleBytes);
   if (!sampleText || sampleText.trim() === '') {
-    if (verbose) console.log('Skipping mapping inference because the sample text is empty.');
+    logger.info('Skipping mapping inference because the sample text is empty');
     return emptyInferenceResult(mappings);
   }
   const params = {
@@ -543,31 +745,98 @@
   }
   try {
     const response = await targetClient.textStructure.findStructure(params);
-    if (response?.mappings && verbose) {
-      console.log(`Inferred mappings via _text_structure/find_structure from ${files[0]}`);
+    if (response?.mappings) {
+      logger.info({
+        file: files[0]
+      }, 'Inferred mappings via _text_structure/find_structure');
     }
-    if (response?.ingest_pipeline && verbose) {
-      console.log('Inferred ingest pipeline via _text_structure/find_structure');
+    if (response?.ingest_pipeline) {
+      logger.info('Inferred ingest pipeline via _text_structure/find_structure');
     }
     return {
       mappings: response?.mappings || mappings,
       ingestPipeline: response?.ingest_pipeline
     };
-  } catch (error) {
-    if (verbose) {
-      console.log('Could not infer mappings via _text_structure/find_structure:', error.message);
-    }
+  } catch (err) {
+    logger.warn({
+      err
+    }, 'Could not infer mappings via _text_structure/find_structure');
     return emptyInferenceResult(mappings);
   }
 }
 
-function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
-  function addParsedDoc(parsed, streamRef) {
+const DEFAULT_LOG_LEVEL = 'info';
+function resolveLogLevel(verbose = true) {
+  if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+    return process.env.LOG_LEVEL;
+  }
+  return verbose ? DEFAULT_LOG_LEVEL : 'error';
+}
+function createLogger({
+  logger,
+  verbose = true
+} = {}) {
+  if (logger && typeof logger === 'object') {
+    return logger;
+  }
+  return pino({
+    name: 'node-es-transformer',
+    level: resolveLogLevel(verbose),
+    timestamp: pino.stdTimeFunctions.isoTime,
+    serializers: {
+      err: pino.stdSerializers.err,
+      error: pino.stdSerializers.err
+    }
+  });
+}
+function createChildLogger(logger, bindings) {
+  if (!logger || typeof logger.child !== 'function') {
+    return logger;
+  }
+  return logger.child(bindings);
+}
+
+function createPauseWaiter(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+async function readStreamToBuffer(stream) {
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+  }
+  return Buffer.concat(chunks);
+}
+function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed) {
     const doc = typeof transform === 'function' ? transform(parsed) : parsed;
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
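
Logging now flows through pino or through any caller-supplied logger object: `createLogger` returns an injected `logger` as-is, and otherwise builds a pino instance whose level comes from the `LOG_LEVEL` environment variable when set, falling back to `info` when `verbose` is true and `error` when it is false. A configuration sketch; option values are illustrative:

    // Sketch: the three ways the new logger resolution can be driven.
    const pino = require('pino');

    // 1. Default: verbose = true yields a pino logger at 'info'
    //    (or at process.env.LOG_LEVEL when that is set).
    await transformer({ fileName: 'data.ndjson', targetIndexName: 'docs' });

    // 2. Quiet mode: verbose = false drops the level to 'error'.
    await transformer({ fileName: 'data.ndjson', targetIndexName: 'docs', verbose: false });

    // 3. Bring your own: any logger object is used untouched, and
    //    createChildLogger degrades gracefully if it lacks a .child() method.
    await transformer({
      fileName: 'data.ndjson',
      targetIndexName: 'docs',
      logger: pino({ level: 'warn' })
    });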
 
@@ -582,50 +851,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function startIndex() {
-    let finished = false;
-    const s = sourceFormat === 'csv' ? stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
+  async function processParquetStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    const parquetBuffer = await readStreamToBuffer(stream);
+    const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    try {
+      const reader = await arrow__namespace.RecordBatchReader.from(stream);
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+    }
+  }
+  function processPipeline(buildPipeline, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildPipeline();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info('Read entire stream');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processCsvStream() {
+    return processPipeline(() => stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record, s);
-      } catch (e) {
-        console.log('error', e);
+        addParsedDoc(record);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process CSV stream record');
       }
     }).on('error', err => {
-      console.log('Error while reading CSV stream.', err);
-    })) : (() => {
-      let skippedHeader = false;
-      return stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
-        try {
-          // skip empty lines
-          if (line === '') {
-            return;
-          }
-          if (skipHeader && !skippedHeader) {
-            skippedHeader = true;
-            return;
-          }
-          const parsed = JSON.parse(line);
-          addParsedDoc(parsed, s);
-        } catch (e) {
-          console.log('error', e);
+      logger.error({
+        err
+      }, 'Error while reading CSV stream');
+    })), 'Error while reading CSV stream');
+  }
+  function processNdjsonStream() {
+    let skippedHeader = false;
+    return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+      try {
+        // skip empty lines
+        if (line === '') {
+          return;
         }
-      }).on('error', err => {
-        console.log('Error while reading stream.', err);
-      }));
-    })();
-    s.on('end', () => {
-      if (verbose) console.log('Read entire stream.');
+        if (skipHeader && !skippedHeader) {
+          skippedHeader = true;
+          return;
+        }
+        const parsed = JSON.parse(line);
+        addParsedDoc(parsed);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process NDJSON stream line');
+      }
+    }).on('error', err => {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    })), 'Error while reading stream');
+  }
+  async function startIndex() {
+    try {
+      if (sourceFormat === 'csv') {
+        await processCsvStream();
+      } else if (sourceFormat === 'ndjson') {
+        await processNdjsonStream();
+      } else if (sourceFormat === 'parquet') {
+        await processParquetStream();
+      } else if (sourceFormat === 'arrow') {
+        await processArrowStream();
+      } else {
+        throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+      }
+    } catch (err) {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    } finally {
       indexer.finish();
-      finished = true;
-    });
-    indexer.queueEmitter.on('pause', () => {
-      if (finished) return;
-      s.pause();
-    });
-    indexer.queueEmitter.on('resume', () => {
-      if (finished) return;
-      s.resume();
-    });
+    }
   }
   return () => {
     startIndex();
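
The stream reader gains the same four formats. One operational difference worth noting: `processParquetStream` drains the entire stream into memory through `readStreamToBuffer`, because `ParquetReader.openBuffer` needs random access, while arrow record batches are consumed incrementally as they arrive. A sketch of stream-based arrow ingestion, assuming the `stream` option handled by `getReader` below; the path is illustrative:

    // Sketch: feeding an Arrow IPC stream straight into the indexer.
    const fs = require('fs');

    await transformer({
      stream: fs.createReadStream('data/events.arrow'),
      sourceFormat: 'arrow',
      targetIndexName: 'events'
    });
    // Parquet also works over a stream, but it is fully buffered before
    // reading, so prefer fileName for very large parquet inputs.
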
@@ -721,11 +1092,16 @@ async function transformer({
   query,
   skipHeader = false,
   transform,
-  verbose = true
+  verbose = true,
+  logger: loggerInput
 }) {
   if (typeof targetIndexName === 'undefined') {
     throw Error('targetIndexName must be specified.');
   }
+  const logger = createLogger({
+    logger: loggerInput,
+    verbose
+  });
   const defaultClientConfig = {
     node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
   };
@@ -742,7 +1118,9 @@ async function transformer({
     mappings,
     inferMappings,
     inferMappingsOptions,
-    verbose
+    logger: createChildLogger(logger, {
+      component: 'mapping-inference'
+    })
   });
   const createMapping = createMappingFactory({
     sourceClient,
@@ -753,17 +1131,23 @@ async function transformer({
     inferredIngestPipeline: inferenceResult.ingestPipeline,
     mappingsOverride,
     indexMappingTotalFieldsLimit,
-    verbose,
     deleteIndex,
-    pipeline
+    pipeline,
+    logger: createChildLogger(logger, {
+      component: 'create-mapping'
+    })
   });
   const indexer = indexQueueFactory({
     targetClient,
     targetIndexName,
-    bufferSize});
+    bufferSize,
+    logger: createChildLogger(logger, {
+      component: 'index-queue'
+    })
+  });
   function validateSourceFormat() {
-    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
-      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "csv".`);
+    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
     }
   }
   function getReader() {
@@ -775,18 +1159,27 @@ async function transformer({
     }
     if (typeof fileName !== 'undefined') {
       validateSourceFormat();
-      return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+      return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+        component: 'file-reader'
+      }));
     }
     if (typeof sourceIndexName !== 'undefined') {
-      return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+      return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+        component: 'index-reader'
+      }));
     }
     if (typeof stream !== 'undefined') {
       validateSourceFormat();
-      return streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+      return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+        component: 'stream-reader'
+      }));
    }
    return null;
  }
  const reader = getReader();
+  if (typeof reader !== 'function') {
+    throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+  }
   try {
     const indexExists = await targetClient.indices.exists({
       index: targetIndexName
@@ -803,8 +1196,11 @@ async function transformer({
     } else {
       reader();
     }
-  } catch (error) {
-    console.error('Error checking index existence:', error);
+  } catch (err) {
+    logger.error({
+      err,
+      targetIndexName
+    }, 'Error checking index existence');
   } finally {
     // targetClient.close();
   }