node-es-transformer 1.1.0 → 1.2.0

@@ -1,5 +1,7 @@
  import elasticsearch9 from 'es9';
  import elasticsearch8 from 'es8';
+ import parquet from '@dsnp/parquetjs';
+ import * as arrow from 'apache-arrow';
  import fs from 'fs';
  import { parse } from 'csv-parse';
  import es from 'event-stream';
@@ -7,6 +9,7 @@ import { globSync } from 'glob';
  import split from 'split2';
  import { PassThrough } from 'stream';
  import cliProgress from 'cli-progress';
+ import pino from 'pino';

  // In earlier versions this was used to set the number of docs to index in a
  // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -27,9 +30,9 @@ function createMappingFactory({
  inferredIngestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger
  }) {
  return async () => {
  let targetMappings = mappingsOverride ? undefined : mappings;
@@ -48,7 +51,10 @@ function createMappingFactory({
  }
  }
  } catch (err) {
- console.log('Error reading source mapping', err);
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Error reading source mapping');
  return;
  }
  }
@@ -80,9 +86,14 @@ function createMappingFactory({
  ...inferredIngestPipeline
  });
  defaultPipeline = inferredPipelineName;
- if (verbose) console.log(`Created inferred ingest pipeline ${inferredPipelineName}`);
+ logger.info({
+ inferredPipelineName
+ }, 'Created inferred ingest pipeline');
  } catch (err) {
- console.log('Error creating inferred ingest pipeline', err);
+ logger.error({
+ err,
+ inferredPipelineName
+ }, 'Error creating inferred ingest pipeline');
  }
  }
  const settings = {
@@ -95,17 +106,23 @@ function createMappingFactory({
  'index.number_of_replicas': 0
  } : {})
  };
- const resp = await targetClient.indices.create({
+ const response = await targetClient.indices.create({
  index: targetIndexName,
  mappings: targetMappings,
  ...(Object.keys(settings).length > 0 ? {
  settings
  } : {})
  });
- if (verbose) console.log('Created target mapping', resp);
+ logger.info({
+ targetIndexName,
+ response
+ }, 'Created target mapping');
  }
  } catch (err) {
- console.log('Error creating target mapping', err);
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error creating target mapping');
  }
  }
  };
@@ -126,8 +143,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
  return options;
  }

- function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
- function addParsedDoc(parsed, file, streamRef) {
+ function createPauseWaiter$1(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed, file) {
  const context = {
  fileName: file
  };
@@ -135,7 +180,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk

  // if doc is null/undefined we'll skip indexing it
  if (doc === null || typeof doc === 'undefined') {
- streamRef.resume();
  return;
  }

@@ -150,9 +194,101 @@
  }
  indexer.add(doc);
  }
- function createNdjsonReader(file) {
+ async function processParquetFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ const reader = await parquet.ParquetReader.openFile(file);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ try {
+ const reader = await arrow.RecordBatchReader.from(fs.createReadStream(file));
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ }
+ }
+ function processStreamFile(file, buildStream, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildStream();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info({
+ file
+ }, 'Read entire file');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err,
+ file
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processNdjsonFile(file) {
  let skippedHeader = false;
- const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
@@ -163,60 +299,86 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
  return;
  }
  const parsed = JSON.parse(line);
- addParsedDoc(parsed, file, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(parsed, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process NDJSON line');
  }
  }).on('error', err => {
- console.log('Error while reading file.', err);
- }));
- return s;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading file');
+ })), 'Error while reading file');
  }
- function createCsvReader(file) {
+ function processCsvFile(file) {
  const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
- const s = fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
  try {
- addParsedDoc(record, file, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(record, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process CSV record');
  }
  }).on('error', err => {
- console.log('Error while reading CSV file.', err);
- }));
- return s;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading CSV file');
+ })), 'Error while reading CSV file');
  }
- function startIndex(files) {
- let finished = false;
+ async function processFile(file) {
+ if (sourceFormat === 'csv') {
+ await processCsvFile(file);
+ return;
+ }
+ if (sourceFormat === 'ndjson') {
+ await processNdjsonFile(file);
+ return;
+ }
+ if (sourceFormat === 'parquet') {
+ await processParquetFile(file);
+ return;
+ }
+ if (sourceFormat === 'arrow') {
+ await processArrowFile(file);
+ return;
+ }
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ async function startIndex(files) {
  if (files.length === 0) {
  indexer.finish();
  return;
  }
- const file = files.shift();
- const s = sourceFormat === 'csv' ? createCsvReader(file) : createNdjsonReader(file);
- s.on('end', () => {
- if (verbose) console.log('Read entire file: ', file);
- if (files.length > 0) {
- startIndex(files);
- return;
+ try {
+ for (const file of files) {
+ // eslint-disable-next-line no-await-in-loop
+ await processFile(file);
  }
+ } catch (err) {
+ logger.error({
+ err,
+ files
+ }, 'Error while processing files');
+ } finally {
  indexer.finish();
- finished = true;
- });
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  try {
  const files = globSync(fileName);
  startIndex(files);
- } catch (error) {
- console.log('Error matching files:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ fileName
+ }, 'Error matching files');
+ indexer.finish();
  }
  };
  }
@@ -228,7 +390,8 @@ const parallelCalls = 5;
  function indexQueueFactory({
  targetClient: client,
  targetIndexName,
- bufferSize = DEFAULT_BUFFER_SIZE
+ bufferSize = DEFAULT_BUFFER_SIZE,
+ logger
  }) {
  const queueEmitter = new EventEmitter();
  let docsPerSecond = 0;
@@ -261,8 +424,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(line); // Parse and yield the JSON object
  } catch (err) {
- // Handle JSON parse errors if necessary
- console.error('Failed to parse JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse JSON from NDJSON stream');
  }
  }
  }
@@ -272,7 +436,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(buffer);
  } catch (err) {
- console.error('Failed to parse final JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse final JSON from NDJSON stream');
  }
  }
  } finally {
@@ -298,7 +464,7 @@ function indexQueueFactory({
  flushInterval: 1000,
  refreshOnCompletion: true,
  datasource: ndjsonStreamIterator(stream),
- onDocument(doc) {
+ onDocument() {
  docsPerSecond++;
  return {
  index: {
@@ -307,9 +473,13 @@ function indexQueueFactory({
  };
  }
  });
- } catch (error) {
- console.error('Error during bulk indexing:', error);
- throw error;
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error during bulk indexing');
+ queueEmitter.emit('error', err);
+ throw err;
  } finally {
  // Clean up interval
  clearInterval(interval);
@@ -359,7 +529,7 @@ function indexQueueFactory({

  // create a new progress bar instance and use shades_classic theme
  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
- function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
  return async function indexReader() {
  let docsNum = 0;
  let scrollId;
@@ -378,8 +548,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  maxRetries: 0
  });
  return Object.keys(response.fields);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Failed to fetch populated fields');
  }
  }
  function search(fields) {
@@ -423,8 +596,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  return;
  }
  indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process source index document');
  }
  }
  async function fetchNextResponse() {
@@ -495,17 +670,25 @@ async function inferMappingsFromSource({
  mappings,
  inferMappings,
  inferMappingsOptions,
- verbose
+ logger
  }) {
  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
  return emptyInferenceResult(mappings);
  }
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+ logger.info({
+ sourceFormat
+ }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+ return emptyInferenceResult(mappings);
+ }
  if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
  return emptyInferenceResult(mappings);
  }
  const files = globSync(fileName);
  if (files.length === 0) {
- if (verbose) console.log(`No files matched for mapping inference: ${fileName}`);
+ logger.info({
+ fileName
+ }, 'No files matched for mapping inference');
  return emptyInferenceResult(mappings);
  }
  const {
@@ -514,7 +697,7 @@ async function inferMappingsFromSource({
  } = inferMappingsOptions || {};
  const sampleText = readSample(files[0], sampleBytes);
  if (!sampleText || sampleText.trim() === '') {
- if (verbose) console.log('Skipping mapping inference because the sample text is empty.');
+ logger.info('Skipping mapping inference because the sample text is empty');
  return emptyInferenceResult(mappings);
  }
  const params = {
@@ -541,31 +724,98 @@
  }
  try {
  const response = await targetClient.textStructure.findStructure(params);
- if (response?.mappings && verbose) {
- console.log(`Inferred mappings via _text_structure/find_structure from ${files[0]}`);
+ if (response?.mappings) {
+ logger.info({
+ file: files[0]
+ }, 'Inferred mappings via _text_structure/find_structure');
  }
- if (response?.ingest_pipeline && verbose) {
- console.log('Inferred ingest pipeline via _text_structure/find_structure');
+ if (response?.ingest_pipeline) {
+ logger.info('Inferred ingest pipeline via _text_structure/find_structure');
  }
  return {
  mappings: response?.mappings || mappings,
  ingestPipeline: response?.ingest_pipeline
  };
- } catch (error) {
- if (verbose) {
- console.log('Could not infer mappings via _text_structure/find_structure:', error.message);
- }
+ } catch (err) {
+ logger.warn({
+ err
+ }, 'Could not infer mappings via _text_structure/find_structure');
  return emptyInferenceResult(mappings);
  }
  }

- function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
- function addParsedDoc(parsed, streamRef) {
+ const DEFAULT_LOG_LEVEL = 'info';
+ function resolveLogLevel(verbose = true) {
+ if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+ return process.env.LOG_LEVEL;
+ }
+ return verbose ? DEFAULT_LOG_LEVEL : 'error';
+ }
+ function createLogger({
+ logger,
+ verbose = true
+ } = {}) {
+ if (logger && typeof logger === 'object') {
+ return logger;
+ }
+ return pino({
+ name: 'node-es-transformer',
+ level: resolveLogLevel(verbose),
+ timestamp: pino.stdTimeFunctions.isoTime,
+ serializers: {
+ err: pino.stdSerializers.err,
+ error: pino.stdSerializers.err
+ }
+ });
+ }
+ function createChildLogger(logger, bindings) {
+ if (!logger || typeof logger.child !== 'function') {
+ return logger;
+ }
+ return logger.child(bindings);
+ }
+
+ function createPauseWaiter(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ async function readStreamToBuffer(stream) {
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+ }
+ return Buffer.concat(chunks);
+ }
+ function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed) {
  const doc = typeof transform === 'function' ? transform(parsed) : parsed;

  // if doc is null/undefined we'll skip indexing it
  if (doc === null || typeof doc === 'undefined') {
- streamRef.resume();
  return;
  }

@@ -580,50 +830,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
  }
  indexer.add(doc);
  }
- function startIndex() {
- let finished = false;
- const s = sourceFormat === 'csv' ? stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
+ async function processParquetStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ const parquetBuffer = await readStreamToBuffer(stream);
+ const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ try {
+ const reader = await arrow.RecordBatchReader.from(stream);
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ }
+ }
+ function processPipeline(buildPipeline, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildPipeline();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info('Read entire stream');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processCsvStream() {
+ return processPipeline(() => stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
  try {
- addParsedDoc(record, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(record);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process CSV stream record');
  }
  }).on('error', err => {
- console.log('Error while reading CSV stream.', err);
- })) : (() => {
- let skippedHeader = false;
- return stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
- try {
- // skip empty lines
- if (line === '') {
- return;
- }
- if (skipHeader && !skippedHeader) {
- skippedHeader = true;
- return;
- }
- const parsed = JSON.parse(line);
- addParsedDoc(parsed, s);
- } catch (e) {
- console.log('error', e);
+ logger.error({
+ err
+ }, 'Error while reading CSV stream');
+ })), 'Error while reading CSV stream');
+ }
+ function processNdjsonStream() {
+ let skippedHeader = false;
+ return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ try {
+ // skip empty lines
+ if (line === '') {
+ return;
  }
- }).on('error', err => {
- console.log('Error while reading stream.', err);
- }));
- })();
- s.on('end', () => {
- if (verbose) console.log('Read entire stream.');
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
+ return;
+ }
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process NDJSON stream line');
+ }
+ }).on('error', err => {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ })), 'Error while reading stream');
+ }
+ async function startIndex() {
+ try {
+ if (sourceFormat === 'csv') {
+ await processCsvStream();
+ } else if (sourceFormat === 'ndjson') {
+ await processNdjsonStream();
+ } else if (sourceFormat === 'parquet') {
+ await processParquetStream();
+ } else if (sourceFormat === 'arrow') {
+ await processArrowStream();
+ } else {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ } finally {
  indexer.finish();
- finished = true;
- });
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  startIndex();
@@ -719,11 +1071,16 @@ async function transformer({
  query,
  skipHeader = false,
  transform,
- verbose = true
+ verbose = true,
+ logger: loggerInput
  }) {
  if (typeof targetIndexName === 'undefined') {
  throw Error('targetIndexName must be specified.');
  }
+ const logger = createLogger({
+ logger: loggerInput,
+ verbose
+ });
  const defaultClientConfig = {
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
  };
@@ -740,7 +1097,9 @@ async function transformer({
  mappings,
  inferMappings,
  inferMappingsOptions,
- verbose
+ logger: createChildLogger(logger, {
+ component: 'mapping-inference'
+ })
  });
  const createMapping = createMappingFactory({
  sourceClient,
@@ -751,17 +1110,23 @@ async function transformer({
  inferredIngestPipeline: inferenceResult.ingestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger: createChildLogger(logger, {
+ component: 'create-mapping'
+ })
  });
  const indexer = indexQueueFactory({
  targetClient,
  targetIndexName,
- bufferSize});
+ bufferSize,
+ logger: createChildLogger(logger, {
+ component: 'index-queue'
+ })
+ });
  function validateSourceFormat() {
- if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
- throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "csv".`);
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
  }
  }
  function getReader() {
@@ -773,18 +1138,27 @@ async function transformer({
  }
  if (typeof fileName !== 'undefined') {
  validateSourceFormat();
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+ return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'file-reader'
+ }));
  }
  if (typeof sourceIndexName !== 'undefined') {
- return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+ return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+ component: 'index-reader'
+ }));
  }
  if (typeof stream !== 'undefined') {
  validateSourceFormat();
- return streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+ return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'stream-reader'
+ }));
  }
  return null;
  }
  const reader = getReader();
+ if (typeof reader !== 'function') {
+ throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+ }
  try {
  const indexExists = await targetClient.indices.exists({
  index: targetIndexName
@@ -801,8 +1175,11 @@ async function transformer({
  } else {
  reader();
  }
- } catch (error) {
- console.error('Error checking index existence:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error checking index existence');
  } finally {
  // targetClient.close();
  }
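
For context, here is a minimal usage sketch of the two features this release adds: the pluggable `logger` option and the new `parquet`/`arrow` source formats. This sketch is not part of the released code; the default export, file glob, and index name are assumptions for illustration.

// Hypothetical example; assumes the package's default export.
import transformer from 'node-es-transformer';
import pino from 'pino';

await transformer({
  fileName: 'data/*.parquet',          // hypothetical source file glob
  targetIndexName: 'my-parquet-index', // hypothetical target index
  sourceFormat: 'parquet',             // new in 1.2.0, alongside 'arrow'
  // Any logger object passed here is used as-is. When omitted, the library
  // creates its own pino logger whose level comes from LOG_LEVEL or falls
  // back to the verbose flag ('info' when true, 'error' when false).
  logger: pino({ level: 'debug' }),
});

If no logger is passed, the internal createLogger() shown in the diff takes over, so existing callers that only set verbose keep their previous behavior.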