node-es-transformer 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,10 @@
 
  var elasticsearch9 = require('es9');
  var elasticsearch8 = require('es8');
+ var parquet = require('@dsnp/parquetjs');
+ var zlib = require('zlib');
+ var compression_js = require('@dsnp/parquetjs/dist/lib/compression.js');
+ var arrow = require('apache-arrow');
  var fs = require('fs');
  var csvParse = require('csv-parse');
  var es = require('event-stream');
@@ -9,6 +13,26 @@ var glob = require('glob');
  var split = require('split2');
  var stream = require('stream');
  var cliProgress = require('cli-progress');
+ var pino = require('pino');
+
+ function _interopNamespaceDefault(e) {
+ var n = Object.create(null);
+ if (e) {
+ Object.keys(e).forEach(function (k) {
+ if (k !== 'default') {
+ var d = Object.getOwnPropertyDescriptor(e, k);
+ Object.defineProperty(n, k, d.get ? d : {
+ enumerable: true,
+ get: function () { return e[k]; }
+ });
+ }
+ });
+ }
+ n.default = e;
+ return Object.freeze(n);
+ }
+
+ var arrow__namespace = /*#__PURE__*/_interopNamespaceDefault(arrow);
 
  // In earlier versions this was used to set the number of docs to index in a
  // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -29,9 +53,9 @@ function createMappingFactory({
  inferredIngestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger
  }) {
  return async () => {
  let targetMappings = mappingsOverride ? undefined : mappings;
@@ -50,7 +74,10 @@ function createMappingFactory({
  }
  }
  } catch (err) {
- console.log('Error reading source mapping', err);
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Error reading source mapping');
  return;
  }
  }
@@ -82,9 +109,14 @@ function createMappingFactory({
  ...inferredIngestPipeline
  });
  defaultPipeline = inferredPipelineName;
- if (verbose) console.log(`Created inferred ingest pipeline ${inferredPipelineName}`);
+ logger.info({
+ inferredPipelineName
+ }, 'Created inferred ingest pipeline');
  } catch (err) {
- console.log('Error creating inferred ingest pipeline', err);
+ logger.error({
+ err,
+ inferredPipelineName
+ }, 'Error creating inferred ingest pipeline');
  }
  }
  const settings = {
@@ -97,22 +129,54 @@ function createMappingFactory({
  'index.number_of_replicas': 0
  } : {})
  };
- const resp = await targetClient.indices.create({
+ const response = await targetClient.indices.create({
  index: targetIndexName,
  mappings: targetMappings,
  ...(Object.keys(settings).length > 0 ? {
  settings
  } : {})
  });
- if (verbose) console.log('Created target mapping', resp);
+ logger.info({
+ targetIndexName,
+ response
+ }, 'Created target mapping');
  }
  } catch (err) {
- console.log('Error creating target mapping', err);
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error creating target mapping');
  }
  }
  };
  }
 
+ function registerZstdCompression() {
+ if (compression_js.PARQUET_COMPRESSION_METHODS.ZSTD) {
+ return;
+ }
+ if (typeof zlib.zstdCompressSync !== 'function' || typeof zlib.zstdDecompressSync !== 'function') {
+ compression_js.PARQUET_COMPRESSION_METHODS.ZSTD = {
+ deflate() {
+ throw new Error('ZSTD compression requires Node.js with zstd support.');
+ },
+ inflate() {
+ throw new Error('ZSTD compression requires Node.js with zstd support.');
+ }
+ };
+ return;
+ }
+ compression_js.PARQUET_COMPRESSION_METHODS.ZSTD = {
+ deflate(value) {
+ return zlib.zstdCompressSync(value);
+ },
+ inflate(value) {
+ return zlib.zstdDecompressSync(value);
+ }
+ };
+ }
+ registerZstdCompression();
+
  function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
  const options = {
  bom: true,
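Note: the registerZstdCompression() helper added above wires parquetjs's compression table to the zstd functions that newer Node.js builds expose on the zlib module, and falls back to throwing stubs when they are missing. A quick local check (illustrative only, not part of the package) for whether the running Node.js build can service ZSTD-compressed Parquet files:

const zlib = require('zlib');
// true only when this Node.js build ships zstd bindings in zlib; otherwise the
// ZSTD codec registered above will throw as soon as it is exercised
console.log(typeof zlib.zstdCompressSync === 'function' && typeof zlib.zstdDecompressSync === 'function');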
@@ -128,8 +192,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
  return options;
  }
 
- function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
- function addParsedDoc(parsed, file, streamRef) {
+ function createPauseWaiter$1(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed, file) {
  const context = {
  fileName: file
  };
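Note: the new createPauseWaiter$1() helper turns the indexer queue's 'pause'/'resume' events into an awaitable, so the pull-based Parquet and Arrow readers below can stop pulling rows while the bulk queue applies backpressure. A minimal sketch of the pattern (the helper is private to the bundle, so this is illustrative rather than a usable API):

const EventEmitter = require('events');
const emitter = new EventEmitter();
const { waitIfPaused, cleanup } = createPauseWaiter$1(emitter);
async function produce(rows) {
  for (const row of rows) {
    // hand the row to the consumer here, then block while the queue is paused
    await waitIfPaused(); // resolves immediately unless 'pause' was the last event
  }
  cleanup(); // detach listeners and release any pending waiters
}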
@@ -137,7 +229,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 
  // if doc is null/undefined we'll skip indexing it
  if (doc === null || typeof doc === 'undefined') {
- streamRef.resume();
  return;
  }
 
@@ -152,9 +243,101 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
  }
  indexer.add(doc);
  }
- function createNdjsonReader(file) {
+ async function processParquetFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ const reader = await parquet.ParquetReader.openFile(file);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ try {
+ const reader = await arrow__namespace.RecordBatchReader.from(fs.createReadStream(file));
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ }
+ }
+ function processStreamFile(file, buildStream, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildStream();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info({
+ file
+ }, 'Read entire file');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err,
+ file
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processNdjsonFile(file) {
  let skippedHeader = false;
- const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
@@ -165,72 +348,115 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
  return;
  }
  const parsed = JSON.parse(line);
- addParsedDoc(parsed, file, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(parsed, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process NDJSON line');
  }
  }).on('error', err => {
- console.log('Error while reading file.', err);
- }));
- return s;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading file');
+ })), 'Error while reading file');
  }
- function createCsvReader(file) {
+ function processCsvFile(file) {
  const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
- const s = fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
  try {
- addParsedDoc(record, file, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(record, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process CSV record');
  }
  }).on('error', err => {
- console.log('Error while reading CSV file.', err);
- }));
- return s;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading CSV file');
+ })), 'Error while reading CSV file');
  }
- function startIndex(files) {
- let finished = false;
+ async function processFile(file) {
+ if (sourceFormat === 'csv') {
+ await processCsvFile(file);
+ return;
+ }
+ if (sourceFormat === 'ndjson') {
+ await processNdjsonFile(file);
+ return;
+ }
+ if (sourceFormat === 'parquet') {
+ await processParquetFile(file);
+ return;
+ }
+ if (sourceFormat === 'arrow') {
+ await processArrowFile(file);
+ return;
+ }
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ async function startIndex(files) {
  if (files.length === 0) {
  indexer.finish();
  return;
  }
- const file = files.shift();
- const s = sourceFormat === 'csv' ? createCsvReader(file) : createNdjsonReader(file);
- s.on('end', () => {
- if (verbose) console.log('Read entire file: ', file);
- if (files.length > 0) {
- startIndex(files);
- return;
+ try {
+ for (const file of files) {
+ // eslint-disable-next-line no-await-in-loop
+ await processFile(file);
  }
+ } catch (err) {
+ logger.error({
+ err,
+ files
+ }, 'Error while processing files');
+ } finally {
  indexer.finish();
- finished = true;
- });
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  try {
  const files = glob.globSync(fileName);
  startIndex(files);
- } catch (error) {
- console.log('Error matching files:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ fileName
+ }, 'Error matching files');
+ indexer.finish();
  }
  };
  }
 
  const EventEmitter = require('events');
  const parallelCalls = 5;
+ const MAX_SAFE_BIGINT = BigInt(Number.MAX_SAFE_INTEGER);
+ const MIN_SAFE_BIGINT = BigInt(Number.MIN_SAFE_INTEGER);
+ function coerceBigInt(value) {
+ if (value >= MIN_SAFE_BIGINT && value <= MAX_SAFE_BIGINT) {
+ return Number(value);
+ }
+ return value.toString();
+ }
+ function safeStringify(doc) {
+ return JSON.stringify(doc, (_key, value) => {
+ if (typeof value === 'bigint') {
+ return coerceBigInt(value);
+ }
+ return value;
+ });
+ }
 
  // a simple helper queue to bulk index documents
  function indexQueueFactory({
  targetClient: client,
  targetIndexName,
- bufferSize = DEFAULT_BUFFER_SIZE
+ bufferSize = DEFAULT_BUFFER_SIZE,
+ logger
  }) {
  const queueEmitter = new EventEmitter();
  let docsPerSecond = 0;
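Note: Parquet and Arrow readers can yield BigInt values for 64-bit integer columns, and JSON.stringify() throws on BigInt. The safeStringify() helper added above coerces a BigInt field to a Number when it fits in the safe integer range and to a string otherwise. Illustrative output only:

// small fits in Number.MAX_SAFE_INTEGER, huge does not
safeStringify({ small: 42n, huge: 9007199254740993n });
// => '{"small":42,"huge":"9007199254740993"}'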
@@ -263,8 +489,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(line); // Parse and yield the JSON object
  } catch (err) {
- // Handle JSON parse errors if necessary
- console.error('Failed to parse JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse JSON from NDJSON stream');
  }
  }
  }
@@ -274,7 +501,9 @@
  try {
  yield JSON.parse(buffer);
  } catch (err) {
- console.error('Failed to parse final JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse final JSON from NDJSON stream');
  }
  }
  } finally {
@@ -300,7 +529,7 @@ function indexQueueFactory({
  flushInterval: 1000,
  refreshOnCompletion: true,
  datasource: ndjsonStreamIterator(stream$1),
- onDocument(doc) {
+ onDocument() {
  docsPerSecond++;
  return {
  index: {
@@ -309,9 +538,13 @@
  };
  }
  });
- } catch (error) {
- console.error('Error during bulk indexing:', error);
- throw error;
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error during bulk indexing');
+ queueEmitter.emit('error', err);
+ throw err;
  } finally {
  // Clean up interval
  clearInterval(interval);
@@ -340,7 +573,7 @@
  if (finished) {
  throw new Error('Unexpected doc added after indexer should finish.');
  }
- const canContinue = stream$1.write(`${JSON.stringify(doc)}\n`);
+ const canContinue = stream$1.write(`${safeStringify(doc)}\n`);
  if (!canContinue) {
  queueEmitter.emit('pause');
 
@@ -361,7 +594,7 @@
 
  // create a new progress bar instance and use shades_classic theme
  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
- function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
  return async function indexReader() {
  let docsNum = 0;
  let scrollId;
@@ -380,8 +613,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  maxRetries: 0
  });
  return Object.keys(response.fields);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Failed to fetch populated fields');
  }
  }
  function search(fields) {
@@ -425,8 +661,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  return;
  }
  indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process source index document');
  }
  }
  async function fetchNextResponse() {
@@ -497,17 +735,25 @@ async function inferMappingsFromSource({
  mappings,
  inferMappings,
  inferMappingsOptions,
- verbose
+ logger
  }) {
  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
  return emptyInferenceResult(mappings);
  }
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+ logger.info({
+ sourceFormat
+ }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+ return emptyInferenceResult(mappings);
+ }
  if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
  return emptyInferenceResult(mappings);
  }
  const files = glob.globSync(fileName);
  if (files.length === 0) {
- if (verbose) console.log(`No files matched for mapping inference: ${fileName}`);
+ logger.info({
+ fileName
+ }, 'No files matched for mapping inference');
  return emptyInferenceResult(mappings);
  }
  const {
@@ -516,7 +762,7 @@
  } = inferMappingsOptions || {};
  const sampleText = readSample(files[0], sampleBytes);
  if (!sampleText || sampleText.trim() === '') {
- if (verbose) console.log('Skipping mapping inference because the sample text is empty.');
+ logger.info('Skipping mapping inference because the sample text is empty');
  return emptyInferenceResult(mappings);
  }
  const params = {
@@ -543,31 +789,98 @@
  }
  try {
  const response = await targetClient.textStructure.findStructure(params);
- if (response?.mappings && verbose) {
- console.log(`Inferred mappings via _text_structure/find_structure from ${files[0]}`);
+ if (response?.mappings) {
+ logger.info({
+ file: files[0]
+ }, 'Inferred mappings via _text_structure/find_structure');
  }
- if (response?.ingest_pipeline && verbose) {
- console.log('Inferred ingest pipeline via _text_structure/find_structure');
+ if (response?.ingest_pipeline) {
+ logger.info('Inferred ingest pipeline via _text_structure/find_structure');
  }
  return {
  mappings: response?.mappings || mappings,
  ingestPipeline: response?.ingest_pipeline
  };
- } catch (error) {
- if (verbose) {
- console.log('Could not infer mappings via _text_structure/find_structure:', error.message);
- }
+ } catch (err) {
+ logger.warn({
+ err
+ }, 'Could not infer mappings via _text_structure/find_structure');
  return emptyInferenceResult(mappings);
  }
  }
 
- function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
- function addParsedDoc(parsed, streamRef) {
+ const DEFAULT_LOG_LEVEL = 'info';
+ function resolveLogLevel(verbose = true) {
+ if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+ return process.env.LOG_LEVEL;
+ }
+ return verbose ? DEFAULT_LOG_LEVEL : 'error';
+ }
+ function createLogger({
+ logger,
+ verbose = true
+ } = {}) {
+ if (logger && typeof logger === 'object') {
+ return logger;
+ }
+ return pino({
+ name: 'node-es-transformer',
+ level: resolveLogLevel(verbose),
+ timestamp: pino.stdTimeFunctions.isoTime,
+ serializers: {
+ err: pino.stdSerializers.err,
+ error: pino.stdSerializers.err
+ }
+ });
+ }
+ function createChildLogger(logger, bindings) {
+ if (!logger || typeof logger.child !== 'function') {
+ return logger;
+ }
+ return logger.child(bindings);
+ }
+
+ function createPauseWaiter(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ async function readStreamToBuffer(stream) {
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+ }
+ return Buffer.concat(chunks);
+ }
+ function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed) {
  const doc = typeof transform === 'function' ? transform(parsed) : parsed;
 
  // if doc is null/undefined we'll skip indexing it
  if (doc === null || typeof doc === 'undefined') {
- streamRef.resume();
  return;
  }
 
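Note: createLogger() returns a caller-supplied logger object untouched; otherwise it builds a pino logger whose level comes from the LOG_LEVEL environment variable when set, and from the verbose flag ('info' vs. 'error') when not. A short sketch of that resolution (assumes pino is installed; not taken from the package docs):

process.env.LOG_LEVEL = 'debug';
const log = createLogger({ verbose: false });
// log.level === 'debug'  -- LOG_LEVEL wins over the verbose flag
delete process.env.LOG_LEVEL;
// createLogger({ verbose: true }).level === 'info'
// createLogger({ verbose: false }).level === 'error'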
@@ -582,50 +895,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
  }
  indexer.add(doc);
  }
- function startIndex() {
- let finished = false;
- const s = sourceFormat === 'csv' ? stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
+ async function processParquetStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ const parquetBuffer = await readStreamToBuffer(stream);
+ const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ try {
+ const reader = await arrow__namespace.RecordBatchReader.from(stream);
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ }
+ }
+ function processPipeline(buildPipeline, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildPipeline();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info('Read entire stream');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processCsvStream() {
+ return processPipeline(() => stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
  try {
- addParsedDoc(record, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(record);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process CSV stream record');
  }
  }).on('error', err => {
- console.log('Error while reading CSV stream.', err);
- })) : (() => {
- let skippedHeader = false;
- return stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
- try {
- // skip empty lines
- if (line === '') {
- return;
- }
- if (skipHeader && !skippedHeader) {
- skippedHeader = true;
- return;
- }
- const parsed = JSON.parse(line);
- addParsedDoc(parsed, s);
- } catch (e) {
- console.log('error', e);
+ logger.error({
+ err
+ }, 'Error while reading CSV stream');
+ })), 'Error while reading CSV stream');
+ }
+ function processNdjsonStream() {
+ let skippedHeader = false;
+ return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ try {
+ // skip empty lines
+ if (line === '') {
+ return;
  }
- }).on('error', err => {
- console.log('Error while reading stream.', err);
- }));
- })();
- s.on('end', () => {
- if (verbose) console.log('Read entire stream.');
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
+ return;
+ }
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process NDJSON stream line');
+ }
+ }).on('error', err => {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ })), 'Error while reading stream');
+ }
+ async function startIndex() {
+ try {
+ if (sourceFormat === 'csv') {
+ await processCsvStream();
+ } else if (sourceFormat === 'ndjson') {
+ await processNdjsonStream();
+ } else if (sourceFormat === 'parquet') {
+ await processParquetStream();
+ } else if (sourceFormat === 'arrow') {
+ await processArrowStream();
+ } else {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ } finally {
  indexer.finish();
- finished = true;
- });
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  startIndex();
@@ -721,11 +1136,16 @@ async function transformer({
  query,
  skipHeader = false,
  transform,
- verbose = true
+ verbose = true,
+ logger: loggerInput
  }) {
  if (typeof targetIndexName === 'undefined') {
  throw Error('targetIndexName must be specified.');
  }
+ const logger = createLogger({
+ logger: loggerInput,
+ verbose
+ });
  const defaultClientConfig = {
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
  };
@@ -742,7 +1162,9 @@
  mappings,
  inferMappings,
  inferMappingsOptions,
- verbose
+ logger: createChildLogger(logger, {
+ component: 'mapping-inference'
+ })
  });
  const createMapping = createMappingFactory({
  sourceClient,
@@ -753,17 +1175,23 @@
  inferredIngestPipeline: inferenceResult.ingestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger: createChildLogger(logger, {
+ component: 'create-mapping'
+ })
  });
  const indexer = indexQueueFactory({
  targetClient,
  targetIndexName,
- bufferSize});
+ bufferSize,
+ logger: createChildLogger(logger, {
+ component: 'index-queue'
+ })
+ });
  function validateSourceFormat() {
- if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
- throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "csv".`);
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
  }
  }
  function getReader() {
@@ -775,18 +1203,27 @@ async function transformer({
  }
  if (typeof fileName !== 'undefined') {
  validateSourceFormat();
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+ return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'file-reader'
+ }));
  }
  if (typeof sourceIndexName !== 'undefined') {
- return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+ return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+ component: 'index-reader'
+ }));
  }
  if (typeof stream !== 'undefined') {
  validateSourceFormat();
- return streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+ return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'stream-reader'
+ }));
  }
  return null;
  }
  const reader = getReader();
+ if (typeof reader !== 'function') {
+ throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+ }
  try {
  const indexExists = await targetClient.indices.exists({
  index: targetIndexName
@@ -803,8 +1240,11 @@
  } else {
  reader();
  }
- } catch (error) {
- console.error('Error checking index existence:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error checking index existence');
  } finally {
  // targetClient.close();
  }
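Taken together, 1.2.x replaces the verbose console logging with an injectable pino-based logger and extends the file and stream readers to Parquet and Arrow sources. A hedged usage sketch based only on the options visible in this diff (the require/export shape is an assumption, since the export statement is not part of the hunks above):

const { transformer } = require('node-es-transformer');
const pino = require('pino');

transformer({
  fileName: 'data/*.parquet',
  sourceFormat: 'parquet',          // newly accepted alongside 'arrow', 'ndjson' and 'csv'
  targetIndexName: 'my-index',
  logger: pino({ level: 'debug' }), // optional; if omitted, a default pino logger is created
  verbose: true
});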