node-es-transformer 1.1.0 → 1.2.1

This diff compares the contents of two publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.
@@ -1,5 +1,9 @@
  import elasticsearch9 from 'es9';
  import elasticsearch8 from 'es8';
+ import parquet from '@dsnp/parquetjs';
+ import zlib from 'zlib';
+ import { PARQUET_COMPRESSION_METHODS } from '@dsnp/parquetjs/dist/lib/compression.js';
+ import * as arrow from 'apache-arrow';
  import fs from 'fs';
  import { parse } from 'csv-parse';
  import es from 'event-stream';
@@ -7,6 +11,7 @@ import { globSync } from 'glob';
  import split from 'split2';
  import { PassThrough } from 'stream';
  import cliProgress from 'cli-progress';
+ import pino from 'pino';

  // In earlier versions this was used to set the number of docs to index in a
  // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -27,9 +32,9 @@ function createMappingFactory({
  inferredIngestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger
  }) {
  return async () => {
  let targetMappings = mappingsOverride ? undefined : mappings;
@@ -48,7 +53,10 @@ function createMappingFactory({
  }
  }
  } catch (err) {
- console.log('Error reading source mapping', err);
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Error reading source mapping');
  return;
  }
  }
@@ -80,9 +88,14 @@ function createMappingFactory({
  ...inferredIngestPipeline
  });
  defaultPipeline = inferredPipelineName;
- if (verbose) console.log(`Created inferred ingest pipeline ${inferredPipelineName}`);
+ logger.info({
+ inferredPipelineName
+ }, 'Created inferred ingest pipeline');
  } catch (err) {
- console.log('Error creating inferred ingest pipeline', err);
+ logger.error({
+ err,
+ inferredPipelineName
+ }, 'Error creating inferred ingest pipeline');
  }
  }
  const settings = {
@@ -95,22 +108,54 @@ function createMappingFactory({
  'index.number_of_replicas': 0
  } : {})
  };
- const resp = await targetClient.indices.create({
+ const response = await targetClient.indices.create({
  index: targetIndexName,
  mappings: targetMappings,
  ...(Object.keys(settings).length > 0 ? {
  settings
  } : {})
  });
- if (verbose) console.log('Created target mapping', resp);
+ logger.info({
+ targetIndexName,
+ response
+ }, 'Created target mapping');
  }
  } catch (err) {
- console.log('Error creating target mapping', err);
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error creating target mapping');
  }
  }
  };
  }

+ function registerZstdCompression() {
+ if (PARQUET_COMPRESSION_METHODS.ZSTD) {
+ return;
+ }
+ if (typeof zlib.zstdCompressSync !== 'function' || typeof zlib.zstdDecompressSync !== 'function') {
+ PARQUET_COMPRESSION_METHODS.ZSTD = {
+ deflate() {
+ throw new Error('ZSTD compression requires Node.js with zstd support.');
+ },
+ inflate() {
+ throw new Error('ZSTD compression requires Node.js with zstd support.');
+ }
+ };
+ return;
+ }
+ PARQUET_COMPRESSION_METHODS.ZSTD = {
+ deflate(value) {
+ return zlib.zstdCompressSync(value);
+ },
+ inflate(value) {
+ return zlib.zstdDecompressSync(value);
+ }
+ };
+ }
+ registerZstdCompression();
+
  function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
  const options = {
  bom: true,
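The registerZstdCompression() block above only installs working ZSTD codecs when the running Node.js build exposes zstd bindings in the zlib module; otherwise it registers stubs that throw on use. A minimal sketch of the same capability check, which can be run before attempting to read ZSTD-compressed Parquet files (the zlib function names are taken from the diff above):

import zlib from 'zlib';

// Mirrors the guard in registerZstdCompression(): zstd functions in zlib are
// only present in newer Node.js builds.
const hasZstd =
  typeof zlib.zstdCompressSync === 'function' &&
  typeof zlib.zstdDecompressSync === 'function';

if (!hasZstd) {
  console.warn('This Node.js build lacks zstd support; ZSTD-compressed Parquet files cannot be read.');
}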
@@ -126,8 +171,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
  return options;
  }

- function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
- function addParsedDoc(parsed, file, streamRef) {
+ function createPauseWaiter$1(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed, file) {
  const context = {
  fileName: file
  };
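createPauseWaiter$1() above turns the indexer queue's 'pause'/'resume' events into an awaitable promise, which is how the new pull-based Parquet and Arrow readers in this release apply backpressure (the stream-based NDJSON and CSV readers keep calling pause()/resume() on the stream directly). A minimal sketch of the intended usage inside an async reader loop, assuming an indexer with the queueEmitter shown in this diff and rows as a stand-in for any cursor-style source:

async function indexRows(rows, indexer) {
  const { waitIfPaused, cleanup } = createPauseWaiter$1(indexer.queueEmitter);
  try {
    for (const row of rows) {
      indexer.add(row);
      // Blocks while the bulk queue has signalled back-pressure via 'pause'.
      await waitIfPaused();
    }
  } finally {
    // Always detach the 'pause'/'resume' listeners and release any waiters.
    cleanup();
  }
}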
@@ -135,7 +208,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk

  // if doc is null/undefined we'll skip indexing it
  if (doc === null || typeof doc === 'undefined') {
- streamRef.resume();
  return;
  }

@@ -150,9 +222,101 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
  }
  indexer.add(doc);
  }
- function createNdjsonReader(file) {
+ async function processParquetFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ const reader = await parquet.ParquetReader.openFile(file);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowFile(file) {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter$1(indexer.queueEmitter);
+ try {
+ const reader = await arrow.RecordBatchReader.from(fs.createReadStream(file));
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row, file);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info({
+ file
+ }, 'Read entire file');
+ } finally {
+ cleanup();
+ }
+ }
+ function processStreamFile(file, buildStream, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildStream();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info({
+ file
+ }, 'Read entire file');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err,
+ file
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processNdjsonFile(file) {
  let skippedHeader = false;
- const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
  try {
  // skip empty lines
  if (line === '') {
@@ -163,72 +327,115 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
  return;
  }
  const parsed = JSON.parse(line);
- addParsedDoc(parsed, file, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(parsed, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process NDJSON line');
  }
  }).on('error', err => {
- console.log('Error while reading file.', err);
- }));
- return s;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading file');
+ })), 'Error while reading file');
  }
- function createCsvReader(file) {
+ function processCsvFile(file) {
  const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
- const s = fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
+ return processStreamFile(file, () => fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
  try {
- addParsedDoc(record, file, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(record, file);
+ } catch (err) {
+ logger.error({
+ err,
+ file
+ }, 'Failed to process CSV record');
  }
  }).on('error', err => {
- console.log('Error while reading CSV file.', err);
- }));
- return s;
+ logger.error({
+ err,
+ file
+ }, 'Error while reading CSV file');
+ })), 'Error while reading CSV file');
  }
- function startIndex(files) {
- let finished = false;
+ async function processFile(file) {
+ if (sourceFormat === 'csv') {
+ await processCsvFile(file);
+ return;
+ }
+ if (sourceFormat === 'ndjson') {
+ await processNdjsonFile(file);
+ return;
+ }
+ if (sourceFormat === 'parquet') {
+ await processParquetFile(file);
+ return;
+ }
+ if (sourceFormat === 'arrow') {
+ await processArrowFile(file);
+ return;
+ }
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ async function startIndex(files) {
  if (files.length === 0) {
  indexer.finish();
  return;
  }
- const file = files.shift();
- const s = sourceFormat === 'csv' ? createCsvReader(file) : createNdjsonReader(file);
- s.on('end', () => {
- if (verbose) console.log('Read entire file: ', file);
- if (files.length > 0) {
- startIndex(files);
- return;
+ try {
+ for (const file of files) {
+ // eslint-disable-next-line no-await-in-loop
+ await processFile(file);
  }
+ } catch (err) {
+ logger.error({
+ err,
+ files
+ }, 'Error while processing files');
+ } finally {
  indexer.finish();
- finished = true;
- });
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  try {
  const files = globSync(fileName);
  startIndex(files);
- } catch (error) {
- console.log('Error matching files:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ fileName
+ }, 'Error matching files');
+ indexer.finish();
  }
  };
  }

  const EventEmitter = require('events');
  const parallelCalls = 5;
+ const MAX_SAFE_BIGINT = BigInt(Number.MAX_SAFE_INTEGER);
+ const MIN_SAFE_BIGINT = BigInt(Number.MIN_SAFE_INTEGER);
+ function coerceBigInt(value) {
+ if (value >= MIN_SAFE_BIGINT && value <= MAX_SAFE_BIGINT) {
+ return Number(value);
+ }
+ return value.toString();
+ }
+ function safeStringify(doc) {
+ return JSON.stringify(doc, (_key, value) => {
+ if (typeof value === 'bigint') {
+ return coerceBigInt(value);
+ }
+ return value;
+ });
+ }
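safeStringify() exists because the new Parquet and Arrow readers can surface BigInt values, which plain JSON.stringify() rejects with a TypeError. coerceBigInt() keeps values inside Number's safe integer range as numbers and falls back to decimal strings otherwise. A small illustration of the behaviour, assuming the two functions above:

// JSON.stringify({ id: 9007199254740993n }) throws:
// TypeError: Do not know how to serialize a BigInt
safeStringify({ small: 42n, large: 9007199254740993n });
// => '{"small":42,"large":"9007199254740993"}'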

  // a simple helper queue to bulk index documents
  function indexQueueFactory({
  targetClient: client,
  targetIndexName,
- bufferSize = DEFAULT_BUFFER_SIZE
+ bufferSize = DEFAULT_BUFFER_SIZE,
+ logger
  }) {
  const queueEmitter = new EventEmitter();
  let docsPerSecond = 0;
@@ -261,8 +468,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(line); // Parse and yield the JSON object
  } catch (err) {
- // Handle JSON parse errors if necessary
- console.error('Failed to parse JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse JSON from NDJSON stream');
  }
  }
  }
@@ -272,7 +480,9 @@ function indexQueueFactory({
  try {
  yield JSON.parse(buffer);
  } catch (err) {
- console.error('Failed to parse final JSON:', err);
+ logger.error({
+ err
+ }, 'Failed to parse final JSON from NDJSON stream');
  }
  }
  } finally {
@@ -298,7 +508,7 @@ function indexQueueFactory({
  flushInterval: 1000,
  refreshOnCompletion: true,
  datasource: ndjsonStreamIterator(stream),
- onDocument(doc) {
+ onDocument() {
  docsPerSecond++;
  return {
  index: {
@@ -307,9 +517,13 @@ function indexQueueFactory({
  };
  }
  });
- } catch (error) {
- console.error('Error during bulk indexing:', error);
- throw error;
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error during bulk indexing');
+ queueEmitter.emit('error', err);
+ throw err;
  } finally {
  // Clean up interval
  clearInterval(interval);
@@ -338,7 +552,7 @@ function indexQueueFactory({
  if (finished) {
  throw new Error('Unexpected doc added after indexer should finish.');
  }
- const canContinue = stream.write(`${JSON.stringify(doc)}\n`);
+ const canContinue = stream.write(`${safeStringify(doc)}\n`);
  if (!canContinue) {
  queueEmitter.emit('pause');

@@ -359,7 +573,7 @@ function indexQueueFactory({

  // create a new progress bar instance and use shades_classic theme
  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
- function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
  return async function indexReader() {
  let docsNum = 0;
  let scrollId;
@@ -378,8 +592,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  maxRetries: 0
  });
  return Object.keys(response.fields);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err,
+ sourceIndexName
+ }, 'Failed to fetch populated fields');
  }
  }
  function search(fields) {
@@ -423,8 +640,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
  return;
  }
  indexer.add(doc);
- } catch (e) {
- console.log('error', e);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process source index document');
  }
  }
  async function fetchNextResponse() {
@@ -495,17 +714,25 @@ async function inferMappingsFromSource({
  mappings,
  inferMappings,
  inferMappingsOptions,
- verbose
+ logger
  }) {
  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
  return emptyInferenceResult(mappings);
  }
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+ logger.info({
+ sourceFormat
+ }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+ return emptyInferenceResult(mappings);
+ }
  if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
  return emptyInferenceResult(mappings);
  }
  const files = globSync(fileName);
  if (files.length === 0) {
- if (verbose) console.log(`No files matched for mapping inference: ${fileName}`);
+ logger.info({
+ fileName
+ }, 'No files matched for mapping inference');
  return emptyInferenceResult(mappings);
  }
  const {
@@ -514,7 +741,7 @@ async function inferMappingsFromSource({
  } = inferMappingsOptions || {};
  const sampleText = readSample(files[0], sampleBytes);
  if (!sampleText || sampleText.trim() === '') {
- if (verbose) console.log('Skipping mapping inference because the sample text is empty.');
+ logger.info('Skipping mapping inference because the sample text is empty');
  return emptyInferenceResult(mappings);
  }
  const params = {
@@ -541,31 +768,98 @@ async function inferMappingsFromSource({
  }
  try {
  const response = await targetClient.textStructure.findStructure(params);
- if (response?.mappings && verbose) {
- console.log(`Inferred mappings via _text_structure/find_structure from ${files[0]}`);
+ if (response?.mappings) {
+ logger.info({
+ file: files[0]
+ }, 'Inferred mappings via _text_structure/find_structure');
  }
- if (response?.ingest_pipeline && verbose) {
- console.log('Inferred ingest pipeline via _text_structure/find_structure');
+ if (response?.ingest_pipeline) {
+ logger.info('Inferred ingest pipeline via _text_structure/find_structure');
  }
  return {
  mappings: response?.mappings || mappings,
  ingestPipeline: response?.ingest_pipeline
  };
- } catch (error) {
- if (verbose) {
- console.log('Could not infer mappings via _text_structure/find_structure:', error.message);
- }
+ } catch (err) {
+ logger.warn({
+ err
+ }, 'Could not infer mappings via _text_structure/find_structure');
  return emptyInferenceResult(mappings);
  }
  }

- function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
- function addParsedDoc(parsed, streamRef) {
+ const DEFAULT_LOG_LEVEL = 'info';
+ function resolveLogLevel(verbose = true) {
+ if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+ return process.env.LOG_LEVEL;
+ }
+ return verbose ? DEFAULT_LOG_LEVEL : 'error';
+ }
+ function createLogger({
+ logger,
+ verbose = true
+ } = {}) {
+ if (logger && typeof logger === 'object') {
+ return logger;
+ }
+ return pino({
+ name: 'node-es-transformer',
+ level: resolveLogLevel(verbose),
+ timestamp: pino.stdTimeFunctions.isoTime,
+ serializers: {
+ err: pino.stdSerializers.err,
+ error: pino.stdSerializers.err
+ }
+ });
+ }
+ function createChildLogger(logger, bindings) {
+ if (!logger || typeof logger.child !== 'function') {
+ return logger;
+ }
+ return logger.child(bindings);
+ }
+
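createLogger() above returns any logger object that is passed in unchanged, and otherwise builds a pino instance whose level comes from the LOG_LEVEL environment variable or, failing that, from the existing verbose flag; createChildLogger() then derives per-component loggers when the logger exposes .child(). A hedged usage sketch, assuming the package's default export is the transformer() function shown later in this diff (the pino() call is only one way to build a compatible logger):

import pino from 'pino';
import transformer from 'node-es-transformer';

// Option 1: rely on the built-in logger and tune it via the environment,
// e.g. LOG_LEVEL=debug node index.js

// Option 2: pass your own logger; it is used as-is, and per-component child
// loggers are derived from it when it provides .child().
await transformer({
  fileName: 'data.ndjson',
  targetIndexName: 'my-index',
  logger: pino({ name: 'my-app', level: 'warn' })
});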
+ function createPauseWaiter(queueEmitter) {
+ let paused = false;
+ let waiters = [];
+ const onPause = () => {
+ paused = true;
+ };
+ const onResume = () => {
+ paused = false;
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ };
+ queueEmitter.on('pause', onPause);
+ queueEmitter.on('resume', onResume);
+ return {
+ async waitIfPaused() {
+ if (!paused) return;
+ await new Promise(resolve => {
+ waiters.push(resolve);
+ });
+ },
+ cleanup() {
+ queueEmitter.removeListener('pause', onPause);
+ queueEmitter.removeListener('resume', onResume);
+ waiters.forEach(resolve => resolve());
+ waiters = [];
+ }
+ };
+ }
+ async function readStreamToBuffer(stream) {
+ const chunks = [];
+ for await (const chunk of stream) {
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+ }
+ return Buffer.concat(chunks);
+ }
+ function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+ function addParsedDoc(parsed) {
  const doc = typeof transform === 'function' ? transform(parsed) : parsed;

  // if doc is null/undefined we'll skip indexing it
  if (doc === null || typeof doc === 'undefined') {
- streamRef.resume();
  return;
  }

@@ -580,50 +874,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
  }
  indexer.add(doc);
  }
- function startIndex() {
- let finished = false;
- const s = sourceFormat === 'csv' ? stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
+ async function processParquetStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ const parquetBuffer = await readStreamToBuffer(stream);
+ const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+ try {
+ const cursor = reader.getCursor();
+ while (true) {
+ // eslint-disable-next-line no-await-in-loop
+ const row = await cursor.next();
+ if (row === null || typeof row === 'undefined') {
+ break;
+ }
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ await reader.close();
+ }
+ }
+ async function processArrowStream() {
+ const {
+ waitIfPaused,
+ cleanup
+ } = createPauseWaiter(indexer.queueEmitter);
+ try {
+ const reader = await arrow.RecordBatchReader.from(stream);
+ for await (const recordBatch of reader) {
+ const {
+ fields
+ } = recordBatch.schema;
+ for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+ const row = {};
+ fields.forEach(field => {
+ const vector = recordBatch.getChild(field.name);
+ row[field.name] = vector ? vector.get(rowIndex) : undefined;
+ });
+ addParsedDoc(row);
+ // eslint-disable-next-line no-await-in-loop
+ await waitIfPaused();
+ }
+ }
+ logger.info('Read entire stream');
+ } finally {
+ cleanup();
+ }
+ }
+ function processPipeline(buildPipeline, errorMessage) {
+ return new Promise((resolve, reject) => {
+ let finished = false;
+ const s = buildPipeline();
+ const onPause = () => {
+ if (finished) return;
+ s.pause();
+ };
+ const onResume = () => {
+ if (finished) return;
+ s.resume();
+ };
+ function cleanup() {
+ indexer.queueEmitter.removeListener('pause', onPause);
+ indexer.queueEmitter.removeListener('resume', onResume);
+ }
+ indexer.queueEmitter.on('pause', onPause);
+ indexer.queueEmitter.on('resume', onResume);
+ s.on('end', () => {
+ finished = true;
+ cleanup();
+ logger.info('Read entire stream');
+ resolve();
+ });
+ s.on('error', err => {
+ finished = true;
+ cleanup();
+ logger.error({
+ err
+ }, errorMessage);
+ reject(err);
+ });
+ });
+ }
+ function processCsvStream() {
+ return processPipeline(() => stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
  try {
- addParsedDoc(record, s);
- } catch (e) {
- console.log('error', e);
+ addParsedDoc(record);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process CSV stream record');
  }
  }).on('error', err => {
- console.log('Error while reading CSV stream.', err);
- })) : (() => {
- let skippedHeader = false;
- return stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
- try {
- // skip empty lines
- if (line === '') {
- return;
- }
- if (skipHeader && !skippedHeader) {
- skippedHeader = true;
- return;
- }
- const parsed = JSON.parse(line);
- addParsedDoc(parsed, s);
- } catch (e) {
- console.log('error', e);
+ logger.error({
+ err
+ }, 'Error while reading CSV stream');
+ })), 'Error while reading CSV stream');
+ }
+ function processNdjsonStream() {
+ let skippedHeader = false;
+ return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+ try {
+ // skip empty lines
+ if (line === '') {
+ return;
  }
- }).on('error', err => {
- console.log('Error while reading stream.', err);
- }));
- })();
- s.on('end', () => {
- if (verbose) console.log('Read entire stream.');
+ if (skipHeader && !skippedHeader) {
+ skippedHeader = true;
+ return;
+ }
+ const parsed = JSON.parse(line);
+ addParsedDoc(parsed);
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Failed to process NDJSON stream line');
+ }
+ }).on('error', err => {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ })), 'Error while reading stream');
+ }
+ async function startIndex() {
+ try {
+ if (sourceFormat === 'csv') {
+ await processCsvStream();
+ } else if (sourceFormat === 'ndjson') {
+ await processNdjsonStream();
+ } else if (sourceFormat === 'parquet') {
+ await processParquetStream();
+ } else if (sourceFormat === 'arrow') {
+ await processArrowStream();
+ } else {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+ }
+ } catch (err) {
+ logger.error({
+ err
+ }, 'Error while reading stream');
+ } finally {
  indexer.finish();
- finished = true;
- });
- indexer.queueEmitter.on('pause', () => {
- if (finished) return;
- s.pause();
- });
- indexer.queueEmitter.on('resume', () => {
- if (finished) return;
- s.resume();
- });
+ }
  }
  return () => {
  startIndex();
@@ -719,11 +1115,16 @@ async function transformer({
  query,
  skipHeader = false,
  transform,
- verbose = true
+ verbose = true,
+ logger: loggerInput
  }) {
  if (typeof targetIndexName === 'undefined') {
  throw Error('targetIndexName must be specified.');
  }
+ const logger = createLogger({
+ logger: loggerInput,
+ verbose
+ });
  const defaultClientConfig = {
  node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
  };
@@ -740,7 +1141,9 @@ async function transformer({
  mappings,
  inferMappings,
  inferMappingsOptions,
- verbose
+ logger: createChildLogger(logger, {
+ component: 'mapping-inference'
+ })
  });
  const createMapping = createMappingFactory({
  sourceClient,
@@ -751,17 +1154,23 @@ async function transformer({
  inferredIngestPipeline: inferenceResult.ingestPipeline,
  mappingsOverride,
  indexMappingTotalFieldsLimit,
- verbose,
  deleteIndex,
- pipeline
+ pipeline,
+ logger: createChildLogger(logger, {
+ component: 'create-mapping'
+ })
  });
  const indexer = indexQueueFactory({
  targetClient,
  targetIndexName,
- bufferSize});
+ bufferSize,
+ logger: createChildLogger(logger, {
+ component: 'index-queue'
+ })
+ });
  function validateSourceFormat() {
- if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
- throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "csv".`);
+ if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+ throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
  }
  }
  function getReader() {
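validateSourceFormat() now accepts 'parquet' and 'arrow' alongside 'ndjson' and 'csv', for both file and stream inputs. A hedged sketch of the two new formats, assuming the default transformer export as above; file paths and index names are placeholders:

import fs from 'fs';
import transformer from 'node-es-transformer';

// Read Parquet files matched by a glob pattern.
await transformer({
  fileName: 'data/*.parquet',
  sourceFormat: 'parquet',
  targetIndexName: 'flights'
});

// Read an Arrow IPC stream from any readable stream.
await transformer({
  stream: fs.createReadStream('data/flights.arrow'),
  sourceFormat: 'arrow',
  targetIndexName: 'flights'
});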
@@ -773,18 +1182,27 @@ async function transformer({
  }
  if (typeof fileName !== 'undefined') {
  validateSourceFormat();
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+ return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'file-reader'
+ }));
  }
  if (typeof sourceIndexName !== 'undefined') {
- return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+ return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+ component: 'index-reader'
+ }));
  }
  if (typeof stream !== 'undefined') {
  validateSourceFormat();
- return streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
+ return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+ component: 'stream-reader'
+ }));
  }
  return null;
  }
  const reader = getReader();
+ if (typeof reader !== 'function') {
+ throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+ }
  try {
  const indexExists = await targetClient.indices.exists({
  index: targetIndexName
@@ -801,8 +1219,11 @@ async function transformer({
  } else {
  reader();
  }
- } catch (error) {
- console.error('Error checking index existence:', error);
+ } catch (err) {
+ logger.error({
+ err,
+ targetIndexName
+ }, 'Error checking index existence');
  } finally {
  // targetClient.close();
  }