node-es-transformer 1.0.0-beta1 → 1.0.0-beta3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,20 @@
1
1
  import fs from 'fs';
2
2
  import es from 'event-stream';
3
3
  import glob from 'glob';
4
+ import split from 'split2';
5
+ import { Readable } from 'stream';
4
6
  import cliProgress from 'cli-progress';
5
7
  import elasticsearch from '@elastic/elasticsearch';
6
8
 
7
- var DEFAULT_BUFFER_SIZE = 1000;
9
+ // In earlier versions this was used to set the number of docs to index in a
10
+ // single bulk request. Since we switched to use the helpers.bulk() method from
11
+ // the ES client, this now translates to the `flushBytes` option of the helper.
12
+ // However, for a degree of backwards compatibility with the old values, this uses
13
+ // KBytes instead of Bytes. It will be multiplied by 1024 in the index queue.
14
+ var DEFAULT_BUFFER_SIZE = 5120;
15
+
16
+ // The default number of docs to fetch in a single search request when reindexing.
17
+ var DEFAULT_SEARCH_SIZE = 1000;
8
18
 
9
19
  function createMappingFactory(ref) {
10
20
  var sourceClient = ref.sourceClient;
@@ -15,6 +25,7 @@ function createMappingFactory(ref) {
15
25
  var mappingsOverride = ref.mappingsOverride;
16
26
  var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
17
27
  var verbose = ref.verbose;
28
+ var deleteIndex = ref.deleteIndex;
18
29
 
19
30
  return async function () {
20
31
  var targetMappings = mappingsOverride ? undefined : mappings;
@@ -24,7 +35,14 @@ function createMappingFactory(ref) {
24
35
  var mapping = await sourceClient.indices.getMapping({
25
36
  index: sourceIndexName,
26
37
  });
27
- targetMappings = mapping[sourceIndexName].mappings;
38
+ if (mapping[sourceIndexName]) {
39
+ targetMappings = mapping[sourceIndexName].mappings;
40
+ } else {
41
+ var allMappings = Object.values(mapping);
42
+ if (allMappings.length > 0) {
43
+ targetMappings = Object.values(mapping)[0].mappings;
44
+ }
45
+ }
28
46
  } catch (err) {
29
47
  console.log('Error reading source mapping', err);
30
48
  return;
@@ -39,18 +57,28 @@ function createMappingFactory(ref) {
39
57
  }
40
58
 
41
59
  try {
42
- var resp = await targetClient.indices.create({
43
- index: targetIndexName,
44
- body: Object.assign({}, {mappings: targetMappings},
45
- (indexMappingTotalFieldsLimit !== undefined
46
- ? {
47
- settings: {
48
- 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
49
- },
50
- }
51
- : {})),
52
- });
53
- if (verbose) { console.log('Created target mapping', resp); }
60
+ var indexExists = await targetClient.indices.exists({ index: targetIndexName });
61
+
62
+ if (indexExists === true && deleteIndex === true) {
63
+ await targetClient.indices.delete({ index: targetIndexName });
64
+ }
65
+
66
+ if (indexExists === false || deleteIndex === true) {
67
+ var resp = await targetClient.indices.create({
68
+ index: targetIndexName,
69
+ body: Object.assign({}, {mappings: targetMappings},
70
+ (indexMappingTotalFieldsLimit !== undefined
71
+ ? {
72
+ settings: {
73
+ 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
74
+ 'index.number_of_shards': 1,
75
+ 'index.number_of_replicas': 0,
76
+ },
77
+ }
78
+ : {})),
79
+ });
80
+ if (verbose) { console.log('Created target mapping', resp); }
81
+ }
54
82
  } catch (err) {
55
83
  console.log('Error creating target mapping', err);
56
84
  }
@@ -58,22 +86,28 @@ function createMappingFactory(ref) {
58
86
  };
59
87
  }
60
88
 
61
- var MAX_QUEUE_SIZE = 15;
62
-
63
89
  function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
64
90
  function startIndex(files) {
65
- var ingestQueueSize = 0;
66
91
  var finished = false;
67
92
 
68
93
  var file = files.shift();
69
94
  var s = fs
70
95
  .createReadStream(file)
71
- .pipe(es.split(splitRegex))
96
+ .pipe(split(splitRegex))
72
97
  .pipe(
73
98
  es
74
99
  .mapSync(function (line) {
75
100
  try {
76
- var doc = typeof transform === 'function' ? transform(line) : line;
101
+ // skip empty lines
102
+ if (line === '') {
103
+ return;
104
+ }
105
+
106
+ var doc =
107
+ typeof transform === 'function'
108
+ ? JSON.stringify(transform(JSON.parse(line)))
109
+ : line;
110
+
77
111
  // if doc is undefined we'll skip indexing it
78
112
  if (typeof doc === 'undefined') {
79
113
  s.resume();
@@ -107,20 +141,13 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
107
141
  })
108
142
  );
109
143
 
110
- indexer.queueEmitter.on('queue-size', async function (size) {
144
+ indexer.queueEmitter.on('pause', function () {
111
145
  if (finished) { return; }
112
- ingestQueueSize = size;
113
-
114
- if (ingestQueueSize < MAX_QUEUE_SIZE) {
115
- s.resume();
116
- } else {
117
- s.pause();
118
- }
146
+ s.pause();
119
147
  });
120
148
 
121
149
  indexer.queueEmitter.on('resume', function () {
122
150
  if (finished) { return; }
123
- ingestQueueSize = 0;
124
151
  s.resume();
125
152
  });
126
153
  }
@@ -136,7 +163,7 @@ var EventEmitter = require('events');
136
163
 
137
164
  var queueEmitter = new EventEmitter();
138
165
 
139
- var parallelCalls = 1;
166
+ var parallelCalls = 5;
140
167
 
141
168
  // a simple helper queue to bulk index documents
142
169
  function indexQueueFactory(ref) {
@@ -146,78 +173,76 @@ function indexQueueFactory(ref) {
146
173
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
147
174
  var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
148
175
 
149
- var buffer = [];
150
- var queue = [];
151
- var ingesting = 0;
152
- var ingestTimes = [];
153
- var finished = false;
176
+ var flushBytes = bufferSize * 1024; // Convert KB to Bytes
177
+ var highWaterMark = flushBytes * parallelCalls;
154
178
 
155
- var ingest = function (b) {
156
- if (typeof b !== 'undefined') {
157
- queue.push(b);
158
- queueEmitter.emit('queue-size', queue.length);
159
- }
179
+ // Create a Readable stream
180
+ var stream = new Readable({
181
+ read: function read() {}, // Implement read but we manage pushing manually
182
+ highWaterMark: highWaterMark, // Buffer size for backpressure management
183
+ });
160
184
 
161
- if (ingestTimes.length > 5) { ingestTimes = ingestTimes.slice(-5); }
185
+ async function* ndjsonStreamIterator(readableStream) {
186
+ var buffer = ''; // To hold the incomplete data
187
+ var skippedHeader = false;
162
188
 
163
- if (ingesting < parallelCalls) {
164
- var docs = queue.shift();
189
+ // Iterate over the stream using async iteration
190
+ for await (var chunk of readableStream) {
191
+ buffer += chunk.toString(); // Accumulate the chunk data in the buffer
165
192
 
166
- queueEmitter.emit('queue-size', queue.length);
167
- if (queue.length <= 5) {
168
- queueEmitter.emit('resume');
169
- }
193
+ // Split the buffer into lines (NDJSON items)
194
+ var lines = buffer.split('\n');
170
195
 
171
- ingesting += 1;
172
-
173
- if (verbose)
174
- { console.log(("bulk ingest docs: " + (docs.length / 2) + ", queue length: " + (queue.length))); }
175
-
176
- var start = Date.now();
177
- client
178
- .bulk({ body: docs })
179
- .then(function () {
180
- var end = Date.now();
181
- var delta = end - start;
182
- ingestTimes.push(delta);
183
- ingesting -= 1;
184
-
185
- var ingestTimesMovingAverage =
186
- ingestTimes.length > 0
187
- ? ingestTimes.reduce(function (p, c) { return p + c; }, 0) / ingestTimes.length
188
- : 0;
189
- var ingestTimesMovingAverageSeconds = Math.floor(ingestTimesMovingAverage / 1000);
190
-
191
- if (
192
- ingestTimes.length > 0 &&
193
- ingestTimesMovingAverageSeconds < 30 &&
194
- parallelCalls < 10
195
- ) {
196
- parallelCalls += 1;
197
- } else if (
198
- ingestTimes.length > 0 &&
199
- ingestTimesMovingAverageSeconds >= 30 &&
200
- parallelCalls > 1
201
- ) {
202
- parallelCalls -= 1;
203
- }
196
+ // The last line might be incomplete, so hold it back in the buffer
197
+ buffer = lines.pop();
204
198
 
205
- if (queue.length > 0) {
206
- ingest();
207
- } else if (queue.length === 0 && finished) {
208
- queueEmitter.emit('finish');
209
- }
210
- })
211
- .catch(function (error) {
212
- console.error(error);
213
- ingesting -= 1;
214
- parallelCalls = 1;
215
- if (queue.length > 0) {
216
- ingest();
199
+ // Yield each complete JSON object
200
+ for (var line of lines) {
201
+ if (line.trim()) {
202
+ try {
203
+ if (!skipHeader || (skipHeader && !skippedHeader)) {
204
+ yield JSON.parse(line); // Parse and yield the JSON object
205
+ skippedHeader = true;
206
+ }
207
+ } catch (err) {
208
+ // Handle JSON parse errors if necessary
209
+ console.error('Failed to parse JSON:', err);
217
210
  }
218
- });
211
+ }
212
+ }
219
213
  }
220
- };
214
+
215
+ // Handle any remaining data in the buffer after the stream ends
216
+ if (buffer.trim()) {
217
+ try {
218
+ yield JSON.parse(buffer);
219
+ } catch (err) {
220
+ console.error('Failed to parse final JSON:', err);
221
+ }
222
+ }
223
+ }
224
+
225
+ var finished = false;
226
+
227
+ // Async IIFE to start bulk indexing
228
+ (async function () {
229
+ console.log('START BULK INDEXING');
230
+ await client.helpers.bulk({
231
+ concurrency: parallelCalls,
232
+ flushBytes: flushBytes,
233
+ flushInterval: 1000,
234
+ refreshOnCompletion: true,
235
+ datasource: ndjsonStreamIterator(stream),
236
+ onDocument: function onDocument(doc) {
237
+ return {
238
+ index: { _index: targetIndexName },
239
+ };
240
+ },
241
+ });
242
+ console.log('FINISHED BULK INDEXING');
243
+
244
+ queueEmitter.emit('finish');
245
+ })();
221
246
 
222
247
  return {
223
248
  add: function (doc) {
@@ -225,37 +250,22 @@ function indexQueueFactory(ref) {
225
250
  throw new Error('Unexpected doc added after indexer should finish.');
226
251
  }
227
252
 
228
- if (!skipHeader) {
229
- var header = { index: { _index: targetIndexName } };
230
- buffer.push(header);
231
- }
232
- buffer.push(doc);
233
-
234
- if (queue.length === 0) {
235
- queueEmitter.emit('resume');
236
- }
237
-
238
- if (buffer.length >= bufferSize * 2) {
239
- ingest(buffer);
240
- buffer = [];
253
+ var canContinue = stream.push(((JSON.stringify(doc)) + "\n"));
254
+ if (!canContinue) {
255
+ queueEmitter.emit('pause');
256
+ stream.once('drain', function () {
257
+ queueEmitter.emit('resume');
258
+ });
241
259
  }
242
260
  },
243
261
  finish: function () {
244
262
  finished = true;
245
-
246
- if (buffer.length > 0) {
247
- ingest(buffer);
248
- buffer = [];
249
- } else if (queue.length === 0 && ingesting === 0) {
250
- queueEmitter.emit('finish');
251
- }
263
+ stream.push(null);
252
264
  },
253
265
  queueEmitter: queueEmitter,
254
266
  };
255
267
  }
256
268
 
257
- var MAX_QUEUE_SIZE$1 = 15;
258
-
259
269
  // create a new progress bar instance and use shades_classic theme
260
270
  var progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
261
271
 
@@ -265,40 +275,69 @@ function indexReaderFactory(
265
275
  transform,
266
276
  client,
267
277
  query,
268
- bufferSize
278
+ searchSize,
279
+ populatedFields
269
280
  ) {
270
- if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
281
+ if ( searchSize === void 0 ) searchSize = DEFAULT_SEARCH_SIZE;
282
+ if ( populatedFields === void 0 ) populatedFields = false;
271
283
 
272
284
  return async function indexReader() {
273
- var responseQueue = [];
274
285
  var docsNum = 0;
286
+ var scrollId;
287
+ var finished = false;
288
+ var readActive = false;
289
+ var backPressurePause = false;
275
290
 
276
- function search() {
277
- return client.search({
278
- index: sourceIndexName,
279
- scroll: '30s',
280
- size: bufferSize,
281
- query: query,
282
- });
291
+ async function fetchPopulatedFields() {
292
+ try {
293
+ var response = await client.search({
294
+ index: sourceIndexName,
295
+ size: searchSize,
296
+ query: {
297
+ function_score: {
298
+ query: query,
299
+ random_score: {},
300
+ },
301
+ },
302
+ });
303
+
304
+ // Get all field names for each returned doc and flatten it
305
+ // to a list of unique field names used across all docs.
306
+ return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
307
+ } catch (e) {
308
+ console.log('error', e);
309
+ }
310
+ }
311
+
312
+ function search(fields) {
313
+ return client.search(Object.assign({}, {index: sourceIndexName,
314
+ scroll: '600s',
315
+ size: searchSize,
316
+ query: query},
317
+ (fields ? { _source: fields } : {})));
283
318
  }
284
319
 
285
320
  function scroll(id) {
286
321
  return client.scroll({
287
322
  scroll_id: id,
288
- scroll: '30s',
323
+ scroll: '600s',
289
324
  });
290
325
  }
291
326
 
292
- // start things off by searching, setting a scroll timeout, and pushing
293
- // our first response into the queue to be processed
294
- var se = await search();
295
- responseQueue.push(se);
296
- progressBar.start(se.hits.total.value, 0);
327
+ var fieldsWithData;
328
+
329
+ // identify populated fields
330
+ if (populatedFields) {
331
+ fieldsWithData = await fetchPopulatedFields();
332
+ }
333
+
334
+ await fetchNextResponse();
297
335
 
298
336
  function processHit(hit) {
299
337
  docsNum += 1;
300
338
  try {
301
339
  var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
340
+
302
341
  // if doc is undefined we'll skip indexing it
303
342
  if (typeof doc === 'undefined') {
304
343
  return;
@@ -317,68 +356,117 @@ function indexReaderFactory(
317
356
  }
318
357
  }
319
358
 
320
- var ingestQueueSize = 0;
321
- var scrollId = se._scroll_id; // eslint-disable-line no-underscore-dangle
322
- var readActive = false;
359
+ async function fetchNextResponse() {
360
+ readActive = true;
323
361
 
324
- async function processResponseQueue() {
325
- while (responseQueue.length) {
326
- readActive = true;
327
- var response = responseQueue.shift();
362
+ var sc = scrollId ? await scroll(scrollId) : await search(fieldsWithData);
328
363
 
329
- // collect the docs from this response
330
- response.hits.hits.forEach(processHit);
364
+ if (!scrollId) {
365
+ progressBar.start(sc.hits.total.value, 0);
366
+ }
331
367
 
332
- progressBar.update(docsNum);
368
+ scrollId = sc._scroll_id;
369
+ readActive = false;
333
370
 
334
- // check to see if we have collected all of the docs
335
- if (response.hits.total.value === docsNum) {
336
- indexer.finish();
337
- break;
338
- }
339
-
340
- if (ingestQueueSize < MAX_QUEUE_SIZE$1) {
341
- // get the next response if there are more docs to fetch
342
- var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
343
- scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
344
- responseQueue.push(sc);
345
- } else {
346
- readActive = false;
347
- }
348
- }
371
+ processResponse(sc);
349
372
  }
350
373
 
351
- indexer.queueEmitter.on('queue-size', async function (size) {
352
- ingestQueueSize = size;
374
+ async function processResponse(response) {
375
+ // collect the docs from this response
376
+ response.hits.hits.forEach(processHit);
353
377
 
354
- if (!readActive && ingestQueueSize < MAX_QUEUE_SIZE$1) {
355
- // get the next response if there are more docs to fetch
356
- var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
357
- scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
358
- responseQueue.push(sc);
359
- processResponseQueue();
378
+ progressBar.update(docsNum);
379
+
380
+ // check to see if we have collected all of the docs
381
+ if (response.hits.total.value === docsNum) {
382
+ indexer.finish();
383
+ return;
360
384
  }
385
+
386
+ if (!backPressurePause) {
387
+ await fetchNextResponse();
388
+ }
389
+ }
390
+
391
+ indexer.queueEmitter.on('pause', async function () {
392
+ backPressurePause = true;
361
393
  });
362
394
 
363
395
  indexer.queueEmitter.on('resume', async function () {
364
- ingestQueueSize = 0;
396
+ backPressurePause = false;
365
397
 
366
- if (readActive) {
398
+ if (readActive || finished) {
367
399
  return;
368
400
  }
369
401
 
370
- // get the next response if there are more docs to fetch
371
- var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
372
- scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
373
- responseQueue.push(sc);
374
- processResponseQueue();
402
+ await fetchNextResponse();
375
403
  });
376
404
 
377
405
  indexer.queueEmitter.on('finish', function () {
406
+ finished = true;
378
407
  progressBar.stop();
379
408
  });
409
+ };
410
+ }
380
411
 
381
- processResponseQueue();
412
+ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
413
+ function startIndex() {
414
+ console.log('START INDEX', splitRegex);
415
+ var finished = false;
416
+
417
+ var s = stream.pipe(split(splitRegex)).pipe(
418
+ es
419
+ .mapSync(function (line) {
420
+ try {
421
+ // skip empty lines
422
+ if (line === '') {
423
+ return;
424
+ }
425
+
426
+ var doc =
427
+ typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
428
+
429
+ // if doc is undefined we'll skip indexing it
430
+ if (typeof doc === 'undefined') {
431
+ s.resume();
432
+ return;
433
+ }
434
+
435
+ // the transform callback may return an array of docs so we can emit
436
+ // multiple docs from a single line
437
+ if (Array.isArray(doc)) {
438
+ doc.forEach(function (d) { return indexer.add(d); });
439
+ return;
440
+ }
441
+
442
+ indexer.add(doc);
443
+ } catch (e) {
444
+ console.log('error', e);
445
+ }
446
+ })
447
+ .on('error', function (err) {
448
+ console.log('Error while reading file.', err);
449
+ })
450
+ .on('end', function () {
451
+ if (verbose) { console.log('Read entire stream.'); }
452
+ indexer.finish();
453
+ finished = true;
454
+ })
455
+ );
456
+
457
+ indexer.queueEmitter.on('pause', function () {
458
+ if (finished) { return; }
459
+ s.pause();
460
+ });
461
+
462
+ indexer.queueEmitter.on('resume', function () {
463
+ if (finished) { return; }
464
+ s.resume();
465
+ });
466
+ }
467
+
468
+ return function () {
469
+ startIndex();
382
470
  };
383
471
  }
384
472
 
@@ -387,6 +475,8 @@ async function transformer(ref) {
387
475
  var sourceClientConfig = ref.sourceClientConfig;
388
476
  var targetClientConfig = ref.targetClientConfig;
389
477
  var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
478
+ var searchSize = ref.searchSize; if ( searchSize === void 0 ) searchSize = DEFAULT_SEARCH_SIZE;
479
+ var stream = ref.stream;
390
480
  var fileName = ref.fileName;
391
481
  var splitRegex = ref.splitRegex; if ( splitRegex === void 0 ) splitRegex = /\n/;
392
482
  var sourceIndexName = ref.sourceIndexName;
@@ -394,11 +484,13 @@ async function transformer(ref) {
394
484
  var mappings = ref.mappings;
395
485
  var mappingsOverride = ref.mappingsOverride; if ( mappingsOverride === void 0 ) mappingsOverride = false;
396
486
  var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
487
+ var populatedFields = ref.populatedFields; if ( populatedFields === void 0 ) populatedFields = false;
397
488
  var query = ref.query;
398
489
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
399
490
  var transform = ref.transform;
400
491
  var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
401
492
 
493
+ console.log('TRANSFORMER');
402
494
  if (typeof targetIndexName === 'undefined') {
403
495
  throw Error('targetIndexName must be specified.');
404
496
  }
@@ -421,6 +513,7 @@ async function transformer(ref) {
421
513
  mappingsOverride: mappingsOverride,
422
514
  indexMappingTotalFieldsLimit: indexMappingTotalFieldsLimit,
423
515
  verbose: verbose,
516
+ deleteIndex: deleteIndex,
424
517
  });
425
518
  var indexer = indexQueueFactory({
426
519
  targetClient: targetClient,
@@ -435,8 +528,12 @@ async function transformer(ref) {
435
528
  throw Error('Only either one of fileName or sourceIndexName can be specified.');
436
529
  }
437
530
 
438
- if (typeof fileName === 'undefined' && typeof sourceIndexName === 'undefined') {
439
- throw Error('Either fileName or sourceIndexName must be specified.');
531
+ if (
532
+ (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') ||
533
+ (typeof fileName !== 'undefined' && typeof stream !== 'undefined') ||
534
+ (typeof sourceIndexName !== 'undefined' && typeof stream !== 'undefined')
535
+ ) {
536
+ throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
440
537
  }
441
538
 
442
539
  if (typeof fileName !== 'undefined') {
@@ -450,17 +547,25 @@ async function transformer(ref) {
450
547
  transform,
451
548
  sourceClient,
452
549
  query,
453
- bufferSize
550
+ searchSize,
551
+ populatedFields
454
552
  );
455
553
  }
456
554
 
555
+ if (typeof stream !== 'undefined') {
556
+ console.log('STREAM READER');
557
+ return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
558
+ }
559
+
457
560
  return null;
458
561
  }
459
562
 
460
563
  var reader = getReader();
564
+ console.log('READER INITIALIZED');
461
565
 
462
566
  try {
463
567
  var indexExists = await targetClient.indices.exists({ index: targetIndexName });
568
+ console.log('INDEX EXISTS', indexExists);
464
569
 
465
570
  if (indexExists === false) {
466
571
  await createMapping();