node-es-transformer 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -79,11 +79,10 @@ If you need to ingest large CSV/JSON files (GigaBytes) into Elasticsearch withou
 
  | node-es-transformer | Elasticsearch Client | Elasticsearch Server | Node.js |
  | ----------------------- | -------------------- | -------------------- | ------- |
- | 1.0.0-beta8+ | 8.x and 9.x | 8.x and 9.x | 22+ |
- | 1.0.0-beta7 | 9.x only | 9.x only | 22+ |
- | 1.0.0-beta6 and earlier | 8.x | 8.x | 22+ |
+ | 1.0.0+ | 8.x and 9.x | 8.x and 9.x | 22+ |
+ | 1.0.0-beta7 and earlier | 8.x | 8.x | 18-20 |
 
- **Multi-Version Support**: Starting with v1.0.0-beta8, the library supports both Elasticsearch 8.x and 9.x through automatic version detection and client aliasing. This enables seamless reindexing between major versions (e.g., migrating from ES 8.x to 9.x). All functionality is tested in CI against multiple ES versions including cross-version reindexing scenarios.
+ **Multi-Version Support**: Starting with v1.0.0, the library supports both Elasticsearch 8.x and 9.x through automatic version detection and client aliasing. This enables seamless reindexing between major versions (e.g., migrating from ES 8.x to 9.x). All functionality is tested in CI against multiple ES versions including cross-version reindexing scenarios.
 
  **Upgrading?** See [MIGRATION.md](MIGRATION.md) for upgrade guidance from beta versions to v1.0.0.
 
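For illustration (not part of the package diff): based on the `transformer()` options visible in the bundled source added below, a cross-version reindex using the multi-version support described above might look like the following sketch. The node URLs and index names are placeholders.

```js
const transformer = require('node-es-transformer');

async function reindex() {
  const { events } = await transformer({
    // Plain config objects are passed through to the ES client; existing
    // client instances can be supplied instead via sourceClient/targetClient.
    sourceClientConfig: { node: 'http://localhost:9200' }, // e.g. an 8.x cluster
    targetClientConfig: { node: 'http://localhost:9201' }, // e.g. a 9.x cluster
    // Optional: skip auto-detection and force a client major version.
    sourceClientVersion: 8,
    targetClientVersion: 9,
    sourceIndexName: 'my-index',
    targetIndexName: 'my-index-reindexed'
  });

  // The returned emitter reports indexing progress (see indexQueueFactory below).
  events.on('docsPerSecond', rate => console.log(`${rate} docs/s`));
  events.on('finish', () => console.log('Reindex complete.'));
}

reindex().catch(console.error);
```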
package/node-es-transformer.cjs.js ADDED
@@ -0,0 +1,619 @@
+ 'use strict';
+
+ var elasticsearch9 = require('es9');
+ var elasticsearch8 = require('es8');
+ var fs = require('fs');
+ var es = require('event-stream');
+ var glob = require('glob');
+ var split = require('split2');
+ var stream = require('stream');
+ var cliProgress = require('cli-progress');
+
+ // In earlier versions this was used to set the number of docs to index in a
+ // single bulk request. Since we switched to use the helpers.bulk() method from
+ // the ES client, this now translates to the `flushBytes` option of the helper.
+ // However, to keep rough backwards compatibility with the old values, this uses
+ // KBytes instead of Bytes. It will be multiplied by 1024 in the index queue.
+ const DEFAULT_BUFFER_SIZE = 5120;
+
+ // The default number of docs to fetch in a single search request when reindexing.
+ const DEFAULT_SEARCH_SIZE = 1000;
+
+ function createMappingFactory({
+   sourceClient,
+   sourceIndexName,
+   targetClient,
+   targetIndexName,
+   mappings,
+   mappingsOverride,
+   indexMappingTotalFieldsLimit,
+   verbose,
+   deleteIndex,
+   pipeline
+ }) {
+   return async () => {
+     let targetMappings = mappingsOverride ? undefined : mappings;
+     if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
+       try {
+         const mapping = await sourceClient.indices.getMapping({
+           index: sourceIndexName
+         });
+         if (mapping[sourceIndexName]) {
+           targetMappings = mapping[sourceIndexName].mappings;
+         } else {
+           const allMappings = Object.values(mapping);
+           if (allMappings.length > 0) {
+             targetMappings = Object.values(mapping)[0].mappings;
+           }
+         }
+       } catch (err) {
+         console.log('Error reading source mapping', err);
+         return;
+       }
+     }
+     if (typeof targetMappings === 'object' && targetMappings !== null) {
+       if (mappingsOverride) {
+         targetMappings = {
+           ...targetMappings,
+           properties: {
+             ...targetMappings.properties,
+             ...mappings
+           }
+         };
+       }
+       try {
+         const indexExists = await targetClient.indices.exists({
+           index: targetIndexName
+         });
+         if (indexExists === true && deleteIndex === true) {
+           await targetClient.indices.delete({
+             index: targetIndexName
+           });
+         }
+         if (indexExists === false || deleteIndex === true) {
+           const resp = await targetClient.indices.create({
+             index: targetIndexName,
+             mappings: targetMappings,
+             ...(pipeline !== undefined ? {
+               settings: {
+                 index: {
+                   default_pipeline: pipeline
+                 }
+               }
+             } : {}),
+             ...(indexMappingTotalFieldsLimit !== undefined ? {
+               settings: {
+                 'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
+                 'index.number_of_shards': 1,
+                 'index.number_of_replicas': 0
+               }
+             } : {})
+           });
+           if (verbose) console.log('Created target mapping', resp);
+         }
+       } catch (err) {
+         console.log('Error creating target mapping', err);
+       }
+     }
+   };
+ }
+
+ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
+   function startIndex(files) {
+     let finished = false;
+     const file = files.shift();
+     const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
+       try {
+         // skip empty lines
+         if (line === '') {
+           return;
+         }
+         const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
+
+         // if doc is undefined we'll skip indexing it
+         if (typeof doc === 'undefined') {
+           s.resume();
+           return;
+         }
+
+         // the transform callback may return an array of docs so we can emit
+         // multiple docs from a single line
+         if (Array.isArray(doc)) {
+           doc.forEach(d => indexer.add(d));
+           return;
+         }
+         indexer.add(doc);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }).on('error', err => {
+       console.log('Error while reading file.', err);
+     }).on('end', () => {
+       if (verbose) console.log('Read entire file: ', file);
+       if (files.length > 0) {
+         startIndex(files);
+         return;
+       }
+       indexer.finish();
+       finished = true;
+     }));
+     indexer.queueEmitter.on('pause', () => {
+       if (finished) return;
+       s.pause();
+     });
+     indexer.queueEmitter.on('resume', () => {
+       if (finished) return;
+       s.resume();
+     });
+   }
+   return () => {
+     try {
+       const files = glob.globSync(fileName);
+       startIndex(files);
+     } catch (error) {
+       console.log('Error matching files:', error);
+     }
+   };
+ }
+
+ const EventEmitter = require('events');
+ const queueEmitter = new EventEmitter();
+ const parallelCalls = 5;
+
+ // a simple helper queue to bulk index documents
+ function indexQueueFactory({
+   targetClient: client,
+   targetIndexName,
+   bufferSize = DEFAULT_BUFFER_SIZE,
+   skipHeader = false
+ }) {
+   let docsPerSecond = 0;
+   const flushBytes = bufferSize * 1024; // Convert KB to Bytes
+   const highWaterMark = flushBytes * parallelCalls;
+
+   // Create a Readable stream
+   const stream$1 = new stream.Readable({
+     read() {},
+     // Implement read but we manage pushing manually
+     highWaterMark // Buffer size for backpressure management
+   });
+   async function* ndjsonStreamIterator(readableStream) {
+     let buffer = ''; // To hold the incomplete data
+     let skippedHeader = false;
+     try {
+       // Iterate over the stream using async iteration
+       for await (const chunk of readableStream) {
+         buffer += chunk.toString(); // Accumulate the chunk data in the buffer
+
+         // Split the buffer into lines (NDJSON items)
+         const lines = buffer.split('\n');
+
+         // The last line might be incomplete, so hold it back in the buffer
+         buffer = lines.pop();
+
+         // Yield each complete JSON object
+         for (const line of lines) {
+           if (line.trim()) {
+             try {
+               if (!skipHeader || skipHeader && !skippedHeader) {
+                 yield JSON.parse(line); // Parse and yield the JSON object
+                 skippedHeader = true;
+               }
+             } catch (err) {
+               // Handle JSON parse errors if necessary
+               console.error('Failed to parse JSON:', err);
+             }
+           }
+         }
+       }
+
+       // Handle any remaining data in the buffer after the stream ends
+       if (buffer.trim()) {
+         try {
+           yield JSON.parse(buffer);
+         } catch (err) {
+           console.error('Failed to parse final JSON:', err);
+         }
+       }
+     } finally {
+       // Ensure the stream is properly cleaned up if the iterator is terminated early
+       if (!readableStream.destroyed) {
+         readableStream.destroy();
+       }
+     }
+   }
+   let finished = false;
+   let drainListener = null;
+
+   // Async IIFE to start bulk indexing
+   (async () => {
+     const interval = setInterval(() => {
+       queueEmitter.emit('docsPerSecond', docsPerSecond);
+       docsPerSecond = 0;
+     }, 1000);
+     try {
+       await client.helpers.bulk({
+         concurrency: parallelCalls,
+         flushBytes,
+         flushInterval: 1000,
+         refreshOnCompletion: true,
+         datasource: ndjsonStreamIterator(stream$1),
+         onDocument(doc) {
+           docsPerSecond++;
+           return {
+             index: {
+               _index: targetIndexName
+             }
+           };
+         }
+       });
+     } catch (error) {
+       console.error('Error during bulk indexing:', error);
+       throw error;
+     } finally {
+       // Clean up interval
+       clearInterval(interval);
+
+       // Remove drain listener if it exists
+       if (drainListener) {
+         stream$1.removeListener('drain', drainListener);
+         drainListener = null;
+       }
+
+       // Remove all listeners from stream
+       stream$1.removeAllListeners();
+
+       // Properly destroy the stream to prevent open handles
+       if (!stream$1.destroyed) {
+         stream$1.destroy();
+       }
+
+       // Emit finish and clean up queue emitter listeners
+       queueEmitter.emit('finish');
+       queueEmitter.removeAllListeners();
+     }
+   })();
+   return {
+     add: doc => {
+       if (finished) {
+         throw new Error('Unexpected doc added after indexer should finish.');
+       }
+       const canContinue = stream$1.push(`${JSON.stringify(doc)}\n`);
+       if (!canContinue) {
+         queueEmitter.emit('pause');
+
+         // Store the listener so we can clean it up later
+         drainListener = () => {
+           queueEmitter.emit('resume');
+         };
+         stream$1.once('drain', drainListener);
+       }
+     },
+     finish: () => {
+       finished = true;
+       stream$1.push(null);
+     },
+     queueEmitter
+   };
+ }
+
+ // create a new progress bar instance and use shades_classic theme
+ const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
+ function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+   return async function indexReader() {
+     let docsNum = 0;
+     let scrollId;
+     let finished = false;
+     let readActive = false;
+     let backPressurePause = false;
+     async function fetchPopulatedFields() {
+       try {
+         // Get all populated fields from the index
+         const response = await client.fieldCaps({
+           index: sourceIndexName,
+           fields: '*',
+           include_empty_fields: false,
+           filters: '-metadata'
+         }, {
+           maxRetries: 0
+         });
+         return Object.keys(response.fields);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }
+     function search(fields) {
+       return client.search({
+         index: sourceIndexName,
+         scroll: '600s',
+         size: searchSize,
+         query,
+         ...(fields ? {
+           _source: fields
+         } : {})
+       });
+     }
+     function scroll(id) {
+       return client.scroll({
+         scroll_id: id,
+         scroll: '600s'
+       });
+     }
+     let fieldsWithData;
+
+     // identify populated fields
+     if (populatedFields) {
+       fieldsWithData = await fetchPopulatedFields();
+     }
+     await fetchNextResponse();
+     function processHit(hit) {
+       docsNum += 1;
+       try {
+         const doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
+
+         // if doc is undefined we'll skip indexing it
+         if (typeof doc === 'undefined') {
+           return;
+         }
+
+         // the transform callback may return an array of docs so we can emit
+         // multiple docs from a single line
+         if (Array.isArray(doc)) {
+           doc.forEach(d => indexer.add(d));
+           return;
+         }
+         indexer.add(doc);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }
+     async function fetchNextResponse() {
+       readActive = true;
+       const sc = scrollId ? await scroll(scrollId) : await search(fieldsWithData);
+       if (!scrollId) {
+         progressBar.start(sc.hits.total.value, 0);
+       }
+       scrollId = sc._scroll_id;
+       readActive = false;
+       processResponse(sc);
+     }
+     async function processResponse(response) {
+       // collect the docs from this response
+       response.hits.hits.forEach(processHit);
+       progressBar.update(docsNum);
+
+       // check to see if we have collected all of the docs
+       if (response.hits.total.value === docsNum) {
+         indexer.finish();
+         return;
+       }
+       if (!backPressurePause) {
+         await fetchNextResponse();
+       }
+     }
+     indexer.queueEmitter.on('pause', async () => {
+       backPressurePause = true;
+     });
+     indexer.queueEmitter.on('resume', async () => {
+       backPressurePause = false;
+       if (readActive || finished) {
+         return;
+       }
+       await fetchNextResponse();
+     });
+     indexer.queueEmitter.on('finish', () => {
+       finished = true;
+       progressBar.stop();
+     });
+   };
+ }
+
+ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
+   function startIndex() {
+     let finished = false;
+     const s = stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+       try {
+         // skip empty lines
+         if (line === '') {
+           return;
+         }
+         const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
+
+         // if doc is undefined we'll skip indexing it
+         if (typeof doc === 'undefined') {
+           s.resume();
+           return;
+         }
+
+         // the transform callback may return an array of docs so we can emit
+         // multiple docs from a single line
+         if (Array.isArray(doc)) {
+           doc.forEach(d => indexer.add(d));
+           return;
+         }
+         indexer.add(doc);
+       } catch (e) {
+         console.log('error', e);
+       }
+     }).on('error', err => {
+       console.log('Error while reading stream.', err);
+     }).on('end', () => {
+       if (verbose) console.log('Read entire stream.');
+       indexer.finish();
+       finished = true;
+     }));
+     indexer.queueEmitter.on('pause', () => {
+       if (finished) return;
+       s.pause();
+     });
+     indexer.queueEmitter.on('resume', () => {
+       if (finished) return;
+       s.resume();
+     });
+   }
+   return () => {
+     startIndex();
+   };
+ }
+
+ /**
+  * Detect Elasticsearch version by querying the cluster
+  */
+ async function detectElasticsearchVersion(config) {
+   try {
+     // Try with v9 client first (most common for new setups)
+     const testClient = new elasticsearch9.Client(config);
+     const info = await testClient.info();
+     const version = info.version?.number;
+     await testClient.close();
+     if (version) {
+       const majorVersion = parseInt(version.split('.')[0], 10);
+       return majorVersion;
+     }
+   } catch (e) {
+     // If v9 client fails, try v8 client
+     try {
+       const testClient = new elasticsearch8.Client(config);
+       const info = await testClient.info();
+       const version = info.version?.number;
+       await testClient.close();
+       if (version) {
+         const majorVersion = parseInt(version.split('.')[0], 10);
+         return majorVersion;
+       }
+     } catch (e2) {
+       // Could not detect version
+     }
+   }
+
+   // Default to v9 if detection fails
+   return 9;
+ }
+
+ /**
+  * Create or validate an Elasticsearch client
+  * @param {Object|Client} clientOrConfig - Either a client instance or config object
+  * @param {Object} defaultConfig - Default configuration to use if creating a new client
+  * @param {number} [forceVersion] - Force a specific ES client version (8 or 9)
+  */
+ async function getOrCreateClient(clientOrConfig, defaultConfig, forceVersion) {
+   // If already a client instance, return it
+   if (clientOrConfig && typeof clientOrConfig.info === 'function') {
+     return clientOrConfig;
+   }
+   const config = clientOrConfig || defaultConfig;
+
+   // If version is forced, use the specified client
+   if (forceVersion === 8) {
+     return new elasticsearch8.Client(config);
+   } else if (forceVersion === 9) {
+     return new elasticsearch9.Client(config);
+   }
+
+   // Auto-detect version
+   const majorVersion = await detectElasticsearchVersion(config);
+   if (majorVersion >= 9) {
+     return new elasticsearch9.Client(config);
+   } else {
+     return new elasticsearch8.Client(config);
+   }
+ }
+ async function transformer({
+   deleteIndex = false,
+   sourceClient: sourceClientInput,
+   targetClient: targetClientInput,
+   sourceClientConfig,
+   targetClientConfig,
+   sourceClientVersion,
+   targetClientVersion,
+   bufferSize = DEFAULT_BUFFER_SIZE,
+   searchSize = DEFAULT_SEARCH_SIZE,
+   stream,
+   fileName,
+   splitRegex = /\n/,
+   sourceIndexName,
+   targetIndexName,
+   mappings,
+   mappingsOverride = false,
+   indexMappingTotalFieldsLimit,
+   pipeline,
+   populatedFields = false,
+   query,
+   skipHeader = false,
+   transform,
+   verbose = true
+ }) {
+   if (typeof targetIndexName === 'undefined') {
+     throw Error('targetIndexName must be specified.');
+   }
+   const defaultClientConfig = {
+     node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
+   };
+
+   // Support both old (config) and new (client instance) patterns
+   const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
+   const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
+   const createMapping = createMappingFactory({
+     sourceClient,
+     sourceIndexName,
+     targetClient,
+     targetIndexName,
+     mappings,
+     mappingsOverride,
+     indexMappingTotalFieldsLimit,
+     verbose,
+     deleteIndex,
+     pipeline
+   });
+   const indexer = indexQueueFactory({
+     targetClient,
+     targetIndexName,
+     bufferSize,
+     skipHeader});
+   function getReader() {
+     if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
+       throw Error('Only either one of fileName or sourceIndexName can be specified.');
+     }
+     if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined' || typeof fileName !== 'undefined' && typeof stream !== 'undefined' || typeof sourceIndexName !== 'undefined' && typeof stream !== 'undefined') {
+       throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
+     }
+     if (typeof fileName !== 'undefined') {
+       return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose);
+     }
+     if (typeof sourceIndexName !== 'undefined') {
+       return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
+     }
+     if (typeof stream !== 'undefined') {
+       return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
+     }
+     return null;
+   }
+   const reader = getReader();
+   try {
+     const indexExists = await targetClient.indices.exists({
+       index: targetIndexName
+     });
+     if (indexExists === false) {
+       await createMapping();
+       reader();
+     } else if (deleteIndex === true) {
+       await targetClient.indices.delete({
+         index: targetIndexName
+       });
+       await createMapping();
+       reader();
+     } else {
+       reader();
+     }
+   } catch (error) {
+     console.error('Error checking index existence:', error);
+   } finally {
+     // targetClient.close();
+   }
+   return {
+     events: indexer.queueEmitter
+   };
+ }
+
+ module.exports = transformer;
+ //# sourceMappingURL=node-es-transformer.cjs.js.map
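
For illustration (not part of the package diff): the `bufferSize` option above is interpreted in KBytes and multiplied by 1024 before being handed to `helpers.bulk()` as `flushBytes`. A file-ingestion sketch using the file reader path might look like this; the glob pattern, index name, and transform logic are placeholders.

```js
const transformer = require('node-es-transformer');

transformer({
  fileName: 'data/*.ndjson', // glob pattern, resolved via glob.globSync()
  targetIndexName: 'ingested-docs',
  bufferSize: 1024, // KBytes; becomes flushBytes = 1024 * 1024 bytes
  // The transform callback may return undefined to skip a line, a single
  // doc, or an array of docs to emit several documents per input line.
  transform(doc) {
    if (!doc.timestamp) return undefined;
    return { ...doc, ingestedAt: new Date().toISOString() };
  }
}).then(({ events }) => {
  events.on('finish', () => console.log('Ingest complete.'));
});
```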