node-es-transformer 1.0.0-beta2 → 1.0.0-beta3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -3
- package/dist/node-es-transformer.cjs.js +233 -172
- package/dist/node-es-transformer.esm.js +233 -172
- package/package.json +6 -5
package/README.md
CHANGED
|
@@ -110,7 +110,8 @@ transformer({
|
|
|
110
110
|
|
|
111
111
|
- `deleteIndex`: Setting to automatically delete an existing index, default is `false`.
|
|
112
112
|
- `sourceClientConfig`/`targetClientConfig`: Optional Elasticsearch client options, defaults to `{ node: 'http://localhost:9200' }`.
|
|
113
|
-
- `bufferSize`: The
|
|
113
|
+
- `bufferSize`: The size threshold, in KBytes, at which a bulk index request is flushed; defaults to `5120`.
|
|
114
|
+
- `searchSize`: The number of documents to fetch with each search request when reindexing from another source index, defaults to `1000`.
|
|
114
115
|
- `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` is not allowed.
|
|
115
116
|
- `splitRegex`: Custom line split regex, defaults to `/\n/`.
|
|
116
117
|
- `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` is not allowed.
|
|
@@ -147,10 +148,10 @@ yarn
|
|
|
147
148
|
|
|
148
149
|
```bash
|
|
149
150
|
# Download the docker image
|
|
150
|
-
docker pull docker.elastic.co/elasticsearch/elasticsearch:8.
|
|
151
|
+
docker pull docker.elastic.co/elasticsearch/elasticsearch:8.15.0
|
|
151
152
|
|
|
152
153
|
# Run the container
|
|
153
|
-
docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.
|
|
154
|
+
docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.15.0
|
|
154
155
|
```
|
|
155
156
|
|
|
156
157
|
To commit, use `cz`. To prepare a release, use e.g. `yarn release -- --release-as 1.0.0-beta2`.
|
|
@@ -5,10 +5,20 @@ function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'defau
|
|
|
5
5
|
var fs = _interopDefault(require('fs'));
|
|
6
6
|
var es = _interopDefault(require('event-stream'));
|
|
7
7
|
var glob = _interopDefault(require('glob'));
|
|
8
|
+
var split = _interopDefault(require('split2'));
|
|
9
|
+
var stream = require('stream');
|
|
8
10
|
var cliProgress = _interopDefault(require('cli-progress'));
|
|
9
11
|
var elasticsearch = _interopDefault(require('@elastic/elasticsearch'));
|
|
10
12
|
|
|
11
|
-
|
|
13
|
+
// In earlier versions this was used to set the number of docs to index in a
|
|
14
|
+
// single bulk request. Since we switched to use the helpers.bulk() method from
|
|
15
|
+
// the ES client, this now translates to the `flushBytes` option of the helper.
|
|
16
|
+
// However, for a kind of backwards compatibility with the old values, this uses
|
|
17
|
+
// KBytes instead of Bytes. It will be multiplied by 1024 in the index queue.
|
|
18
|
+
var DEFAULT_BUFFER_SIZE = 5120;
|
|
19
|
+
|
|
20
|
+
// The default number of docs to fetch in a single search request when reindexing.
|
|
21
|
+
var DEFAULT_SEARCH_SIZE = 1000;
|
|
12
22
|
|
|
13
23
|
function createMappingFactory(ref) {
|
|
14
24
|
var sourceClient = ref.sourceClient;
|
|
@@ -19,6 +29,7 @@ function createMappingFactory(ref) {
|
|
|
19
29
|
var mappingsOverride = ref.mappingsOverride;
|
|
20
30
|
var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
|
|
21
31
|
var verbose = ref.verbose;
|
|
32
|
+
var deleteIndex = ref.deleteIndex;
|
|
22
33
|
|
|
23
34
|
return async function () {
|
|
24
35
|
var targetMappings = mappingsOverride ? undefined : mappings;
|
|
@@ -28,7 +39,14 @@ function createMappingFactory(ref) {
|
|
|
28
39
|
var mapping = await sourceClient.indices.getMapping({
|
|
29
40
|
index: sourceIndexName,
|
|
30
41
|
});
|
|
31
|
-
|
|
42
|
+
if (mapping[sourceIndexName]) {
|
|
43
|
+
targetMappings = mapping[sourceIndexName].mappings;
|
|
44
|
+
} else {
|
|
45
|
+
var allMappings = Object.values(mapping);
|
|
46
|
+
if (allMappings.length > 0) {
|
|
47
|
+
targetMappings = Object.values(mapping)[0].mappings;
|
|
48
|
+
}
|
|
49
|
+
}
|
|
32
50
|
} catch (err) {
|
|
33
51
|
console.log('Error reading source mapping', err);
|
|
34
52
|
return;
|
|
@@ -43,18 +61,28 @@ function createMappingFactory(ref) {
|
|
|
43
61
|
}
|
|
44
62
|
|
|
45
63
|
try {
|
|
46
|
-
var
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
64
|
+
var indexExists = await targetClient.indices.exists({ index: targetIndexName });
|
|
65
|
+
|
|
66
|
+
if (indexExists === true && deleteIndex === true) {
|
|
67
|
+
await targetClient.indices.delete({ index: targetIndexName });
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
if (indexExists === false || deleteIndex === true) {
|
|
71
|
+
var resp = await targetClient.indices.create({
|
|
72
|
+
index: targetIndexName,
|
|
73
|
+
body: Object.assign({}, {mappings: targetMappings},
|
|
74
|
+
(indexMappingTotalFieldsLimit !== undefined
|
|
75
|
+
? {
|
|
76
|
+
settings: {
|
|
77
|
+
'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
|
|
78
|
+
'index.number_of_shards': 1,
|
|
79
|
+
'index.number_of_replicas': 0,
|
|
80
|
+
},
|
|
81
|
+
}
|
|
82
|
+
: {})),
|
|
83
|
+
});
|
|
84
|
+
if (verbose) { console.log('Created target mapping', resp); }
|
|
85
|
+
}
|
|
58
86
|
} catch (err) {
|
|
59
87
|
console.log('Error creating target mapping', err);
|
|
60
88
|
}
|
|
@@ -62,17 +90,14 @@ function createMappingFactory(ref) {
|
|
|
62
90
|
};
|
|
63
91
|
}
|
|
64
92
|
|
|
65
|
-
var MAX_QUEUE_SIZE = 15;
|
|
66
|
-
|
|
67
93
|
function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
|
|
68
94
|
function startIndex(files) {
|
|
69
|
-
var ingestQueueSize = 0;
|
|
70
95
|
var finished = false;
|
|
71
96
|
|
|
72
97
|
var file = files.shift();
|
|
73
98
|
var s = fs
|
|
74
99
|
.createReadStream(file)
|
|
75
|
-
.pipe(
|
|
100
|
+
.pipe(split(splitRegex))
|
|
76
101
|
.pipe(
|
|
77
102
|
es
|
|
78
103
|
.mapSync(function (line) {
|
|
@@ -120,20 +145,13 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
|
|
|
120
145
|
})
|
|
121
146
|
);
|
|
122
147
|
|
|
123
|
-
indexer.queueEmitter.on('
|
|
148
|
+
indexer.queueEmitter.on('pause', function () {
|
|
124
149
|
if (finished) { return; }
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
if (ingestQueueSize < MAX_QUEUE_SIZE) {
|
|
128
|
-
s.resume();
|
|
129
|
-
} else {
|
|
130
|
-
s.pause();
|
|
131
|
-
}
|
|
150
|
+
s.pause();
|
|
132
151
|
});
|
|
133
152
|
|
|
134
153
|
indexer.queueEmitter.on('resume', function () {
|
|
135
154
|
if (finished) { return; }
|
|
136
|
-
ingestQueueSize = 0;
|
|
137
155
|
s.resume();
|
|
138
156
|
});
|
|
139
157
|
}
|
|
@@ -149,7 +167,7 @@ var EventEmitter = require('events');
|
|
|
149
167
|
|
|
150
168
|
var queueEmitter = new EventEmitter();
|
|
151
169
|
|
|
152
|
-
var parallelCalls =
|
|
170
|
+
var parallelCalls = 5;
|
|
153
171
|
|
|
154
172
|
// a simple helper queue to bulk index documents
|
|
155
173
|
function indexQueueFactory(ref) {
|
|
@@ -159,78 +177,76 @@ function indexQueueFactory(ref) {
|
|
|
159
177
|
var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
|
|
160
178
|
var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
|
|
161
179
|
|
|
162
|
-
var
|
|
163
|
-
var
|
|
164
|
-
var ingesting = 0;
|
|
165
|
-
var ingestTimes = [];
|
|
166
|
-
var finished = false;
|
|
180
|
+
var flushBytes = bufferSize * 1024; // Convert KB to Bytes
|
|
181
|
+
var highWaterMark = flushBytes * parallelCalls;
|
|
167
182
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
183
|
+
// Create a Readable stream
|
|
184
|
+
var stream$$1 = new stream.Readable({
|
|
185
|
+
read: function read() {}, // Implement read but we manage pushing manually
|
|
186
|
+
highWaterMark: highWaterMark, // Buffer size for backpressure management
|
|
187
|
+
});
|
|
173
188
|
|
|
174
|
-
|
|
189
|
+
async function* ndjsonStreamIterator(readableStream) {
|
|
190
|
+
var buffer = ''; // To hold the incomplete data
|
|
191
|
+
var skippedHeader = false;
|
|
175
192
|
|
|
176
|
-
|
|
177
|
-
|
|
193
|
+
// Iterate over the stream using async iteration
|
|
194
|
+
for await (var chunk of readableStream) {
|
|
195
|
+
buffer += chunk.toString(); // Accumulate the chunk data in the buffer
|
|
178
196
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
queueEmitter.emit('resume');
|
|
182
|
-
}
|
|
197
|
+
// Split the buffer into lines (NDJSON items)
|
|
198
|
+
var lines = buffer.split('\n');
|
|
183
199
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
if (verbose)
|
|
187
|
-
{ console.log(("bulk ingest docs: " + (docs.length / 2) + ", queue length: " + (queue.length))); }
|
|
188
|
-
|
|
189
|
-
var start = Date.now();
|
|
190
|
-
client
|
|
191
|
-
.bulk({ body: docs })
|
|
192
|
-
.then(function () {
|
|
193
|
-
var end = Date.now();
|
|
194
|
-
var delta = end - start;
|
|
195
|
-
ingestTimes.push(delta);
|
|
196
|
-
ingesting -= 1;
|
|
197
|
-
|
|
198
|
-
var ingestTimesMovingAverage =
|
|
199
|
-
ingestTimes.length > 0
|
|
200
|
-
? ingestTimes.reduce(function (p, c) { return p + c; }, 0) / ingestTimes.length
|
|
201
|
-
: 0;
|
|
202
|
-
var ingestTimesMovingAverageSeconds = Math.floor(ingestTimesMovingAverage / 1000);
|
|
203
|
-
|
|
204
|
-
if (
|
|
205
|
-
ingestTimes.length > 0 &&
|
|
206
|
-
ingestTimesMovingAverageSeconds < 30 &&
|
|
207
|
-
parallelCalls < 10
|
|
208
|
-
) {
|
|
209
|
-
parallelCalls += 1;
|
|
210
|
-
} else if (
|
|
211
|
-
ingestTimes.length > 0 &&
|
|
212
|
-
ingestTimesMovingAverageSeconds >= 30 &&
|
|
213
|
-
parallelCalls > 1
|
|
214
|
-
) {
|
|
215
|
-
parallelCalls -= 1;
|
|
216
|
-
}
|
|
200
|
+
// The last line might be incomplete, so hold it back in the buffer
|
|
201
|
+
buffer = lines.pop();
|
|
217
202
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
ingest();
|
|
203
|
+
// Yield each complete JSON object
|
|
204
|
+
for (var line of lines) {
|
|
205
|
+
if (line.trim()) {
|
|
206
|
+
try {
|
|
207
|
+
if (!skipHeader || (skipHeader && !skippedHeader)) {
|
|
208
|
+
yield JSON.parse(line); // Parse and yield the JSON object
|
|
209
|
+
skippedHeader = true;
|
|
210
|
+
}
|
|
211
|
+
} catch (err) {
|
|
212
|
+
// Handle JSON parse errors if necessary
|
|
213
|
+
console.error('Failed to parse JSON:', err);
|
|
230
214
|
}
|
|
231
|
-
}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
232
217
|
}
|
|
233
|
-
|
|
218
|
+
|
|
219
|
+
// Handle any remaining data in the buffer after the stream ends
|
|
220
|
+
if (buffer.trim()) {
|
|
221
|
+
try {
|
|
222
|
+
yield JSON.parse(buffer);
|
|
223
|
+
} catch (err) {
|
|
224
|
+
console.error('Failed to parse final JSON:', err);
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
var finished = false;
|
|
230
|
+
|
|
231
|
+
// Async IIFE to start bulk indexing
|
|
232
|
+
(async function () {
|
|
233
|
+
console.log('START BULK INDEXING');
|
|
234
|
+
await client.helpers.bulk({
|
|
235
|
+
concurrency: parallelCalls,
|
|
236
|
+
flushBytes: flushBytes,
|
|
237
|
+
flushInterval: 1000,
|
|
238
|
+
refreshOnCompletion: true,
|
|
239
|
+
datasource: ndjsonStreamIterator(stream$$1),
|
|
240
|
+
onDocument: function onDocument(doc) {
|
|
241
|
+
return {
|
|
242
|
+
index: { _index: targetIndexName },
|
|
243
|
+
};
|
|
244
|
+
},
|
|
245
|
+
});
|
|
246
|
+
console.log('FINISHED BULK INDEXING');
|
|
247
|
+
|
|
248
|
+
queueEmitter.emit('finish');
|
|
249
|
+
})();
|
|
234
250
|
|
|
235
251
|
return {
|
|
236
252
|
add: function (doc) {
|
|
@@ -238,37 +254,22 @@ function indexQueueFactory(ref) {
|
|
|
238
254
|
throw new Error('Unexpected doc added after indexer should finish.');
|
|
239
255
|
}
|
|
240
256
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
if (queue.length === 0) {
|
|
248
|
-
queueEmitter.emit('resume');
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
if (buffer.length >= bufferSize * 2) {
|
|
252
|
-
ingest(buffer);
|
|
253
|
-
buffer = [];
|
|
257
|
+
var canContinue = stream$$1.push(((JSON.stringify(doc)) + "\n"));
|
|
258
|
+
if (!canContinue) {
|
|
259
|
+
queueEmitter.emit('pause');
|
|
260
|
+
stream$$1.once('drain', function () {
|
|
261
|
+
queueEmitter.emit('resume');
|
|
262
|
+
});
|
|
254
263
|
}
|
|
255
264
|
},
|
|
256
265
|
finish: function () {
|
|
257
266
|
finished = true;
|
|
258
|
-
|
|
259
|
-
if (buffer.length > 0) {
|
|
260
|
-
ingest(buffer);
|
|
261
|
-
buffer = [];
|
|
262
|
-
} else if (queue.length === 0 && ingesting === 0) {
|
|
263
|
-
queueEmitter.emit('finish');
|
|
264
|
-
}
|
|
267
|
+
stream$$1.push(null);
|
|
265
268
|
},
|
|
266
269
|
queueEmitter: queueEmitter,
|
|
267
270
|
};
|
|
268
271
|
}
|
|
269
272
|
|
|
270
|
-
var MAX_QUEUE_SIZE$1 = 15;
|
|
271
|
-
|
|
272
273
|
// create a new progress bar instance and use shades_classic theme
|
|
273
274
|
var progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
|
|
274
275
|
|
|
@@ -278,21 +279,24 @@ function indexReaderFactory(
|
|
|
278
279
|
transform,
|
|
279
280
|
client,
|
|
280
281
|
query,
|
|
281
|
-
|
|
282
|
+
searchSize,
|
|
282
283
|
populatedFields
|
|
283
284
|
) {
|
|
284
|
-
if (
|
|
285
|
+
if ( searchSize === void 0 ) searchSize = DEFAULT_SEARCH_SIZE;
|
|
285
286
|
if ( populatedFields === void 0 ) populatedFields = false;
|
|
286
287
|
|
|
287
288
|
return async function indexReader() {
|
|
288
|
-
var responseQueue = [];
|
|
289
289
|
var docsNum = 0;
|
|
290
|
+
var scrollId;
|
|
291
|
+
var finished = false;
|
|
292
|
+
var readActive = false;
|
|
293
|
+
var backPressurePause = false;
|
|
290
294
|
|
|
291
295
|
async function fetchPopulatedFields() {
|
|
292
296
|
try {
|
|
293
297
|
var response = await client.search({
|
|
294
298
|
index: sourceIndexName,
|
|
295
|
-
size:
|
|
299
|
+
size: searchSize,
|
|
296
300
|
query: {
|
|
297
301
|
function_score: {
|
|
298
302
|
query: query,
|
|
@@ -303,7 +307,7 @@ function indexReaderFactory(
|
|
|
303
307
|
|
|
304
308
|
// Get all field names for each returned doc and flatten it
|
|
305
309
|
// to a list of unique field names used across all docs.
|
|
306
|
-
return new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1));
|
|
310
|
+
return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
|
|
307
311
|
} catch (e) {
|
|
308
312
|
console.log('error', e);
|
|
309
313
|
}
|
|
@@ -312,7 +316,7 @@ function indexReaderFactory(
|
|
|
312
316
|
function search(fields) {
|
|
313
317
|
return client.search(Object.assign({}, {index: sourceIndexName,
|
|
314
318
|
scroll: '600s',
|
|
315
|
-
size:
|
|
319
|
+
size: searchSize,
|
|
316
320
|
query: query},
|
|
317
321
|
(fields ? { _source: fields } : {})));
|
|
318
322
|
}
|
|
@@ -329,21 +333,14 @@ function indexReaderFactory(
|
|
|
329
333
|
// identify populated fields
|
|
330
334
|
if (populatedFields) {
|
|
331
335
|
fieldsWithData = await fetchPopulatedFields();
|
|
332
|
-
console.log('fieldsWithData', fieldsWithData);
|
|
333
336
|
}
|
|
334
337
|
|
|
335
|
-
|
|
336
|
-
// our first response into the queue to be processed
|
|
337
|
-
var se = await search(fieldsWithData);
|
|
338
|
-
responseQueue.push(se);
|
|
339
|
-
progressBar.start(se.hits.total.value, 0);
|
|
340
|
-
console.log('se', se.hits.hits[0]);
|
|
338
|
+
await fetchNextResponse();
|
|
341
339
|
|
|
342
340
|
function processHit(hit) {
|
|
343
341
|
docsNum += 1;
|
|
344
342
|
try {
|
|
345
343
|
var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
|
|
346
|
-
// console.log('doc', doc);
|
|
347
344
|
|
|
348
345
|
// if doc is undefined we'll skip indexing it
|
|
349
346
|
if (typeof doc === 'undefined') {
|
|
@@ -363,68 +360,117 @@ function indexReaderFactory(
|
|
|
363
360
|
}
|
|
364
361
|
}
|
|
365
362
|
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
var readActive = false;
|
|
369
|
-
|
|
370
|
-
async function processResponseQueue() {
|
|
371
|
-
while (responseQueue.length) {
|
|
372
|
-
readActive = true;
|
|
373
|
-
var response = responseQueue.shift();
|
|
363
|
+
async function fetchNextResponse() {
|
|
364
|
+
readActive = true;
|
|
374
365
|
|
|
375
|
-
|
|
376
|
-
response.hits.hits.forEach(processHit);
|
|
366
|
+
var sc = scrollId ? await scroll(scrollId) : await search(fieldsWithData);
|
|
377
367
|
|
|
378
|
-
|
|
368
|
+
if (!scrollId) {
|
|
369
|
+
progressBar.start(sc.hits.total.value, 0);
|
|
370
|
+
}
|
|
379
371
|
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
indexer.finish();
|
|
383
|
-
break;
|
|
384
|
-
}
|
|
372
|
+
scrollId = sc._scroll_id;
|
|
373
|
+
readActive = false;
|
|
385
374
|
|
|
386
|
-
|
|
387
|
-
// get the next response if there are more docs to fetch
|
|
388
|
-
var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
|
|
389
|
-
scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
|
|
390
|
-
responseQueue.push(sc);
|
|
391
|
-
} else {
|
|
392
|
-
readActive = false;
|
|
393
|
-
}
|
|
394
|
-
}
|
|
375
|
+
processResponse(sc);
|
|
395
376
|
}
|
|
396
377
|
|
|
397
|
-
|
|
398
|
-
|
|
378
|
+
async function processResponse(response) {
|
|
379
|
+
// collect the docs from this response
|
|
380
|
+
response.hits.hits.forEach(processHit);
|
|
381
|
+
|
|
382
|
+
progressBar.update(docsNum);
|
|
383
|
+
|
|
384
|
+
// check to see if we have collected all of the docs
|
|
385
|
+
if (response.hits.total.value === docsNum) {
|
|
386
|
+
indexer.finish();
|
|
387
|
+
return;
|
|
388
|
+
}
|
|
399
389
|
|
|
400
|
-
if (!
|
|
401
|
-
|
|
402
|
-
var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
|
|
403
|
-
scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
|
|
404
|
-
responseQueue.push(sc);
|
|
405
|
-
processResponseQueue();
|
|
390
|
+
if (!backPressurePause) {
|
|
391
|
+
await fetchNextResponse();
|
|
406
392
|
}
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
indexer.queueEmitter.on('pause', async function () {
|
|
396
|
+
backPressurePause = true;
|
|
407
397
|
});
|
|
408
398
|
|
|
409
399
|
indexer.queueEmitter.on('resume', async function () {
|
|
410
|
-
|
|
400
|
+
backPressurePause = false;
|
|
411
401
|
|
|
412
|
-
if (readActive) {
|
|
402
|
+
if (readActive || finished) {
|
|
413
403
|
return;
|
|
414
404
|
}
|
|
415
405
|
|
|
416
|
-
|
|
417
|
-
var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
|
|
418
|
-
scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
|
|
419
|
-
responseQueue.push(sc);
|
|
420
|
-
processResponseQueue();
|
|
406
|
+
await fetchNextResponse();
|
|
421
407
|
});
|
|
422
408
|
|
|
423
409
|
indexer.queueEmitter.on('finish', function () {
|
|
410
|
+
finished = true;
|
|
424
411
|
progressBar.stop();
|
|
425
412
|
});
|
|
413
|
+
};
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
function streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose) {
|
|
417
|
+
function startIndex() {
|
|
418
|
+
console.log('START INDEX', splitRegex);
|
|
419
|
+
var finished = false;
|
|
420
|
+
|
|
421
|
+
var s = stream$$1.pipe(split(splitRegex)).pipe(
|
|
422
|
+
es
|
|
423
|
+
.mapSync(function (line) {
|
|
424
|
+
try {
|
|
425
|
+
// skip empty lines
|
|
426
|
+
if (line === '') {
|
|
427
|
+
return;
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
var doc =
|
|
431
|
+
typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
|
|
432
|
+
|
|
433
|
+
// if doc is undefined we'll skip indexing it
|
|
434
|
+
if (typeof doc === 'undefined') {
|
|
435
|
+
s.resume();
|
|
436
|
+
return;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
// the transform callback may return an array of docs so we can emit
|
|
440
|
+
// multiple docs from a single line
|
|
441
|
+
if (Array.isArray(doc)) {
|
|
442
|
+
doc.forEach(function (d) { return indexer.add(d); });
|
|
443
|
+
return;
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
indexer.add(doc);
|
|
447
|
+
} catch (e) {
|
|
448
|
+
console.log('error', e);
|
|
449
|
+
}
|
|
450
|
+
})
|
|
451
|
+
.on('error', function (err) {
|
|
452
|
+
console.log('Error while reading file.', err);
|
|
453
|
+
})
|
|
454
|
+
.on('end', function () {
|
|
455
|
+
if (verbose) { console.log('Read entire stream.'); }
|
|
456
|
+
indexer.finish();
|
|
457
|
+
finished = true;
|
|
458
|
+
})
|
|
459
|
+
);
|
|
460
|
+
|
|
461
|
+
indexer.queueEmitter.on('pause', function () {
|
|
462
|
+
if (finished) { return; }
|
|
463
|
+
s.pause();
|
|
464
|
+
});
|
|
426
465
|
|
|
427
|
-
|
|
466
|
+
indexer.queueEmitter.on('resume', function () {
|
|
467
|
+
if (finished) { return; }
|
|
468
|
+
s.resume();
|
|
469
|
+
});
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
return function () {
|
|
473
|
+
startIndex();
|
|
428
474
|
};
|
|
429
475
|
}
|
|
430
476
|
|
|
@@ -433,6 +479,8 @@ async function transformer(ref) {
|
|
|
433
479
|
var sourceClientConfig = ref.sourceClientConfig;
|
|
434
480
|
var targetClientConfig = ref.targetClientConfig;
|
|
435
481
|
var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
|
|
482
|
+
var searchSize = ref.searchSize; if ( searchSize === void 0 ) searchSize = DEFAULT_SEARCH_SIZE;
|
|
483
|
+
var stream$$1 = ref.stream;
|
|
436
484
|
var fileName = ref.fileName;
|
|
437
485
|
var splitRegex = ref.splitRegex; if ( splitRegex === void 0 ) splitRegex = /\n/;
|
|
438
486
|
var sourceIndexName = ref.sourceIndexName;
|
|
@@ -446,6 +494,7 @@ async function transformer(ref) {
|
|
|
446
494
|
var transform = ref.transform;
|
|
447
495
|
var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
|
|
448
496
|
|
|
497
|
+
console.log('TRANSFORMER');
|
|
449
498
|
if (typeof targetIndexName === 'undefined') {
|
|
450
499
|
throw Error('targetIndexName must be specified.');
|
|
451
500
|
}
|
|
@@ -468,6 +517,7 @@ async function transformer(ref) {
|
|
|
468
517
|
mappingsOverride: mappingsOverride,
|
|
469
518
|
indexMappingTotalFieldsLimit: indexMappingTotalFieldsLimit,
|
|
470
519
|
verbose: verbose,
|
|
520
|
+
deleteIndex: deleteIndex,
|
|
471
521
|
});
|
|
472
522
|
var indexer = indexQueueFactory({
|
|
473
523
|
targetClient: targetClient,
|
|
@@ -482,8 +532,12 @@ async function transformer(ref) {
|
|
|
482
532
|
throw Error('Only either one of fileName or sourceIndexName can be specified.');
|
|
483
533
|
}
|
|
484
534
|
|
|
485
|
-
if (
|
|
486
|
-
|
|
535
|
+
if (
|
|
536
|
+
(typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') ||
|
|
537
|
+
(typeof fileName !== 'undefined' && typeof stream$$1 !== 'undefined') ||
|
|
538
|
+
(typeof sourceIndexName !== 'undefined' && typeof stream$$1 !== 'undefined')
|
|
539
|
+
) {
|
|
540
|
+
throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
|
|
487
541
|
}
|
|
488
542
|
|
|
489
543
|
if (typeof fileName !== 'undefined') {
|
|
@@ -497,18 +551,25 @@ async function transformer(ref) {
|
|
|
497
551
|
transform,
|
|
498
552
|
sourceClient,
|
|
499
553
|
query,
|
|
500
|
-
|
|
554
|
+
searchSize,
|
|
501
555
|
populatedFields
|
|
502
556
|
);
|
|
503
557
|
}
|
|
504
558
|
|
|
559
|
+
if (typeof stream$$1 !== 'undefined') {
|
|
560
|
+
console.log('STREAM READER');
|
|
561
|
+
return streamReaderFactory(indexer, stream$$1, transform, splitRegex, verbose);
|
|
562
|
+
}
|
|
563
|
+
|
|
505
564
|
return null;
|
|
506
565
|
}
|
|
507
566
|
|
|
508
567
|
var reader = getReader();
|
|
568
|
+
console.log('READER INITIALIZED');
|
|
509
569
|
|
|
510
570
|
try {
|
|
511
571
|
var indexExists = await targetClient.indices.exists({ index: targetIndexName });
|
|
572
|
+
console.log('INDEX EXISTS', indexExists);
|
|
512
573
|
|
|
513
574
|
if (indexExists === false) {
|
|
514
575
|
await createMapping();
|
|
@@ -1,10 +1,20 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
2
|
import es from 'event-stream';
|
|
3
3
|
import glob from 'glob';
|
|
4
|
+
import split from 'split2';
|
|
5
|
+
import { Readable } from 'stream';
|
|
4
6
|
import cliProgress from 'cli-progress';
|
|
5
7
|
import elasticsearch from '@elastic/elasticsearch';
|
|
6
8
|
|
|
7
|
-
|
|
9
|
+
// In earlier versions this was used to set the number of docs to index in a
|
|
10
|
+
// single bulk request. Since we switched to use the helpers.bulk() method from
|
|
11
|
+
// the ES client, this now translates to the `flushBytes` option of the helper.
|
|
12
|
+
// However, for a kind of backwards compatibility with the old values, this uses
|
|
13
|
+
// KBytes instead of Bytes. It will be multiplied by 1024 in the index queue.
|
|
14
|
+
var DEFAULT_BUFFER_SIZE = 5120;
|
|
15
|
+
|
|
16
|
+
// The default number of docs to fetch in a single search request when reindexing.
|
|
17
|
+
var DEFAULT_SEARCH_SIZE = 1000;
|
|
8
18
|
|
|
9
19
|
function createMappingFactory(ref) {
|
|
10
20
|
var sourceClient = ref.sourceClient;
|
|
@@ -15,6 +25,7 @@ function createMappingFactory(ref) {
|
|
|
15
25
|
var mappingsOverride = ref.mappingsOverride;
|
|
16
26
|
var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
|
|
17
27
|
var verbose = ref.verbose;
|
|
28
|
+
var deleteIndex = ref.deleteIndex;
|
|
18
29
|
|
|
19
30
|
return async function () {
|
|
20
31
|
var targetMappings = mappingsOverride ? undefined : mappings;
|
|
@@ -24,7 +35,14 @@ function createMappingFactory(ref) {
|
|
|
24
35
|
var mapping = await sourceClient.indices.getMapping({
|
|
25
36
|
index: sourceIndexName,
|
|
26
37
|
});
|
|
27
|
-
|
|
38
|
+
if (mapping[sourceIndexName]) {
|
|
39
|
+
targetMappings = mapping[sourceIndexName].mappings;
|
|
40
|
+
} else {
|
|
41
|
+
var allMappings = Object.values(mapping);
|
|
42
|
+
if (allMappings.length > 0) {
|
|
43
|
+
targetMappings = Object.values(mapping)[0].mappings;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
28
46
|
} catch (err) {
|
|
29
47
|
console.log('Error reading source mapping', err);
|
|
30
48
|
return;
|
|
@@ -39,18 +57,28 @@ function createMappingFactory(ref) {
|
|
|
39
57
|
}
|
|
40
58
|
|
|
41
59
|
try {
|
|
42
|
-
var
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
60
|
+
var indexExists = await targetClient.indices.exists({ index: targetIndexName });
|
|
61
|
+
|
|
62
|
+
if (indexExists === true && deleteIndex === true) {
|
|
63
|
+
await targetClient.indices.delete({ index: targetIndexName });
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (indexExists === false || deleteIndex === true) {
|
|
67
|
+
var resp = await targetClient.indices.create({
|
|
68
|
+
index: targetIndexName,
|
|
69
|
+
body: Object.assign({}, {mappings: targetMappings},
|
|
70
|
+
(indexMappingTotalFieldsLimit !== undefined
|
|
71
|
+
? {
|
|
72
|
+
settings: {
|
|
73
|
+
'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
|
|
74
|
+
'index.number_of_shards': 1,
|
|
75
|
+
'index.number_of_replicas': 0,
|
|
76
|
+
},
|
|
77
|
+
}
|
|
78
|
+
: {})),
|
|
79
|
+
});
|
|
80
|
+
if (verbose) { console.log('Created target mapping', resp); }
|
|
81
|
+
}
|
|
54
82
|
} catch (err) {
|
|
55
83
|
console.log('Error creating target mapping', err);
|
|
56
84
|
}
|
|
@@ -58,17 +86,14 @@ function createMappingFactory(ref) {
|
|
|
58
86
|
};
|
|
59
87
|
}
|
|
60
88
|
|
|
61
|
-
var MAX_QUEUE_SIZE = 15;
|
|
62
|
-
|
|
63
89
|
function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
|
|
64
90
|
function startIndex(files) {
|
|
65
|
-
var ingestQueueSize = 0;
|
|
66
91
|
var finished = false;
|
|
67
92
|
|
|
68
93
|
var file = files.shift();
|
|
69
94
|
var s = fs
|
|
70
95
|
.createReadStream(file)
|
|
71
|
-
.pipe(
|
|
96
|
+
.pipe(split(splitRegex))
|
|
72
97
|
.pipe(
|
|
73
98
|
es
|
|
74
99
|
.mapSync(function (line) {
|
|
@@ -116,20 +141,13 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
|
|
|
116
141
|
})
|
|
117
142
|
);
|
|
118
143
|
|
|
119
|
-
indexer.queueEmitter.on('
|
|
144
|
+
indexer.queueEmitter.on('pause', function () {
|
|
120
145
|
if (finished) { return; }
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
if (ingestQueueSize < MAX_QUEUE_SIZE) {
|
|
124
|
-
s.resume();
|
|
125
|
-
} else {
|
|
126
|
-
s.pause();
|
|
127
|
-
}
|
|
146
|
+
s.pause();
|
|
128
147
|
});
|
|
129
148
|
|
|
130
149
|
indexer.queueEmitter.on('resume', function () {
|
|
131
150
|
if (finished) { return; }
|
|
132
|
-
ingestQueueSize = 0;
|
|
133
151
|
s.resume();
|
|
134
152
|
});
|
|
135
153
|
}
|
|
@@ -145,7 +163,7 @@ var EventEmitter = require('events');
|
|
|
145
163
|
|
|
146
164
|
var queueEmitter = new EventEmitter();
|
|
147
165
|
|
|
148
|
-
var parallelCalls =
|
|
166
|
+
var parallelCalls = 5;
|
|
149
167
|
|
|
150
168
|
// a simple helper queue to bulk index documents
|
|
151
169
|
function indexQueueFactory(ref) {
|
|
@@ -155,78 +173,76 @@ function indexQueueFactory(ref) {
|
|
|
155
173
|
var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
|
|
156
174
|
var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
|
|
157
175
|
|
|
158
|
-
var
|
|
159
|
-
var
|
|
160
|
-
var ingesting = 0;
|
|
161
|
-
var ingestTimes = [];
|
|
162
|
-
var finished = false;
|
|
176
|
+
var flushBytes = bufferSize * 1024; // Convert KB to Bytes
|
|
177
|
+
var highWaterMark = flushBytes * parallelCalls;
|
|
163
178
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
179
|
+
// Create a Readable stream
|
|
180
|
+
var stream = new Readable({
|
|
181
|
+
read: function read() {}, // Implement read but we manage pushing manually
|
|
182
|
+
highWaterMark: highWaterMark, // Buffer size for backpressure management
|
|
183
|
+
});
|
|
169
184
|
|
|
170
|
-
|
|
185
|
+
async function* ndjsonStreamIterator(readableStream) {
|
|
186
|
+
var buffer = ''; // To hold the incomplete data
|
|
187
|
+
var skippedHeader = false;
|
|
171
188
|
|
|
172
|
-
|
|
173
|
-
|
|
189
|
+
// Iterate over the stream using async iteration
|
|
190
|
+
for await (var chunk of readableStream) {
|
|
191
|
+
buffer += chunk.toString(); // Accumulate the chunk data in the buffer
|
|
174
192
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
queueEmitter.emit('resume');
|
|
178
|
-
}
|
|
193
|
+
// Split the buffer into lines (NDJSON items)
|
|
194
|
+
var lines = buffer.split('\n');
|
|
179
195
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
if (verbose)
|
|
183
|
-
{ console.log(("bulk ingest docs: " + (docs.length / 2) + ", queue length: " + (queue.length))); }
|
|
184
|
-
|
|
185
|
-
var start = Date.now();
|
|
186
|
-
client
|
|
187
|
-
.bulk({ body: docs })
|
|
188
|
-
.then(function () {
|
|
189
|
-
var end = Date.now();
|
|
190
|
-
var delta = end - start;
|
|
191
|
-
ingestTimes.push(delta);
|
|
192
|
-
ingesting -= 1;
|
|
193
|
-
|
|
194
|
-
var ingestTimesMovingAverage =
|
|
195
|
-
ingestTimes.length > 0
|
|
196
|
-
? ingestTimes.reduce(function (p, c) { return p + c; }, 0) / ingestTimes.length
|
|
197
|
-
: 0;
|
|
198
|
-
var ingestTimesMovingAverageSeconds = Math.floor(ingestTimesMovingAverage / 1000);
|
|
199
|
-
|
|
200
|
-
if (
|
|
201
|
-
ingestTimes.length > 0 &&
|
|
202
|
-
ingestTimesMovingAverageSeconds < 30 &&
|
|
203
|
-
parallelCalls < 10
|
|
204
|
-
) {
|
|
205
|
-
parallelCalls += 1;
|
|
206
|
-
} else if (
|
|
207
|
-
ingestTimes.length > 0 &&
|
|
208
|
-
ingestTimesMovingAverageSeconds >= 30 &&
|
|
209
|
-
parallelCalls > 1
|
|
210
|
-
) {
|
|
211
|
-
parallelCalls -= 1;
|
|
212
|
-
}
|
|
196
|
+
// The last line might be incomplete, so hold it back in the buffer
|
|
197
|
+
buffer = lines.pop();
|
|
213
198
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
ingest();
|
|
199
|
+
// Yield each complete JSON object
|
|
200
|
+
for (var line of lines) {
|
|
201
|
+
if (line.trim()) {
|
|
202
|
+
try {
|
|
203
|
+
if (!skipHeader || (skipHeader && !skippedHeader)) {
|
|
204
|
+
yield JSON.parse(line); // Parse and yield the JSON object
|
|
205
|
+
skippedHeader = true;
|
|
206
|
+
}
|
|
207
|
+
} catch (err) {
|
|
208
|
+
// Handle JSON parse errors if necessary
|
|
209
|
+
console.error('Failed to parse JSON:', err);
|
|
226
210
|
}
|
|
227
|
-
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
228
213
|
}
|
|
229
|
-
|
|
214
|
+
|
|
215
|
+
// Handle any remaining data in the buffer after the stream ends
|
|
216
|
+
if (buffer.trim()) {
|
|
217
|
+
try {
|
|
218
|
+
yield JSON.parse(buffer);
|
|
219
|
+
} catch (err) {
|
|
220
|
+
console.error('Failed to parse final JSON:', err);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
var finished = false;
|
|
226
|
+
|
|
227
|
+
// Async IIFE to start bulk indexing
|
|
228
|
+
(async function () {
|
|
229
|
+
console.log('START BULK INDEXING');
|
|
230
|
+
await client.helpers.bulk({
|
|
231
|
+
concurrency: parallelCalls,
|
|
232
|
+
flushBytes: flushBytes,
|
|
233
|
+
flushInterval: 1000,
|
|
234
|
+
refreshOnCompletion: true,
|
|
235
|
+
datasource: ndjsonStreamIterator(stream),
|
|
236
|
+
onDocument: function onDocument(doc) {
|
|
237
|
+
return {
|
|
238
|
+
index: { _index: targetIndexName },
|
|
239
|
+
};
|
|
240
|
+
},
|
|
241
|
+
});
|
|
242
|
+
console.log('FINISHED BULK INDEXING');
|
|
243
|
+
|
|
244
|
+
queueEmitter.emit('finish');
|
|
245
|
+
})();
|
|
230
246
|
|
|
231
247
|
return {
|
|
232
248
|
add: function (doc) {
|
|
@@ -234,37 +250,22 @@ function indexQueueFactory(ref) {
|
|
|
234
250
|
throw new Error('Unexpected doc added after indexer should finish.');
|
|
235
251
|
}
|
|
236
252
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
if (queue.length === 0) {
|
|
244
|
-
queueEmitter.emit('resume');
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
if (buffer.length >= bufferSize * 2) {
|
|
248
|
-
ingest(buffer);
|
|
249
|
-
buffer = [];
|
|
253
|
+
var canContinue = stream.push(((JSON.stringify(doc)) + "\n"));
|
|
254
|
+
if (!canContinue) {
|
|
255
|
+
queueEmitter.emit('pause');
|
|
256
|
+
stream.once('drain', function () {
|
|
257
|
+
queueEmitter.emit('resume');
|
|
258
|
+
});
|
|
250
259
|
}
|
|
251
260
|
},
|
|
252
261
|
finish: function () {
|
|
253
262
|
finished = true;
|
|
254
|
-
|
|
255
|
-
if (buffer.length > 0) {
|
|
256
|
-
ingest(buffer);
|
|
257
|
-
buffer = [];
|
|
258
|
-
} else if (queue.length === 0 && ingesting === 0) {
|
|
259
|
-
queueEmitter.emit('finish');
|
|
260
|
-
}
|
|
263
|
+
stream.push(null);
|
|
261
264
|
},
|
|
262
265
|
queueEmitter: queueEmitter,
|
|
263
266
|
};
|
|
264
267
|
}
|
|
265
268
|
|
|
266
|
-
var MAX_QUEUE_SIZE$1 = 15;
|
|
267
|
-
|
|
268
269
|
// create a new progress bar instance and use shades_classic theme
|
|
269
270
|
var progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
|
|
270
271
|
|
|
@@ -274,21 +275,24 @@ function indexReaderFactory(
|
|
|
274
275
|
transform,
|
|
275
276
|
client,
|
|
276
277
|
query,
|
|
277
|
-
|
|
278
|
+
searchSize,
|
|
278
279
|
populatedFields
|
|
279
280
|
) {
|
|
280
|
-
if (
|
|
281
|
+
if ( searchSize === void 0 ) searchSize = DEFAULT_SEARCH_SIZE;
|
|
281
282
|
if ( populatedFields === void 0 ) populatedFields = false;
|
|
282
283
|
|
|
283
284
|
return async function indexReader() {
|
|
284
|
-
var responseQueue = [];
|
|
285
285
|
var docsNum = 0;
|
|
286
|
+
var scrollId;
|
|
287
|
+
var finished = false;
|
|
288
|
+
var readActive = false;
|
|
289
|
+
var backPressurePause = false;
|
|
286
290
|
|
|
287
291
|
async function fetchPopulatedFields() {
|
|
288
292
|
try {
|
|
289
293
|
var response = await client.search({
|
|
290
294
|
index: sourceIndexName,
|
|
291
|
-
size:
|
|
295
|
+
size: searchSize,
|
|
292
296
|
query: {
|
|
293
297
|
function_score: {
|
|
294
298
|
query: query,
|
|
@@ -299,7 +303,7 @@ function indexReaderFactory(
|
|
|
299
303
|
|
|
300
304
|
// Get all field names for each returned doc and flatten it
|
|
301
305
|
// to a list of unique field names used across all docs.
|
|
302
|
-
return new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1));
|
|
306
|
+
return Array.from(new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1)));
|
|
303
307
|
} catch (e) {
|
|
304
308
|
console.log('error', e);
|
|
305
309
|
}
|
|
@@ -308,7 +312,7 @@ function indexReaderFactory(
|
|
|
308
312
|
function search(fields) {
|
|
309
313
|
return client.search(Object.assign({}, {index: sourceIndexName,
|
|
310
314
|
scroll: '600s',
|
|
311
|
-
size:
|
|
315
|
+
size: searchSize,
|
|
312
316
|
query: query},
|
|
313
317
|
(fields ? { _source: fields } : {})));
|
|
314
318
|
}
|
|
@@ -325,21 +329,14 @@ function indexReaderFactory(
|
|
|
325
329
|
// identify populated fields
|
|
326
330
|
if (populatedFields) {
|
|
327
331
|
fieldsWithData = await fetchPopulatedFields();
|
|
328
|
-
console.log('fieldsWithData', fieldsWithData);
|
|
329
332
|
}
|
|
330
333
|
|
|
331
|
-
|
|
332
|
-
// our first response into the queue to be processed
|
|
333
|
-
var se = await search(fieldsWithData);
|
|
334
|
-
responseQueue.push(se);
|
|
335
|
-
progressBar.start(se.hits.total.value, 0);
|
|
336
|
-
console.log('se', se.hits.hits[0]);
|
|
334
|
+
await fetchNextResponse();
|
|
337
335
|
|
|
338
336
|
function processHit(hit) {
|
|
339
337
|
docsNum += 1;
|
|
340
338
|
try {
|
|
341
339
|
var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
|
|
342
|
-
// console.log('doc', doc);
|
|
343
340
|
|
|
344
341
|
// if doc is undefined we'll skip indexing it
|
|
345
342
|
if (typeof doc === 'undefined') {
|
|
@@ -359,68 +356,117 @@ function indexReaderFactory(
|
|
|
359
356
|
}
|
|
360
357
|
}
|
|
361
358
|
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
var readActive = false;
|
|
365
|
-
|
|
366
|
-
async function processResponseQueue() {
|
|
367
|
-
while (responseQueue.length) {
|
|
368
|
-
readActive = true;
|
|
369
|
-
var response = responseQueue.shift();
|
|
359
|
+
async function fetchNextResponse() {
|
|
360
|
+
readActive = true;
|
|
370
361
|
|
|
371
|
-
|
|
372
|
-
response.hits.hits.forEach(processHit);
|
|
362
|
+
var sc = scrollId ? await scroll(scrollId) : await search(fieldsWithData);
|
|
373
363
|
|
|
374
|
-
|
|
364
|
+
if (!scrollId) {
|
|
365
|
+
progressBar.start(sc.hits.total.value, 0);
|
|
366
|
+
}
|
|
375
367
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
indexer.finish();
|
|
379
|
-
break;
|
|
380
|
-
}
|
|
368
|
+
scrollId = sc._scroll_id;
|
|
369
|
+
readActive = false;
|
|
381
370
|
|
|
382
|
-
|
|
383
|
-
// get the next response if there are more docs to fetch
|
|
384
|
-
var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
|
|
385
|
-
scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
|
|
386
|
-
responseQueue.push(sc);
|
|
387
|
-
} else {
|
|
388
|
-
readActive = false;
|
|
389
|
-
}
|
|
390
|
-
}
|
|
371
|
+
processResponse(sc);
|
|
391
372
|
}
|
|
392
373
|
|
|
393
|
-
|
|
394
|
-
|
|
374
|
+
async function processResponse(response) {
|
|
375
|
+
// collect the docs from this response
|
|
376
|
+
response.hits.hits.forEach(processHit);
|
|
377
|
+
|
|
378
|
+
progressBar.update(docsNum);
|
|
379
|
+
|
|
380
|
+
// check to see if we have collected all of the docs
|
|
381
|
+
if (response.hits.total.value === docsNum) {
|
|
382
|
+
indexer.finish();
|
|
383
|
+
return;
|
|
384
|
+
}
|
|
395
385
|
|
|
396
|
-
if (!
|
|
397
|
-
|
|
398
|
-
var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
|
|
399
|
-
scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
|
|
400
|
-
responseQueue.push(sc);
|
|
401
|
-
processResponseQueue();
|
|
386
|
+
if (!backPressurePause) {
|
|
387
|
+
await fetchNextResponse();
|
|
402
388
|
}
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
indexer.queueEmitter.on('pause', async function () {
|
|
392
|
+
backPressurePause = true;
|
|
403
393
|
});
|
|
404
394
|
|
|
405
395
|
indexer.queueEmitter.on('resume', async function () {
|
|
406
|
-
|
|
396
|
+
backPressurePause = false;
|
|
407
397
|
|
|
408
|
-
if (readActive) {
|
|
398
|
+
if (readActive || finished) {
|
|
409
399
|
return;
|
|
410
400
|
}
|
|
411
401
|
|
|
412
|
-
|
|
413
|
-
var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
|
|
414
|
-
scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
|
|
415
|
-
responseQueue.push(sc);
|
|
416
|
-
processResponseQueue();
|
|
402
|
+
await fetchNextResponse();
|
|
417
403
|
});
|
|
418
404
|
|
|
419
405
|
indexer.queueEmitter.on('finish', function () {
|
|
406
|
+
finished = true;
|
|
420
407
|
progressBar.stop();
|
|
421
408
|
});
|
|
409
|
+
};
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
function streamReaderFactory(indexer, stream, transform, splitRegex, verbose) {
|
|
413
|
+
function startIndex() {
|
|
414
|
+
console.log('START INDEX', splitRegex);
|
|
415
|
+
var finished = false;
|
|
416
|
+
|
|
417
|
+
var s = stream.pipe(split(splitRegex)).pipe(
|
|
418
|
+
es
|
|
419
|
+
.mapSync(function (line) {
|
|
420
|
+
try {
|
|
421
|
+
// skip empty lines
|
|
422
|
+
if (line === '') {
|
|
423
|
+
return;
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
var doc =
|
|
427
|
+
typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
|
|
428
|
+
|
|
429
|
+
// if doc is undefined we'll skip indexing it
|
|
430
|
+
if (typeof doc === 'undefined') {
|
|
431
|
+
s.resume();
|
|
432
|
+
return;
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
// the transform callback may return an array of docs so we can emit
|
|
436
|
+
// multiple docs from a single line
|
|
437
|
+
if (Array.isArray(doc)) {
|
|
438
|
+
doc.forEach(function (d) { return indexer.add(d); });
|
|
439
|
+
return;
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
indexer.add(doc);
|
|
443
|
+
} catch (e) {
|
|
444
|
+
console.log('error', e);
|
|
445
|
+
}
|
|
446
|
+
})
|
|
447
|
+
.on('error', function (err) {
|
|
448
|
+
console.log('Error while reading file.', err);
|
|
449
|
+
})
|
|
450
|
+
.on('end', function () {
|
|
451
|
+
if (verbose) { console.log('Read entire stream.'); }
|
|
452
|
+
indexer.finish();
|
|
453
|
+
finished = true;
|
|
454
|
+
})
|
|
455
|
+
);
|
|
456
|
+
|
|
457
|
+
indexer.queueEmitter.on('pause', function () {
|
|
458
|
+
if (finished) { return; }
|
|
459
|
+
s.pause();
|
|
460
|
+
});
|
|
422
461
|
|
|
423
|
-
|
|
462
|
+
indexer.queueEmitter.on('resume', function () {
|
|
463
|
+
if (finished) { return; }
|
|
464
|
+
s.resume();
|
|
465
|
+
});
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
return function () {
|
|
469
|
+
startIndex();
|
|
424
470
|
};
|
|
425
471
|
}
|
|
426
472
|
|
|
@@ -429,6 +475,8 @@ async function transformer(ref) {
|
|
|
429
475
|
var sourceClientConfig = ref.sourceClientConfig;
|
|
430
476
|
var targetClientConfig = ref.targetClientConfig;
|
|
431
477
|
var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
|
|
478
|
+
var searchSize = ref.searchSize; if ( searchSize === void 0 ) searchSize = DEFAULT_SEARCH_SIZE;
|
|
479
|
+
var stream = ref.stream;
|
|
432
480
|
var fileName = ref.fileName;
|
|
433
481
|
var splitRegex = ref.splitRegex; if ( splitRegex === void 0 ) splitRegex = /\n/;
|
|
434
482
|
var sourceIndexName = ref.sourceIndexName;
|
|
@@ -442,6 +490,7 @@ async function transformer(ref) {
|
|
|
442
490
|
var transform = ref.transform;
|
|
443
491
|
var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
|
|
444
492
|
|
|
493
|
+
console.log('TRANSFORMER');
|
|
445
494
|
if (typeof targetIndexName === 'undefined') {
|
|
446
495
|
throw Error('targetIndexName must be specified.');
|
|
447
496
|
}
|
|
@@ -464,6 +513,7 @@ async function transformer(ref) {
|
|
|
464
513
|
mappingsOverride: mappingsOverride,
|
|
465
514
|
indexMappingTotalFieldsLimit: indexMappingTotalFieldsLimit,
|
|
466
515
|
verbose: verbose,
|
|
516
|
+
deleteIndex: deleteIndex,
|
|
467
517
|
});
|
|
468
518
|
var indexer = indexQueueFactory({
|
|
469
519
|
targetClient: targetClient,
|
|
@@ -478,8 +528,12 @@ async function transformer(ref) {
|
|
|
478
528
|
throw Error('Only either one of fileName or sourceIndexName can be specified.');
|
|
479
529
|
}
|
|
480
530
|
|
|
481
|
-
if (
|
|
482
|
-
|
|
531
|
+
if (
|
|
532
|
+
(typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') ||
|
|
533
|
+
(typeof fileName !== 'undefined' && typeof stream !== 'undefined') ||
|
|
534
|
+
(typeof sourceIndexName !== 'undefined' && typeof stream !== 'undefined')
|
|
535
|
+
) {
|
|
536
|
+
throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
|
|
483
537
|
}
|
|
484
538
|
|
|
485
539
|
if (typeof fileName !== 'undefined') {
|
|
@@ -493,18 +547,25 @@ async function transformer(ref) {
|
|
|
493
547
|
transform,
|
|
494
548
|
sourceClient,
|
|
495
549
|
query,
|
|
496
|
-
|
|
550
|
+
searchSize,
|
|
497
551
|
populatedFields
|
|
498
552
|
);
|
|
499
553
|
}
|
|
500
554
|
|
|
555
|
+
if (typeof stream !== 'undefined') {
|
|
556
|
+
console.log('STREAM READER');
|
|
557
|
+
return streamReaderFactory(indexer, stream, transform, splitRegex, verbose);
|
|
558
|
+
}
|
|
559
|
+
|
|
501
560
|
return null;
|
|
502
561
|
}
|
|
503
562
|
|
|
504
563
|
var reader = getReader();
|
|
564
|
+
console.log('READER INITIALIZED');
|
|
505
565
|
|
|
506
566
|
try {
|
|
507
567
|
var indexExists = await targetClient.indices.exists({ index: targetIndexName });
|
|
568
|
+
console.log('INDEX EXISTS', indexExists);
|
|
508
569
|
|
|
509
570
|
if (indexExists === false) {
|
|
510
571
|
await createMapping();
|
package/package.json
CHANGED
|
@@ -14,20 +14,21 @@
|
|
|
14
14
|
"license": "Apache-2.0",
|
|
15
15
|
"author": "Walter Rafelsberger <walter@rafelsberger.at>",
|
|
16
16
|
"contributors": [],
|
|
17
|
-
"version": "1.0.0-
|
|
17
|
+
"version": "1.0.0-beta3",
|
|
18
18
|
"main": "dist/node-es-transformer.cjs.js",
|
|
19
19
|
"module": "dist/node-es-transformer.esm.js",
|
|
20
20
|
"dependencies": {
|
|
21
|
-
"@elastic/elasticsearch": "^8.
|
|
21
|
+
"@elastic/elasticsearch": "^8.15.0",
|
|
22
22
|
"cli-progress": "^3.12.0",
|
|
23
23
|
"event-stream": "3.3.4",
|
|
24
|
-
"
|
|
24
|
+
"git-cz": "^4.9.0",
|
|
25
|
+
"glob": "7.1.2",
|
|
26
|
+
"split2": "^4.2.0"
|
|
25
27
|
},
|
|
26
28
|
"devDependencies": {
|
|
27
29
|
"acorn": "^6.4.2",
|
|
28
30
|
"async-retry": "^1.3.3",
|
|
29
31
|
"commit-and-tag-version": "^11.3.0",
|
|
30
|
-
"cz-conventional-changelog": "^3.3.0",
|
|
31
32
|
"eslint": "^8.51.0",
|
|
32
33
|
"eslint-config-airbnb": "19.0.4",
|
|
33
34
|
"eslint-config-prettier": "^9.0.0",
|
|
@@ -57,7 +58,7 @@
|
|
|
57
58
|
],
|
|
58
59
|
"config": {
|
|
59
60
|
"commitizen": {
|
|
60
|
-
"path": "
|
|
61
|
+
"path": "git-cz"
|
|
61
62
|
}
|
|
62
63
|
},
|
|
63
64
|
"jest": {
|