node-es-transformer 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -12
- package/dist/node-es-transformer.cjs.js +567 -127
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +548 -127
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +24 -2
- package/package.json +12 -7
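All hunks shown below come from package/dist/node-es-transformer.cjs.js, the bundled CommonJS build; the .esm.js bundle receives the equivalent changes. In outline, 1.2.1 adds Parquet and Arrow as source formats (via @dsnp/parquetjs and apache-arrow), registers a ZSTD codec for Parquet when the Node.js runtime exposes native zstd, replaces the ad-hoc verbose/console logging with an injectable pino-based logger passed down as per-component child loggers, and makes document serialization safe for BigInt values.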
--- package/dist/node-es-transformer.cjs.js (1.1.0)
+++ package/dist/node-es-transformer.cjs.js (1.2.1)

@@ -2,6 +2,10 @@
 
 var elasticsearch9 = require('es9');
 var elasticsearch8 = require('es8');
+var parquet = require('@dsnp/parquetjs');
+var zlib = require('zlib');
+var compression_js = require('@dsnp/parquetjs/dist/lib/compression.js');
+var arrow = require('apache-arrow');
 var fs = require('fs');
 var csvParse = require('csv-parse');
 var es = require('event-stream');
@@ -9,6 +13,26 @@ var glob = require('glob');
 var split = require('split2');
 var stream = require('stream');
 var cliProgress = require('cli-progress');
+var pino = require('pino');
+
+function _interopNamespaceDefault(e) {
+  var n = Object.create(null);
+  if (e) {
+    Object.keys(e).forEach(function (k) {
+      if (k !== 'default') {
+        var d = Object.getOwnPropertyDescriptor(e, k);
+        Object.defineProperty(n, k, d.get ? d : {
+          enumerable: true,
+          get: function () { return e[k]; }
+        });
+      }
+    });
+  }
+  n.default = e;
+  return Object.freeze(n);
+}
+
+var arrow__namespace = /*#__PURE__*/_interopNamespaceDefault(arrow);
 
 // In earlier versions this was used to set the number of docs to index in a
 // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -29,9 +53,9 @@ function createMappingFactory({
   inferredIngestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
-  verbose,
   deleteIndex,
-  pipeline
+  pipeline,
+  logger
 }) {
   return async () => {
     let targetMappings = mappingsOverride ? undefined : mappings;
@@ -50,7 +74,10 @@ function createMappingFactory({
         }
       }
     } catch (err) {
-
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Error reading source mapping');
       return;
     }
   }
@@ -82,9 +109,14 @@ function createMappingFactory({
         ...inferredIngestPipeline
       });
       defaultPipeline = inferredPipelineName;
-
+      logger.info({
+        inferredPipelineName
+      }, 'Created inferred ingest pipeline');
     } catch (err) {
-
+      logger.error({
+        err,
+        inferredPipelineName
+      }, 'Error creating inferred ingest pipeline');
     }
   }
   const settings = {
@@ -97,22 +129,54 @@ function createMappingFactory({
           'index.number_of_replicas': 0
         } : {})
       };
-      const
+      const response = await targetClient.indices.create({
        index: targetIndexName,
        mappings: targetMappings,
        ...(Object.keys(settings).length > 0 ? {
          settings
        } : {})
      });
-
+      logger.info({
+        targetIndexName,
+        response
+      }, 'Created target mapping');
     }
   } catch (err) {
-
+    logger.error({
+      err,
+      targetIndexName
+    }, 'Error creating target mapping');
   }
   }
   };
 }
 
+function registerZstdCompression() {
+  if (compression_js.PARQUET_COMPRESSION_METHODS.ZSTD) {
+    return;
+  }
+  if (typeof zlib.zstdCompressSync !== 'function' || typeof zlib.zstdDecompressSync !== 'function') {
+    compression_js.PARQUET_COMPRESSION_METHODS.ZSTD = {
+      deflate() {
+        throw new Error('ZSTD compression requires Node.js with zstd support.');
+      },
+      inflate() {
+        throw new Error('ZSTD compression requires Node.js with zstd support.');
+      }
+    };
+    return;
+  }
+  compression_js.PARQUET_COMPRESSION_METHODS.ZSTD = {
+    deflate(value) {
+      return zlib.zstdCompressSync(value);
+    },
+    inflate(value) {
+      return zlib.zstdDecompressSync(value);
+    }
+  };
+}
+registerZstdCompression();
+
 function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
   const options = {
     bom: true,
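For context on the shim above: @dsnp/parquetjs keeps its codecs in a patchable PARQUET_COMPRESSION_METHODS table, and the package fills the ZSTD slot with Node's native zstd bindings when they exist (they were added to node:zlib in recent Node.js releases), or with throwing stubs otherwise. A standalone sketch of the same runtime check, independent of the package:

```js
// Detect whether this Node.js build ships native zstd in node:zlib.
const zlib = require('zlib');

const hasZstd =
  typeof zlib.zstdCompressSync === 'function' &&
  typeof zlib.zstdDecompressSync === 'function';

if (hasZstd) {
  // Round-trip a payload to confirm the bindings work.
  const compressed = zlib.zstdCompressSync(Buffer.from('hello zstd'));
  console.log(zlib.zstdDecompressSync(compressed).toString()); // 'hello zstd'
} else {
  // Matches the stub behavior registered above: reading a ZSTD-compressed
  // Parquet file on this runtime would throw.
  console.log('No native zstd support in this Node.js build.');
}
```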
@@ -128,8 +192,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
   return options;
 }
 
-function
-
+function createPauseWaiter$1(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed, file) {
     const context = {
       fileName: file
     };
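createPauseWaiter$1 is the piece that lets the new pull-based readers participate in backpressure: the indexer emits 'pause'/'resume' events, and the waiter turns them into an awaitable gate that a read loop checks between rows. A minimal standalone sketch of the pattern (names are illustrative; nothing here is exported by the package):

```js
const EventEmitter = require('events');

// Turn 'pause'/'resume' events into a gate an async producer can await.
function createPauseWaiter(emitter) {
  let paused = false;
  let waiters = [];
  emitter.on('pause', () => {
    paused = true;
  });
  emitter.on('resume', () => {
    paused = false;
    waiters.forEach(resolve => resolve());
    waiters = [];
  });
  return {
    waitIfPaused: () =>
      paused ? new Promise(resolve => waiters.push(resolve)) : Promise.resolve()
  };
}

// Demo: the producer blocks while paused and continues after 'resume'.
const emitter = new EventEmitter();
const gate = createPauseWaiter(emitter);
emitter.emit('pause');
setTimeout(() => emitter.emit('resume'), 100);
gate.waitIfPaused().then(() => console.log('resumed'));
```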
@@ -137,7 +229,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
@@ -152,9 +243,101 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function
+  async function processParquetFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    const reader = await parquet.ParquetReader.openFile(file);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row, file);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    try {
+      const reader = await arrow__namespace.RecordBatchReader.from(fs.createReadStream(file));
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row, file);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+    }
+  }
+  function processStreamFile(file, buildStream, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildStream();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info({
+          file
+        }, 'Read entire file');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err,
+          file
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processNdjsonFile(file) {
     let skippedHeader = false;
-
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
       try {
         // skip empty lines
         if (line === '') {
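With processParquetFile and processArrowFile in place, columnar files feed the same indexing queue as NDJSON and CSV. A usage sketch, assuming the transformer function the package exports (the file glob and index name are illustrative):

```js
const { transformer } = require('node-es-transformer');

(async () => {
  await transformer({
    fileName: 'events-*.parquet', // globs are resolved via globSync, as above
    sourceFormat: 'parquet',      // new in 1.2.1; 'arrow' works the same way
    targetIndexName: 'events'
  });
})();
```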
@@ -165,72 +348,115 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
           return;
         }
         const parsed = JSON.parse(line);
-        addParsedDoc(parsed, file
-      } catch (
-
+        addParsedDoc(parsed, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process NDJSON line');
       }
     }).on('error', err => {
-
-
-
+      logger.error({
+        err,
+        file
+      }, 'Error while reading file');
+    })), 'Error while reading file');
   }
-  function
+  function processCsvFile(file) {
     const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
-
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record, file
-      } catch (
-
+        addParsedDoc(record, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process CSV record');
       }
     }).on('error', err => {
-
-
-
+      logger.error({
+        err,
+        file
+      }, 'Error while reading CSV file');
+    })), 'Error while reading CSV file');
   }
-  function
-
+  async function processFile(file) {
+    if (sourceFormat === 'csv') {
+      await processCsvFile(file);
+      return;
+    }
+    if (sourceFormat === 'ndjson') {
+      await processNdjsonFile(file);
+      return;
+    }
+    if (sourceFormat === 'parquet') {
+      await processParquetFile(file);
+      return;
+    }
+    if (sourceFormat === 'arrow') {
+      await processArrowFile(file);
+      return;
+    }
+    throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+  }
+  async function startIndex(files) {
     if (files.length === 0) {
       indexer.finish();
       return;
     }
-
-
-
-
-    if (files.length > 0) {
-      startIndex(files);
-      return;
+    try {
+      for (const file of files) {
+        // eslint-disable-next-line no-await-in-loop
+        await processFile(file);
       }
+    } catch (err) {
+      logger.error({
+        err,
+        files
+      }, 'Error while processing files');
+    } finally {
       indexer.finish();
-
-    });
-    indexer.queueEmitter.on('pause', () => {
-      if (finished) return;
-      s.pause();
-    });
-    indexer.queueEmitter.on('resume', () => {
-      if (finished) return;
-      s.resume();
-    });
+    }
   }
   return () => {
     try {
       const files = glob.globSync(fileName);
       startIndex(files);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err,
+        fileName
+      }, 'Error matching files');
+      indexer.finish();
     }
   };
 }
 
 const EventEmitter = require('events');
 const parallelCalls = 5;
+const MAX_SAFE_BIGINT = BigInt(Number.MAX_SAFE_INTEGER);
+const MIN_SAFE_BIGINT = BigInt(Number.MIN_SAFE_INTEGER);
+function coerceBigInt(value) {
+  if (value >= MIN_SAFE_BIGINT && value <= MAX_SAFE_BIGINT) {
+    return Number(value);
+  }
+  return value.toString();
+}
+function safeStringify(doc) {
+  return JSON.stringify(doc, (_key, value) => {
+    if (typeof value === 'bigint') {
+      return coerceBigInt(value);
+    }
+    return value;
+  });
+}
 
 // a simple helper queue to bulk index documents
 function indexQueueFactory({
   targetClient: client,
   targetIndexName,
-  bufferSize = DEFAULT_BUFFER_SIZE
+  bufferSize = DEFAULT_BUFFER_SIZE,
+  logger
 }) {
   const queueEmitter = new EventEmitter();
   let docsPerSecond = 0;
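The coerceBigInt/safeStringify pair exists because Parquet INT64 and Arrow integer columns surface as BigInt values, which plain JSON.stringify refuses to serialize. Safe magnitudes become plain numbers; anything beyond Number.MAX_SAFE_INTEGER is stringified. The behavior in isolation:

```js
// JSON.stringify(42n) throws "Do not know how to serialize a BigInt";
// this replacer mirrors coerceBigInt/safeStringify from the diff.
const MAX = BigInt(Number.MAX_SAFE_INTEGER);
const MIN = BigInt(Number.MIN_SAFE_INTEGER);

const safeStringify = doc =>
  JSON.stringify(doc, (_key, value) => {
    if (typeof value !== 'bigint') return value;
    return value >= MIN && value <= MAX ? Number(value) : value.toString();
  });

console.log(safeStringify({ small: 42n, big: 2n ** 63n }));
// {"small":42,"big":"9223372036854775808"}
```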
@@ -263,8 +489,9 @@ function indexQueueFactory({
       try {
         yield JSON.parse(line); // Parse and yield the JSON object
       } catch (err) {
-
-
+        logger.error({
+          err
+        }, 'Failed to parse JSON from NDJSON stream');
       }
     }
   }
@@ -274,7 +501,9 @@ function indexQueueFactory({
       try {
         yield JSON.parse(buffer);
       } catch (err) {
-
+        logger.error({
+          err
+        }, 'Failed to parse final JSON from NDJSON stream');
       }
     }
   } finally {
@@ -300,7 +529,7 @@ function indexQueueFactory({
         flushInterval: 1000,
         refreshOnCompletion: true,
         datasource: ndjsonStreamIterator(stream$1),
-        onDocument(
+        onDocument() {
           docsPerSecond++;
           return {
             index: {
@@ -309,9 +538,13 @@ function indexQueueFactory({
           };
         }
       });
-    } catch (
-
-
+    } catch (err) {
+      logger.error({
+        err,
+        targetIndexName
+      }, 'Error during bulk indexing');
+      queueEmitter.emit('error', err);
+      throw err;
     } finally {
       // Clean up interval
       clearInterval(interval);
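Note that bulk-indexing failures are no longer swallowed: the rewritten catch block logs the error, re-emits it as an 'error' event on the queue emitter, and rethrows so callers see the failure.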
@@ -340,7 +573,7 @@ function indexQueueFactory({
     if (finished) {
       throw new Error('Unexpected doc added after indexer should finish.');
     }
-    const canContinue = stream$1.write(`${
+    const canContinue = stream$1.write(`${safeStringify(doc)}\n`);
     if (!canContinue) {
       queueEmitter.emit('pause');
 
@@ -361,7 +594,7 @@ function indexQueueFactory({
 
 // create a new progress bar instance and use shades_classic theme
 const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
-function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
   return async function indexReader() {
     let docsNum = 0;
     let scrollId;
@@ -380,8 +613,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
         maxRetries: 0
       });
       return Object.keys(response.fields);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Failed to fetch populated fields');
     }
   }
   function search(fields) {
@@ -425,8 +661,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
         return;
       }
       indexer.add(doc);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err
+      }, 'Failed to process source index document');
     }
   }
   async function fetchNextResponse() {
@@ -497,17 +735,25 @@ async function inferMappingsFromSource({
   mappings,
   inferMappings,
   inferMappingsOptions,
-
+  logger
 }) {
   if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
     return emptyInferenceResult(mappings);
   }
+  if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+    logger.info({
+      sourceFormat
+    }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+    return emptyInferenceResult(mappings);
+  }
   if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
     return emptyInferenceResult(mappings);
   }
   const files = glob.globSync(fileName);
   if (files.length === 0) {
-
+    logger.info({
+      fileName
+    }, 'No files matched for mapping inference');
     return emptyInferenceResult(mappings);
   }
   const {
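The new guard makes explicit that mapping inference only works for ndjson and csv sources, since only those can be posted to Elasticsearch's _text_structure/find_structure API as a raw text sample; Parquet and Arrow sources now log a skip message instead of failing. A usage sketch with inference enabled (option names as in this diff; file and index names illustrative):

```js
const { transformer } = require('node-es-transformer');

(async () => {
  await transformer({
    fileName: 'metrics.csv',
    sourceFormat: 'csv',
    targetIndexName: 'metrics',
    inferMappings: true // consults _text_structure/find_structure
    // With sourceFormat 'parquet' or 'arrow' this flag is a logged no-op.
  });
})();
```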
@@ -516,7 +762,7 @@ async function inferMappingsFromSource({
   } = inferMappingsOptions || {};
   const sampleText = readSample(files[0], sampleBytes);
   if (!sampleText || sampleText.trim() === '') {
-
+    logger.info('Skipping mapping inference because the sample text is empty');
     return emptyInferenceResult(mappings);
   }
   const params = {
@@ -543,31 +789,98 @@
   }
   try {
     const response = await targetClient.textStructure.findStructure(params);
-    if (response?.mappings
-
+    if (response?.mappings) {
+      logger.info({
+        file: files[0]
+      }, 'Inferred mappings via _text_structure/find_structure');
     }
-    if (response?.ingest_pipeline
-
+    if (response?.ingest_pipeline) {
+      logger.info('Inferred ingest pipeline via _text_structure/find_structure');
     }
     return {
       mappings: response?.mappings || mappings,
       ingestPipeline: response?.ingest_pipeline
     };
-  } catch (
-
-
-    }
+  } catch (err) {
+    logger.warn({
+      err
+    }, 'Could not infer mappings via _text_structure/find_structure');
     return emptyInferenceResult(mappings);
   }
 }
 
-
-
+const DEFAULT_LOG_LEVEL = 'info';
+function resolveLogLevel(verbose = true) {
+  if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+    return process.env.LOG_LEVEL;
+  }
+  return verbose ? DEFAULT_LOG_LEVEL : 'error';
+}
+function createLogger({
+  logger,
+  verbose = true
+} = {}) {
+  if (logger && typeof logger === 'object') {
+    return logger;
+  }
+  return pino({
+    name: 'node-es-transformer',
+    level: resolveLogLevel(verbose),
+    timestamp: pino.stdTimeFunctions.isoTime,
+    serializers: {
+      err: pino.stdSerializers.err,
+      error: pino.stdSerializers.err
+    }
+  });
+}
+function createChildLogger(logger, bindings) {
+  if (!logger || typeof logger.child !== 'function') {
+    return logger;
+  }
+  return logger.child(bindings);
+}
+
+function createPauseWaiter(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+async function readStreamToBuffer(stream) {
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+  }
+  return Buffer.concat(chunks);
+}
+function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed) {
     const doc = typeof transform === 'function' ? transform(parsed) : parsed;
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
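The logger module is the heart of the 1.2.1 logging change: a caller-supplied logger object is used verbatim, and otherwise a pino instance is created whose level comes from the LOG_LEVEL environment variable or, failing that, from verbose ('info' when true, 'error' when false). Supplying your own logger — a sketch assuming the new logger option on transformer:

```js
const pino = require('pino');
const { transformer } = require('node-es-transformer');

// Any pino-compatible object works; if it has a child() method, the
// transformer derives per-component loggers from it (file-reader,
// index-queue, and so on), as wired up later in this diff.
const logger = pino({ name: 'my-ingest-job', level: 'debug' });

(async () => {
  await transformer({
    fileName: 'data.ndjson',
    targetIndexName: 'my-index',
    logger
  });
})();
```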
@@ -582,50 +895,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function
-
-
+  async function processParquetStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    const parquetBuffer = await readStreamToBuffer(stream);
+    const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    try {
+      const reader = await arrow__namespace.RecordBatchReader.from(stream);
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+    }
+  }
+  function processPipeline(buildPipeline, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildPipeline();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info('Read entire stream');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processCsvStream() {
+    return processPipeline(() => stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record
-      } catch (
-
+        addParsedDoc(record);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process CSV stream record');
       }
     }).on('error', err => {
-
-
-
-
-
-
-
-
-
-
-
-
-      }
-      const parsed = JSON.parse(line);
-      addParsedDoc(parsed, s);
-    } catch (e) {
-      console.log('error', e);
+      logger.error({
+        err
+      }, 'Error while reading CSV stream');
+    })), 'Error while reading CSV stream');
+  }
+  function processNdjsonStream() {
+    let skippedHeader = false;
+    return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+      try {
+        // skip empty lines
+        if (line === '') {
+          return;
       }
-
-
-
-
-
-
+        if (skipHeader && !skippedHeader) {
+          skippedHeader = true;
+          return;
+        }
+        const parsed = JSON.parse(line);
+        addParsedDoc(parsed);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process NDJSON stream line');
+      }
+    }).on('error', err => {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    })), 'Error while reading stream');
+  }
+  async function startIndex() {
+    try {
+      if (sourceFormat === 'csv') {
+        await processCsvStream();
+      } else if (sourceFormat === 'ndjson') {
+        await processNdjsonStream();
+      } else if (sourceFormat === 'parquet') {
+        await processParquetStream();
+      } else if (sourceFormat === 'arrow') {
+        await processArrowStream();
+      } else {
+        throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+      }
+    } catch (err) {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    } finally {
       indexer.finish();
-
-    });
-    indexer.queueEmitter.on('pause', () => {
-      if (finished) return;
-      s.pause();
-    });
-    indexer.queueEmitter.on('resume', () => {
-      if (finished) return;
-      s.resume();
-    });
+    }
   }
   return () => {
     startIndex();
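The stream reader gains the same four formats, with one practical difference visible above: a streamed Parquet source is buffered completely in memory first (readStreamToBuffer), because the Parquet footer sits at the end of the file, whereas Arrow record batches are consumed incrementally. A sketch (paths illustrative):

```js
const fs = require('fs');
const { transformer } = require('node-es-transformer');

(async () => {
  await transformer({
    stream: fs.createReadStream('snapshot.arrow'), // any Readable source
    sourceFormat: 'arrow',
    targetIndexName: 'snapshot'
  });
})();
```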
@@ -721,11 +1136,16 @@ async function transformer({
   query,
   skipHeader = false,
   transform,
-  verbose = true
+  verbose = true,
+  logger: loggerInput
 }) {
   if (typeof targetIndexName === 'undefined') {
     throw Error('targetIndexName must be specified.');
   }
+  const logger = createLogger({
+    logger: loggerInput,
+    verbose
+  });
   const defaultClientConfig = {
     node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
   };
@@ -742,7 +1162,9 @@ async function transformer({
     mappings,
     inferMappings,
     inferMappingsOptions,
-
+    logger: createChildLogger(logger, {
+      component: 'mapping-inference'
+    })
   });
   const createMapping = createMappingFactory({
     sourceClient,
@@ -753,17 +1175,23 @@ async function transformer({
     inferredIngestPipeline: inferenceResult.ingestPipeline,
     mappingsOverride,
     indexMappingTotalFieldsLimit,
-    verbose,
     deleteIndex,
-    pipeline
+    pipeline,
+    logger: createChildLogger(logger, {
+      component: 'create-mapping'
+    })
   });
   const indexer = indexQueueFactory({
     targetClient,
     targetIndexName,
-    bufferSize
+    bufferSize,
+    logger: createChildLogger(logger, {
+      component: 'index-queue'
+    })
   });
   function validateSourceFormat() {
-    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
-      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "
+    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
     }
   }
   function getReader() {
@@ -775,18 +1203,27 @@ async function transformer({
     }
     if (typeof fileName !== 'undefined') {
       validateSourceFormat();
-      return fileReaderFactory(indexer, fileName, transform, splitRegex,
+      return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+        component: 'file-reader'
+      }));
     }
     if (typeof sourceIndexName !== 'undefined') {
-      return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields
+      return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+        component: 'index-reader'
+      }));
     }
     if (typeof stream !== 'undefined') {
       validateSourceFormat();
-      return streamReaderFactory(indexer, stream, transform, splitRegex,
+      return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+        component: 'stream-reader'
+      }));
     }
     return null;
   }
   const reader = getReader();
+  if (typeof reader !== 'function') {
+    throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+  }
   try {
     const indexExists = await targetClient.indices.exists({
       index: targetIndexName
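Also new in this hunk: the result of getReader() is validated, so calling transformer() without one of fileName, sourceIndexName, or stream now fails immediately with a descriptive error instead of failing later with an obscure one.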
@@ -803,8 +1240,11 @@ async function transformer({
     } else {
       reader();
     }
-  } catch (
-
+  } catch (err) {
+    logger.error({
+      err,
+      targetIndexName
+    }, 'Error checking index existence');
   } finally {
     // targetClient.close();
   }