node-es-transformer 1.1.0 → 1.2.0
- package/README.md +74 -12
- package/dist/node-es-transformer.cjs.js +522 -126
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +503 -126
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +24 -2
- package/package.json +12 -7
package/dist/node-es-transformer.cjs.js:

@@ -2,6 +2,8 @@
 
 var elasticsearch9 = require('es9');
 var elasticsearch8 = require('es8');
+var parquet = require('@dsnp/parquetjs');
+var arrow = require('apache-arrow');
 var fs = require('fs');
 var csvParse = require('csv-parse');
 var es = require('event-stream');
@@ -9,6 +11,26 @@ var glob = require('glob');
 var split = require('split2');
 var stream = require('stream');
 var cliProgress = require('cli-progress');
+var pino = require('pino');
+
+function _interopNamespaceDefault(e) {
+  var n = Object.create(null);
+  if (e) {
+    Object.keys(e).forEach(function (k) {
+      if (k !== 'default') {
+        var d = Object.getOwnPropertyDescriptor(e, k);
+        Object.defineProperty(n, k, d.get ? d : {
+          enumerable: true,
+          get: function () { return e[k]; }
+        });
+      }
+    });
+  }
+  n.default = e;
+  return Object.freeze(n);
+}
+
+var arrow__namespace = /*#__PURE__*/_interopNamespaceDefault(arrow);
 
 // In earlier versions this was used to set the number of docs to index in a
 // single bulk request. Since we switched to use the helpers.bulk() method from
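
`_interopNamespaceDefault` is the bundler's standard CommonJS shim for a namespace import, so `arrow__namespace` in this bundle corresponds to an ordinary namespace import at the source level. A rough ESM-side sketch (the exact import style of the package's source is an assumption here):

```js
// Approximate source-level equivalent of the bundled requires above.
import * as arrow from 'apache-arrow'; // becomes arrow__namespace in the CJS bundle
import parquet from '@dsnp/parquetjs';
import pino from 'pino';
```
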
@@ -29,9 +51,9 @@ function createMappingFactory({
   inferredIngestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
-  verbose,
   deleteIndex,
-  pipeline
+  pipeline,
+  logger
 }) {
   return async () => {
     let targetMappings = mappingsOverride ? undefined : mappings;
@@ -50,7 +72,10 @@ function createMappingFactory({
         }
       }
     } catch (err) {
-
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Error reading source mapping');
       return;
     }
   }
@@ -82,9 +107,14 @@ function createMappingFactory({
         ...inferredIngestPipeline
       });
       defaultPipeline = inferredPipelineName;
-
+      logger.info({
+        inferredPipelineName
+      }, 'Created inferred ingest pipeline');
     } catch (err) {
-
+      logger.error({
+        err,
+        inferredPipelineName
+      }, 'Error creating inferred ingest pipeline');
     }
   }
   const settings = {
@@ -97,17 +127,23 @@ function createMappingFactory({
         'index.number_of_replicas': 0
       } : {})
     };
-    const
+    const response = await targetClient.indices.create({
       index: targetIndexName,
       mappings: targetMappings,
       ...(Object.keys(settings).length > 0 ? {
         settings
       } : {})
     });
-
+    logger.info({
+      targetIndexName,
+      response
+    }, 'Created target mapping');
   }
 } catch (err) {
-
+  logger.error({
+    err,
+    targetIndexName
+  }, 'Error creating target mapping');
 }
 }
 };
@@ -128,8 +164,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
   return options;
 }
 
-function
-
+function createPauseWaiter$1(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed, file) {
     const context = {
       fileName: file
     };
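
The pause waiter converts the queue's 'pause'/'resume' events into awaitable backpressure for pull-based readers (Parquet cursors, Arrow record batches), which cannot simply be `pause()`d like Node streams. A minimal sketch of the pattern, assuming the internal helper from the diff above is in scope (it is not exported):

```js
const { EventEmitter } = require('events');

const queueEmitter = new EventEmitter();
const { waitIfPaused, cleanup } = createPauseWaiter$1(queueEmitter);

// Placeholder producer loop: docs and console output are illustrative only.
async function produce(docs) {
  for (const doc of docs) {
    // Resolves immediately unless a 'pause' event arrived; a later
    // 'resume' event releases every pending waiter.
    await waitIfPaused();
    console.log('indexing', doc);
  }
  cleanup(); // detach listeners and release any stuck waiters
}
```
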
@@ -137,7 +201,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
@@ -152,9 +215,101 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function
+  async function processParquetFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    const reader = await parquet.ParquetReader.openFile(file);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row, file);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    try {
+      const reader = await arrow__namespace.RecordBatchReader.from(fs.createReadStream(file));
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row, file);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+    }
+  }
+  function processStreamFile(file, buildStream, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildStream();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info({
+          file
+        }, 'Read entire file');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err,
+          file
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processNdjsonFile(file) {
     let skippedHeader = false;
-
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
       try {
         // skip empty lines
         if (line === '') {
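
The Parquet path relies on @dsnp/parquetjs's cursor API exactly as used above: `ParquetReader.openFile()`, `getCursor()`, and `cursor.next()`, which resolves to `null` after the last row. Standalone, the same loop looks like this (the file path is a placeholder):

```js
const parquet = require('@dsnp/parquetjs');

async function readParquet(file) {
  const reader = await parquet.ParquetReader.openFile(file);
  try {
    const cursor = reader.getCursor();
    let row;
    // cursor.next() resolves to null once all rows are read
    while ((row = await cursor.next())) {
      console.log(row);
    }
  } finally {
    await reader.close();
  }
}

readParquet('./example.parquet'); // placeholder path
```
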
@@ -165,60 +320,86 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
           return;
         }
         const parsed = JSON.parse(line);
-        addParsedDoc(parsed, file
-      } catch (
-
+        addParsedDoc(parsed, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process NDJSON line');
       }
     }).on('error', err => {
-
-
-
+      logger.error({
+        err,
+        file
+      }, 'Error while reading file');
+    })), 'Error while reading file');
   }
-  function
+  function processCsvFile(file) {
     const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
-
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record, file
-      } catch (
-
+        addParsedDoc(record, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process CSV record');
       }
     }).on('error', err => {
-
-
-
+      logger.error({
+        err,
+        file
+      }, 'Error while reading CSV file');
+    })), 'Error while reading CSV file');
   }
-  function
-
+  async function processFile(file) {
+    if (sourceFormat === 'csv') {
+      await processCsvFile(file);
+      return;
+    }
+    if (sourceFormat === 'ndjson') {
+      await processNdjsonFile(file);
+      return;
+    }
+    if (sourceFormat === 'parquet') {
+      await processParquetFile(file);
+      return;
+    }
+    if (sourceFormat === 'arrow') {
+      await processArrowFile(file);
+      return;
+    }
+    throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+  }
+  async function startIndex(files) {
     if (files.length === 0) {
       indexer.finish();
       return;
     }
-
-
-
-
-    if (files.length > 0) {
-      startIndex(files);
-      return;
+    try {
+      for (const file of files) {
+        // eslint-disable-next-line no-await-in-loop
+        await processFile(file);
       }
+    } catch (err) {
+      logger.error({
+        err,
+        files
+      }, 'Error while processing files');
+    } finally {
       indexer.finish();
-
-  });
-  indexer.queueEmitter.on('pause', () => {
-    if (finished) return;
-    s.pause();
-  });
-  indexer.queueEmitter.on('resume', () => {
-    if (finished) return;
-    s.resume();
-  });
+    }
   }
   return () => {
     try {
       const files = glob.globSync(fileName);
       startIndex(files);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err,
+        fileName
+      }, 'Error matching files');
+      indexer.finish();
     }
   };
 }
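
With the per-format dispatch in place, file-based ingestion covers all four formats through the same `transformer()` entry point. A hypothetical Parquet run (index name and path are placeholders; the default-export require style is assumed from the package's README):

```js
const transformer = require('node-es-transformer');

transformer({
  fileName: './data/events.parquet', // placeholder path
  sourceFormat: 'parquet',
  targetIndexName: 'my-parquet-index'
});
```
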
@@ -230,7 +411,8 @@ const parallelCalls = 5;
 function indexQueueFactory({
   targetClient: client,
   targetIndexName,
-  bufferSize = DEFAULT_BUFFER_SIZE
+  bufferSize = DEFAULT_BUFFER_SIZE,
+  logger
 }) {
   const queueEmitter = new EventEmitter();
   let docsPerSecond = 0;
@@ -263,8 +445,9 @@ function indexQueueFactory({
       try {
         yield JSON.parse(line); // Parse and yield the JSON object
       } catch (err) {
-
-
+        logger.error({
+          err
+        }, 'Failed to parse JSON from NDJSON stream');
       }
     }
   }
@@ -274,7 +457,9 @@ function indexQueueFactory({
     try {
       yield JSON.parse(buffer);
     } catch (err) {
-
+      logger.error({
+        err
+      }, 'Failed to parse final JSON from NDJSON stream');
     }
   }
 } finally {
@@ -300,7 +485,7 @@ function indexQueueFactory({
   flushInterval: 1000,
   refreshOnCompletion: true,
   datasource: ndjsonStreamIterator(stream$1),
-  onDocument(
+  onDocument() {
     docsPerSecond++;
     return {
       index: {
@@ -309,9 +494,13 @@ function indexQueueFactory({
     };
   }
 });
-} catch (
-
-
+} catch (err) {
+  logger.error({
+    err,
+    targetIndexName
+  }, 'Error during bulk indexing');
+  queueEmitter.emit('error', err);
+  throw err;
 } finally {
   // Clean up interval
   clearInterval(interval);
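
Bulk failures are no longer swallowed: the queue logs the error, emits it on its internal `queueEmitter`, and rethrows. `indexQueueFactory` is not a public export, so the following listener sketch is illustrative only (`client` stands in for an Elasticsearch client instance):

```js
const pino = require('pino');
const logger = pino({ name: 'example' });

const indexer = indexQueueFactory({
  targetClient: client, // placeholder Elasticsearch client
  targetIndexName: 'my-index',
  logger
});

indexer.queueEmitter.on('error', err => {
  // The same error is also rethrown inside the bulk helper.
  logger.error({ err }, 'bulk ingest aborted');
});
```
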
@@ -361,7 +550,7 @@
 
 // create a new progress bar instance and use shades_classic theme
 const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
-function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
   return async function indexReader() {
     let docsNum = 0;
     let scrollId;
@@ -380,8 +569,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
         maxRetries: 0
       });
       return Object.keys(response.fields);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Failed to fetch populated fields');
     }
   }
   function search(fields) {
@@ -425,8 +617,10 @@
       return;
     }
     indexer.add(doc);
-  } catch (
-
+  } catch (err) {
+    logger.error({
+      err
+    }, 'Failed to process source index document');
   }
 }
 async function fetchNextResponse() {
@@ -497,17 +691,25 @@ async function inferMappingsFromSource({
   mappings,
   inferMappings,
   inferMappingsOptions,
-
+  logger
 }) {
   if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
     return emptyInferenceResult(mappings);
   }
+  if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+    logger.info({
+      sourceFormat
+    }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+    return emptyInferenceResult(mappings);
+  }
   if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
     return emptyInferenceResult(mappings);
   }
   const files = glob.globSync(fileName);
   if (files.length === 0) {
-
+    logger.info({
+      fileName
+    }, 'No files matched for mapping inference');
     return emptyInferenceResult(mappings);
   }
   const {
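
Since mapping inference now explicitly skips Parquet and Arrow sources, ingests in those formats that need specific field types should supply `mappings` up front. A hypothetical example (index, path, field names, and types are placeholders):

```js
const transformer = require('node-es-transformer');

transformer({
  fileName: './data/events.parquet', // placeholder path
  sourceFormat: 'parquet',
  targetIndexName: 'my-parquet-index',
  mappings: {
    properties: {
      '@timestamp': { type: 'date' },
      message: { type: 'text' }
    }
  }
});
```
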
@@ -516,7 +718,7 @@ async function inferMappingsFromSource({
   } = inferMappingsOptions || {};
   const sampleText = readSample(files[0], sampleBytes);
   if (!sampleText || sampleText.trim() === '') {
-
+    logger.info('Skipping mapping inference because the sample text is empty');
     return emptyInferenceResult(mappings);
   }
   const params = {
@@ -543,31 +745,98 @@ async function inferMappingsFromSource({
   }
   try {
     const response = await targetClient.textStructure.findStructure(params);
-    if (response?.mappings
-
+    if (response?.mappings) {
+      logger.info({
+        file: files[0]
+      }, 'Inferred mappings via _text_structure/find_structure');
     }
-    if (response?.ingest_pipeline
-
+    if (response?.ingest_pipeline) {
+      logger.info('Inferred ingest pipeline via _text_structure/find_structure');
     }
     return {
       mappings: response?.mappings || mappings,
       ingestPipeline: response?.ingest_pipeline
     };
-  } catch (
-
-
-  }
+  } catch (err) {
+    logger.warn({
+      err
+    }, 'Could not infer mappings via _text_structure/find_structure');
     return emptyInferenceResult(mappings);
   }
 }
 
-
-
+const DEFAULT_LOG_LEVEL = 'info';
+function resolveLogLevel(verbose = true) {
+  if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+    return process.env.LOG_LEVEL;
+  }
+  return verbose ? DEFAULT_LOG_LEVEL : 'error';
+}
+function createLogger({
+  logger,
+  verbose = true
+} = {}) {
+  if (logger && typeof logger === 'object') {
+    return logger;
+  }
+  return pino({
+    name: 'node-es-transformer',
+    level: resolveLogLevel(verbose),
+    timestamp: pino.stdTimeFunctions.isoTime,
+    serializers: {
+      err: pino.stdSerializers.err,
+      error: pino.stdSerializers.err
+    }
+  });
+}
+function createChildLogger(logger, bindings) {
+  if (!logger || typeof logger.child !== 'function') {
+    return logger;
+  }
+  return logger.child(bindings);
+}
+
+function createPauseWaiter(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+async function readStreamToBuffer(stream) {
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+  }
+  return Buffer.concat(chunks);
+}
+function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed) {
     const doc = typeof transform === 'function' ? transform(parsed) : parsed;
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
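
The resolution order for logging is: an injected `logger` object wins; otherwise a pino instance is created whose level comes from the `LOG_LEVEL` environment variable when set, falling back to `'info'` when `verbose` is true and `'error'` when it is false. For example (path and index name are placeholders):

```js
const transformer = require('node-es-transformer');

// LOG_LEVEL takes precedence over the verbose flag.
process.env.LOG_LEVEL = 'debug';

transformer({
  fileName: './data/events.ndjson', // placeholder path
  targetIndexName: 'my-index',
  verbose: false // ignored here because LOG_LEVEL is set
});
```
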
@@ -582,50 +851,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function
-
-
+  async function processParquetStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    const parquetBuffer = await readStreamToBuffer(stream);
+    const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    try {
+      const reader = await arrow__namespace.RecordBatchReader.from(stream);
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+    }
+  }
+  function processPipeline(buildPipeline, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildPipeline();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info('Read entire stream');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processCsvStream() {
+    return processPipeline(() => stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record
-      } catch (
-
+        addParsedDoc(record);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process CSV stream record');
       }
     }).on('error', err => {
-
-
-
-
-
-
-
-
-
-
-
-
-      }
-      const parsed = JSON.parse(line);
-      addParsedDoc(parsed, s);
-    } catch (e) {
-      console.log('error', e);
+      logger.error({
+        err
+      }, 'Error while reading CSV stream');
+    })), 'Error while reading CSV stream');
+  }
+  function processNdjsonStream() {
+    let skippedHeader = false;
+    return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+      try {
+        // skip empty lines
+        if (line === '') {
+          return;
         }
-
-
-
-
-
-
+        if (skipHeader && !skippedHeader) {
+          skippedHeader = true;
+          return;
+        }
+        const parsed = JSON.parse(line);
+        addParsedDoc(parsed);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process NDJSON stream line');
+      }
+    }).on('error', err => {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    })), 'Error while reading stream');
+  }
+  async function startIndex() {
+    try {
+      if (sourceFormat === 'csv') {
+        await processCsvStream();
+      } else if (sourceFormat === 'ndjson') {
+        await processNdjsonStream();
+      } else if (sourceFormat === 'parquet') {
+        await processParquetStream();
+      } else if (sourceFormat === 'arrow') {
+        await processArrowStream();
+      } else {
+        throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+      }
+    } catch (err) {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    } finally {
       indexer.finish();
-
-  });
-  indexer.queueEmitter.on('pause', () => {
-    if (finished) return;
-    s.pause();
-  });
-  indexer.queueEmitter.on('resume', () => {
-    if (finished) return;
-    s.resume();
-  });
+    }
   }
   return () => {
     startIndex();
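
Streams get the same four formats as files. Note that a Parquet stream is buffered fully in memory first (`readStreamToBuffer` above), since Parquet metadata lives in the file footer; Arrow, CSV, and NDJSON streams are processed incrementally. A hypothetical Arrow stream ingest (path and index name are placeholders):

```js
const fs = require('fs');
const transformer = require('node-es-transformer');

transformer({
  stream: fs.createReadStream('./data/events.arrow'), // placeholder source stream
  sourceFormat: 'arrow',
  targetIndexName: 'my-arrow-index'
});
```
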
@@ -721,11 +1092,16 @@ async function transformer({
   query,
   skipHeader = false,
   transform,
-  verbose = true
+  verbose = true,
+  logger: loggerInput
 }) {
   if (typeof targetIndexName === 'undefined') {
     throw Error('targetIndexName must be specified.');
   }
+  const logger = createLogger({
+    logger: loggerInput,
+    verbose
+  });
   const defaultClientConfig = {
     node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
   };
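
Callers can therefore hand in their own pino(-compatible) logger instead of the default one, and per-component child loggers inherit it. A hypothetical wiring (names and paths are placeholders):

```js
const pino = require('pino');
const transformer = require('node-es-transformer');

const appLogger = pino({ name: 'my-app', level: 'warn' });

transformer({
  fileName: './data/events.ndjson', // placeholder path
  targetIndexName: 'my-index',
  logger: appLogger.child({ component: 'es-transformer' })
});
```
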
@@ -742,7 +1118,9 @@
   mappings,
   inferMappings,
   inferMappingsOptions,
-
+  logger: createChildLogger(logger, {
+    component: 'mapping-inference'
+  })
 });
 const createMapping = createMappingFactory({
   sourceClient,
@@ -753,17 +1131,23 @@
   inferredIngestPipeline: inferenceResult.ingestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
-  verbose,
   deleteIndex,
-  pipeline
+  pipeline,
+  logger: createChildLogger(logger, {
+    component: 'create-mapping'
+  })
 });
 const indexer = indexQueueFactory({
   targetClient,
   targetIndexName,
-  bufferSize
+  bufferSize,
+  logger: createChildLogger(logger, {
+    component: 'index-queue'
+  })
+});
 function validateSourceFormat() {
-  if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
-    throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "
+  if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+    throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
   }
 }
 function getReader() {
@@ -775,18 +1159,27 @@
   }
   if (typeof fileName !== 'undefined') {
     validateSourceFormat();
-    return fileReaderFactory(indexer, fileName, transform, splitRegex,
+    return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+      component: 'file-reader'
+    }));
   }
   if (typeof sourceIndexName !== 'undefined') {
-    return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields
+    return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+      component: 'index-reader'
+    }));
   }
   if (typeof stream !== 'undefined') {
     validateSourceFormat();
-    return streamReaderFactory(indexer, stream, transform, splitRegex,
+    return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+      component: 'stream-reader'
+    }));
   }
   return null;
 }
 const reader = getReader();
+if (typeof reader !== 'function') {
+  throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+}
 try {
   const indexExists = await targetClient.indices.exists({
     index: targetIndexName
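
A call without any of the three sources now fails fast instead of falling into the generic catch later. Since `transformer()` is async, the guard surfaces as a rejected promise:

```js
const transformer = require('node-es-transformer');

transformer({ targetIndexName: 'my-index' }).catch(err => {
  // Error: One of fileName, sourceIndexName, or stream must be specified.
  console.error(err.message);
});
```
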
@@ -803,8 +1196,11 @@
   } else {
     reader();
   }
-} catch (
-
+} catch (err) {
+  logger.error({
+    err,
+    targetIndexName
+  }, 'Error checking index existence');
 } finally {
   // targetClient.close();
 }