node-es-transformer 1.1.0 → 1.2.0
This diff shows the changes between two publicly released versions of this package, as published to a supported public registry, and is provided for informational purposes only.
- package/README.md +74 -12
- package/dist/node-es-transformer.cjs.js +522 -126
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +503 -126
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +24 -2
- package/package.json +12 -7
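Taken together, the changes below add parquet and arrow as source formats (via @dsnp/parquetjs and apache-arrow) and route diagnostics through pino-style structured logging, with a `logger` option replacing most of the old `verbose` console output. A minimal sketch of the resulting call surface, assuming the package's default export is the `transformer()` function shown in this diff; the file path and index name are placeholders:

```js
import transformer from 'node-es-transformer';

await transformer({
  fileName: 'data/events.parquet', // glob patterns resolve via globSync
  sourceFormat: 'parquet',         // new in 1.2.0, alongside 'arrow', 'ndjson' and 'csv'
  targetIndexName: 'events',       // required; the call throws without it
  verbose: true,                   // kept for compatibility; maps to log level 'info'
  // logger: myPinoLogger,         // optional (hypothetical name); any logger object is used as-is
});
```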
```diff
@@ -1,5 +1,7 @@
 import elasticsearch9 from 'es9';
 import elasticsearch8 from 'es8';
+import parquet from '@dsnp/parquetjs';
+import * as arrow from 'apache-arrow';
 import fs from 'fs';
 import { parse } from 'csv-parse';
 import es from 'event-stream';
@@ -7,6 +9,7 @@ import { globSync } from 'glob';
 import split from 'split2';
 import { PassThrough } from 'stream';
 import cliProgress from 'cli-progress';
+import pino from 'pino';
 
 // In earlier versions this was used to set the number of docs to index in a
 // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -27,9 +30,9 @@ function createMappingFactory({
   inferredIngestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
-  verbose,
   deleteIndex,
-  pipeline
+  pipeline,
+  logger
 }) {
   return async () => {
     let targetMappings = mappingsOverride ? undefined : mappings;
@@ -48,7 +51,10 @@ function createMappingFactory({
         }
       }
     } catch (err) {
-
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Error reading source mapping');
       return;
     }
   }
@@ -80,9 +86,14 @@ function createMappingFactory({
         ...inferredIngestPipeline
       });
       defaultPipeline = inferredPipelineName;
-
+      logger.info({
+        inferredPipelineName
+      }, 'Created inferred ingest pipeline');
     } catch (err) {
-
+      logger.error({
+        err,
+        inferredPipelineName
+      }, 'Error creating inferred ingest pipeline');
     }
   }
   const settings = {
@@ -95,17 +106,23 @@ function createMappingFactory({
         'index.number_of_replicas': 0
       } : {})
     };
-    const
+    const response = await targetClient.indices.create({
       index: targetIndexName,
       mappings: targetMappings,
       ...(Object.keys(settings).length > 0 ? {
        settings
      } : {})
     });
-
+    logger.info({
+      targetIndexName,
+      response
+    }, 'Created target mapping');
     }
   } catch (err) {
-
+    logger.error({
+      err,
+      targetIndexName
+    }, 'Error creating target mapping');
   }
 }
};
@@ -126,8 +143,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
   return options;
 }
 
-function
-
+function createPauseWaiter$1(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed, file) {
     const context = {
       fileName: file
     };
```
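The new `createPauseWaiter$1()` helper converts the indexer's 'pause'/'resume' events into an awaitable, which is what lets the pull-based parquet and arrow readers further down apply backpressure without stream semantics. A small sketch of the contract, using a plain EventEmitter in place of `indexer.queueEmitter` and assuming the helper is available as defined in the hunk above:

```js
import { EventEmitter } from 'events';

const queueEmitter = new EventEmitter();
const { waitIfPaused, cleanup } = createPauseWaiter$1(queueEmitter);

queueEmitter.emit('pause');                        // later waitIfPaused() calls now block
setTimeout(() => queueEmitter.emit('resume'), 50); // 'resume' releases every queued waiter

await waitIfPaused(); // resolves as soon as the 'resume' event fires
cleanup();            // detaches both listeners and releases any pending waiters
```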
```diff
@@ -135,7 +180,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
@@ -150,9 +194,101 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function
+  async function processParquetFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    const reader = await parquet.ParquetReader.openFile(file);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row, file);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowFile(file) {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter$1(indexer.queueEmitter);
+    try {
+      const reader = await arrow.RecordBatchReader.from(fs.createReadStream(file));
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row, file);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info({
+        file
+      }, 'Read entire file');
+    } finally {
+      cleanup();
+    }
+  }
+  function processStreamFile(file, buildStream, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildStream();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info({
+          file
+        }, 'Read entire file');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err,
+          file
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processNdjsonFile(file) {
     let skippedHeader = false;
-
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
       try {
         // skip empty lines
         if (line === '') {
@@ -163,60 +299,86 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
           return;
         }
         const parsed = JSON.parse(line);
-        addParsedDoc(parsed, file
-      } catch (
-
+        addParsedDoc(parsed, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process NDJSON line');
       }
     }).on('error', err => {
-
-
-
+      logger.error({
+        err,
+        file
+      }, 'Error while reading file');
+    })), 'Error while reading file');
   }
-  function
+  function processCsvFile(file) {
     const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
-
+    return processStreamFile(file, () => fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record, file
-      } catch (
-
+        addParsedDoc(record, file);
+      } catch (err) {
+        logger.error({
+          err,
+          file
+        }, 'Failed to process CSV record');
       }
     }).on('error', err => {
-
-
-
+      logger.error({
+        err,
+        file
+      }, 'Error while reading CSV file');
+    })), 'Error while reading CSV file');
   }
-  function
-
+  async function processFile(file) {
+    if (sourceFormat === 'csv') {
+      await processCsvFile(file);
+      return;
+    }
+    if (sourceFormat === 'ndjson') {
+      await processNdjsonFile(file);
+      return;
+    }
+    if (sourceFormat === 'parquet') {
+      await processParquetFile(file);
+      return;
+    }
+    if (sourceFormat === 'arrow') {
+      await processArrowFile(file);
+      return;
+    }
+    throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+  }
+  async function startIndex(files) {
     if (files.length === 0) {
       indexer.finish();
       return;
     }
-
-
-
-
-    if (files.length > 0) {
-      startIndex(files);
-      return;
+    try {
+      for (const file of files) {
+        // eslint-disable-next-line no-await-in-loop
+        await processFile(file);
       }
+    } catch (err) {
+      logger.error({
+        err,
+        files
+      }, 'Error while processing files');
+    } finally {
       indexer.finish();
-
-    });
-    indexer.queueEmitter.on('pause', () => {
-      if (finished) return;
-      s.pause();
-    });
-    indexer.queueEmitter.on('resume', () => {
-      if (finished) return;
-      s.resume();
-    });
+    }
   }
   return () => {
     try {
       const files = globSync(fileName);
       startIndex(files);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err,
+        fileName
+      }, 'Error matching files');
+      indexer.finish();
     }
   };
 }
```
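With these hunks the file reader branches per `sourceFormat`: ndjson and csv keep their pausable stream pipelines (now wrapped by `processStreamFile`), while parquet files are read pull-style through a cursor. A standalone sketch of the @dsnp/parquetjs loop the parquet path builds on; the file path is a placeholder:

```js
import parquet from '@dsnp/parquetjs';

const reader = await parquet.ParquetReader.openFile('data/events.parquet');
try {
  const cursor = reader.getCursor();
  let row;
  // cursor.next() resolves to null once the row groups are exhausted
  while ((row = await cursor.next())) {
    console.log(row); // a plain object keyed by column name, ready for transform()
  }
} finally {
  await reader.close();
}
```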
```diff
@@ -228,7 +390,8 @@ const parallelCalls = 5;
 function indexQueueFactory({
   targetClient: client,
   targetIndexName,
-  bufferSize = DEFAULT_BUFFER_SIZE
+  bufferSize = DEFAULT_BUFFER_SIZE,
+  logger
 }) {
   const queueEmitter = new EventEmitter();
   let docsPerSecond = 0;
@@ -261,8 +424,9 @@ function indexQueueFactory({
       try {
         yield JSON.parse(line); // Parse and yield the JSON object
       } catch (err) {
-
-
+        logger.error({
+          err
+        }, 'Failed to parse JSON from NDJSON stream');
       }
     }
   }
@@ -272,7 +436,9 @@ function indexQueueFactory({
       try {
         yield JSON.parse(buffer);
       } catch (err) {
-
+        logger.error({
+          err
+        }, 'Failed to parse final JSON from NDJSON stream');
       }
     }
   } finally {
@@ -298,7 +464,7 @@ function indexQueueFactory({
       flushInterval: 1000,
       refreshOnCompletion: true,
       datasource: ndjsonStreamIterator(stream),
-      onDocument(
+      onDocument() {
        docsPerSecond++;
        return {
          index: {
@@ -307,9 +473,13 @@ function indexQueueFactory({
         };
       }
     });
-  } catch (
-
-
+  } catch (err) {
+    logger.error({
+      err,
+      targetIndexName
+    }, 'Error during bulk indexing');
+    queueEmitter.emit('error', err);
+    throw err;
   } finally {
     // Clean up interval
     clearInterval(interval);
```
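The index queue keeps driving the official client's `helpers.bulk()` with an async-iterator datasource; what changes here is the failure path, which now logs the error, emits 'error' on the queue emitter, and rethrows instead of swallowing it. A minimal standalone sketch of that bulk shape, using the options visible in the context lines; the index name and documents are placeholders:

```js
import { Client } from '@elastic/elasticsearch';

const client = new Client({ node: 'http://localhost:9200' });

// any iterable or async iterable of documents works as a datasource
async function* docs() {
  yield { message: 'hello' };
  yield { message: 'world' };
}

await client.helpers.bulk({
  datasource: docs(),
  flushInterval: 1000,       // flush buffered operations at least once per second
  refreshOnCompletion: true, // refresh the target index when the run finishes
  onDocument() {
    // called once per document; returning an index action routes it
    return { index: { _index: 'my-index' } };
  },
});
```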
```diff
@@ -359,7 +529,7 @@ function indexQueueFactory({
 
 // create a new progress bar instance and use shades_classic theme
 const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
-function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
   return async function indexReader() {
     let docsNum = 0;
     let scrollId;
@@ -378,8 +548,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
         maxRetries: 0
       });
       return Object.keys(response.fields);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err,
+        sourceIndexName
+      }, 'Failed to fetch populated fields');
     }
   }
   function search(fields) {
@@ -423,8 +596,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
         return;
       }
       indexer.add(doc);
-    } catch (
-
+    } catch (err) {
+      logger.error({
+        err
+      }, 'Failed to process source index document');
     }
   }
   async function fetchNextResponse() {
@@ -495,17 +670,25 @@ async function inferMappingsFromSource({
   mappings,
   inferMappings,
   inferMappingsOptions,
-
+  logger
 }) {
   if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
     return emptyInferenceResult(mappings);
   }
+  if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+    logger.info({
+      sourceFormat
+    }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+    return emptyInferenceResult(mappings);
+  }
   if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
     return emptyInferenceResult(mappings);
   }
   const files = globSync(fileName);
   if (files.length === 0) {
-
+    logger.info({
+      fileName
+    }, 'No files matched for mapping inference');
     return emptyInferenceResult(mappings);
   }
   const {
@@ -514,7 +697,7 @@ async function inferMappingsFromSource({
   } = inferMappingsOptions || {};
   const sampleText = readSample(files[0], sampleBytes);
   if (!sampleText || sampleText.trim() === '') {
-
+    logger.info('Skipping mapping inference because the sample text is empty');
     return emptyInferenceResult(mappings);
   }
   const params = {
@@ -541,31 +724,98 @@ async function inferMappingsFromSource({
   }
   try {
     const response = await targetClient.textStructure.findStructure(params);
-    if (response?.mappings
-
+    if (response?.mappings) {
+      logger.info({
+        file: files[0]
+      }, 'Inferred mappings via _text_structure/find_structure');
     }
-    if (response?.ingest_pipeline
-
+    if (response?.ingest_pipeline) {
+      logger.info('Inferred ingest pipeline via _text_structure/find_structure');
     }
     return {
       mappings: response?.mappings || mappings,
       ingestPipeline: response?.ingest_pipeline
     };
-  } catch (
-
-
-  }
+  } catch (err) {
+    logger.warn({
+      err
+    }, 'Could not infer mappings via _text_structure/find_structure');
     return emptyInferenceResult(mappings);
   }
 }
 
-
-
+const DEFAULT_LOG_LEVEL = 'info';
+function resolveLogLevel(verbose = true) {
+  if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+    return process.env.LOG_LEVEL;
+  }
+  return verbose ? DEFAULT_LOG_LEVEL : 'error';
+}
+function createLogger({
+  logger,
+  verbose = true
+} = {}) {
+  if (logger && typeof logger === 'object') {
+    return logger;
+  }
+  return pino({
+    name: 'node-es-transformer',
+    level: resolveLogLevel(verbose),
+    timestamp: pino.stdTimeFunctions.isoTime,
+    serializers: {
+      err: pino.stdSerializers.err,
+      error: pino.stdSerializers.err
+    }
+  });
+}
+function createChildLogger(logger, bindings) {
+  if (!logger || typeof logger.child !== 'function') {
+    return logger;
+  }
+  return logger.child(bindings);
+}
+
+function createPauseWaiter(queueEmitter) {
+  let paused = false;
+  let waiters = [];
+  const onPause = () => {
+    paused = true;
+  };
+  const onResume = () => {
+    paused = false;
+    waiters.forEach(resolve => resolve());
+    waiters = [];
+  };
+  queueEmitter.on('pause', onPause);
+  queueEmitter.on('resume', onResume);
+  return {
+    async waitIfPaused() {
+      if (!paused) return;
+      await new Promise(resolve => {
+        waiters.push(resolve);
+      });
+    },
+    cleanup() {
+      queueEmitter.removeListener('pause', onPause);
+      queueEmitter.removeListener('resume', onResume);
+      waiters.forEach(resolve => resolve());
+      waiters = [];
+    }
+  };
+}
+async function readStreamToBuffer(stream) {
+  const chunks = [];
+  for await (const chunk of stream) {
+    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+  }
+  return Buffer.concat(chunks);
+}
+function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+  function addParsedDoc(parsed) {
     const doc = typeof transform === 'function' ? transform(parsed) : parsed;
 
     // if doc is null/undefined we'll skip indexing it
     if (doc === null || typeof doc === 'undefined') {
-      streamRef.resume();
       return;
     }
 
```
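This hunk also lands the logging plumbing itself: `resolveLogLevel()` gives the LOG_LEVEL environment variable precedence over the legacy `verbose` flag (`true` means 'info', `false` means 'error'), `createLogger()` falls back to a pino instance with ISO timestamps and error serializers, and `createChildLogger()` adds per-component bindings when the supplied logger implements `.child()`. A sketch of both knobs, again assuming the default `transformer` export; index and file names are placeholders:

```js
import pino from 'pino';
import transformer from 'node-es-transformer';

// Built-in logger: LOG_LEVEL=debug (for example) overrides verbose entirely;
// otherwise verbose: false drops the level from 'info' to 'error'.
await transformer({ targetIndexName: 'events', fileName: 'data/*.ndjson', verbose: false });

// Injected logger: any object is accepted as-is; loggers with .child(), like
// pino, additionally get bindings such as { component: 'index-queue' }.
const logger = pino({ level: 'warn' });
await transformer({ targetIndexName: 'events', fileName: 'data/*.ndjson', logger });
```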
```diff
@@ -580,50 +830,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
     }
     indexer.add(doc);
   }
-  function
-
-
+  async function processParquetStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    const parquetBuffer = await readStreamToBuffer(stream);
+    const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+    try {
+      const cursor = reader.getCursor();
+      while (true) {
+        // eslint-disable-next-line no-await-in-loop
+        const row = await cursor.next();
+        if (row === null || typeof row === 'undefined') {
+          break;
+        }
+        addParsedDoc(row);
+        // eslint-disable-next-line no-await-in-loop
+        await waitIfPaused();
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+      await reader.close();
+    }
+  }
+  async function processArrowStream() {
+    const {
+      waitIfPaused,
+      cleanup
+    } = createPauseWaiter(indexer.queueEmitter);
+    try {
+      const reader = await arrow.RecordBatchReader.from(stream);
+      for await (const recordBatch of reader) {
+        const {
+          fields
+        } = recordBatch.schema;
+        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+          const row = {};
+          fields.forEach(field => {
+            const vector = recordBatch.getChild(field.name);
+            row[field.name] = vector ? vector.get(rowIndex) : undefined;
+          });
+          addParsedDoc(row);
+          // eslint-disable-next-line no-await-in-loop
+          await waitIfPaused();
+        }
+      }
+      logger.info('Read entire stream');
+    } finally {
+      cleanup();
+    }
+  }
+  function processPipeline(buildPipeline, errorMessage) {
+    return new Promise((resolve, reject) => {
+      let finished = false;
+      const s = buildPipeline();
+      const onPause = () => {
+        if (finished) return;
+        s.pause();
+      };
+      const onResume = () => {
+        if (finished) return;
+        s.resume();
+      };
+      function cleanup() {
+        indexer.queueEmitter.removeListener('pause', onPause);
+        indexer.queueEmitter.removeListener('resume', onResume);
+      }
+      indexer.queueEmitter.on('pause', onPause);
+      indexer.queueEmitter.on('resume', onResume);
+      s.on('end', () => {
+        finished = true;
+        cleanup();
+        logger.info('Read entire stream');
+        resolve();
+      });
+      s.on('error', err => {
+        finished = true;
+        cleanup();
+        logger.error({
+          err
+        }, errorMessage);
+        reject(err);
+      });
+    });
+  }
+  function processCsvStream() {
+    return processPipeline(() => stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
       try {
-        addParsedDoc(record
-      } catch (
-
+        addParsedDoc(record);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process CSV stream record');
       }
     }).on('error', err => {
-
-
-
-
-
-
-
-
-
-
-
-
-      }
-      const parsed = JSON.parse(line);
-      addParsedDoc(parsed, s);
-    } catch (e) {
-      console.log('error', e);
+      logger.error({
+        err
+      }, 'Error while reading CSV stream');
+    })), 'Error while reading CSV stream');
+  }
+  function processNdjsonStream() {
+    let skippedHeader = false;
+    return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+      try {
+        // skip empty lines
+        if (line === '') {
+          return;
         }
-
-
-
-
-
-
+        if (skipHeader && !skippedHeader) {
+          skippedHeader = true;
+          return;
+        }
+        const parsed = JSON.parse(line);
+        addParsedDoc(parsed);
+      } catch (err) {
+        logger.error({
+          err
+        }, 'Failed to process NDJSON stream line');
+      }
+    }).on('error', err => {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    })), 'Error while reading stream');
+  }
+  async function startIndex() {
+    try {
+      if (sourceFormat === 'csv') {
+        await processCsvStream();
+      } else if (sourceFormat === 'ndjson') {
+        await processNdjsonStream();
+      } else if (sourceFormat === 'parquet') {
+        await processParquetStream();
+      } else if (sourceFormat === 'arrow') {
+        await processArrowStream();
+      } else {
+        throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+      }
+    } catch (err) {
+      logger.error({
+        err
+      }, 'Error while reading stream');
+    } finally {
       indexer.finish();
-
-    });
-    indexer.queueEmitter.on('pause', () => {
-      if (finished) return;
-      s.pause();
-    });
-    indexer.queueEmitter.on('resume', () => {
-      if (finished) return;
-      s.resume();
-    });
+    }
   }
   return () => {
     startIndex();
```
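`streamReaderFactory` mirrors the file reader: csv and ndjson stay as pausable pipelines (`processPipeline`), parquet input is first buffered with `readStreamToBuffer()` and opened via `ParquetReader.openBuffer()` (parquet needs random access, so the whole stream lands in memory), and arrow record batches are iterated directly. A standalone sketch of the arrow iteration used in both readers; the file path is a placeholder:

```js
import * as arrow from 'apache-arrow';
import fs from 'fs';

// RecordBatchReader.from() accepts Node streams, buffers and (async) iterables
const reader = await arrow.RecordBatchReader.from(fs.createReadStream('data/events.arrow'));
for await (const batch of reader) {
  for (let rowIndex = 0; rowIndex < batch.numRows; rowIndex++) {
    const row = {};
    for (const field of batch.schema.fields) {
      const vector = batch.getChild(field.name); // column vector, or null if missing
      row[field.name] = vector ? vector.get(rowIndex) : undefined;
    }
    console.log(row); // one plain object per arrow row
  }
}
```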
```diff
@@ -719,11 +1071,16 @@ async function transformer({
   query,
   skipHeader = false,
   transform,
-  verbose = true
+  verbose = true,
+  logger: loggerInput
 }) {
   if (typeof targetIndexName === 'undefined') {
     throw Error('targetIndexName must be specified.');
   }
+  const logger = createLogger({
+    logger: loggerInput,
+    verbose
+  });
   const defaultClientConfig = {
     node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
   };
@@ -740,7 +1097,9 @@ async function transformer({
     mappings,
     inferMappings,
     inferMappingsOptions,
-
+    logger: createChildLogger(logger, {
+      component: 'mapping-inference'
+    })
   });
   const createMapping = createMappingFactory({
     sourceClient,
@@ -751,17 +1110,23 @@ async function transformer({
     inferredIngestPipeline: inferenceResult.ingestPipeline,
     mappingsOverride,
     indexMappingTotalFieldsLimit,
-    verbose,
     deleteIndex,
-    pipeline
+    pipeline,
+    logger: createChildLogger(logger, {
+      component: 'create-mapping'
+    })
   });
   const indexer = indexQueueFactory({
     targetClient,
     targetIndexName,
-    bufferSize
+    bufferSize,
+    logger: createChildLogger(logger, {
+      component: 'index-queue'
+    })
+  });
   function validateSourceFormat() {
-    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
-      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "
+    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
     }
   }
   function getReader() {
@@ -773,18 +1138,27 @@ async function transformer({
   }
   if (typeof fileName !== 'undefined') {
     validateSourceFormat();
-    return fileReaderFactory(indexer, fileName, transform, splitRegex,
+    return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+      component: 'file-reader'
+    }));
   }
   if (typeof sourceIndexName !== 'undefined') {
-    return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields
+    return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+      component: 'index-reader'
+    }));
   }
   if (typeof stream !== 'undefined') {
     validateSourceFormat();
-    return streamReaderFactory(indexer, stream, transform, splitRegex,
+    return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+      component: 'stream-reader'
+    }));
   }
   return null;
 }
 const reader = getReader();
+if (typeof reader !== 'function') {
+  throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+}
 try {
   const indexExists = await targetClient.indices.exists({
     index: targetIndexName
@@ -801,8 +1175,11 @@ async function transformer({
     } else {
       reader();
     }
-  } catch (
-
+  } catch (err) {
+    logger.error({
+      err,
+      targetIndexName
+    }, 'Error checking index existence');
   } finally {
     // targetClient.close();
   }
```
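Finally, `transformer()` now validates its reader up front: one of `fileName`, `sourceIndexName`, or `stream` must be provided, and a missing source fails with a descriptive error instead of failing later. A sketch of the stream entry point with the new format support; the stream is a placeholder, and the `csvOptions` pass-through to csv-parse is an assumption based on `getCsvParserOptions()` above:

```js
import fs from 'fs';
import transformer from 'node-es-transformer';

await transformer({
  stream: fs.createReadStream('data/events.csv'), // placeholder input stream
  sourceFormat: 'csv',                            // 'ndjson', 'parquet' and 'arrow' also work here
  csvOptions: { columns: true },                  // assumed to be forwarded to csv-parse
  targetIndexName: 'events',
});
```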