node-es-transformer 1.1.0 → 1.2.1
This diff compares the published contents of the two package versions as they appear in their public registry and is provided for informational purposes only.
- package/README.md +76 -12
- package/dist/node-es-transformer.cjs.js +567 -127
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +548 -127
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +24 -2
- package/package.json +12 -7
@@ -1,5 +1,9 @@
 import elasticsearch9 from 'es9';
 import elasticsearch8 from 'es8';
+import parquet from '@dsnp/parquetjs';
+import zlib from 'zlib';
+import { PARQUET_COMPRESSION_METHODS } from '@dsnp/parquetjs/dist/lib/compression.js';
+import * as arrow from 'apache-arrow';
 import fs from 'fs';
 import { parse } from 'csv-parse';
 import es from 'event-stream';
@@ -7,6 +11,7 @@ import { globSync } from 'glob';
 import split from 'split2';
 import { PassThrough } from 'stream';
 import cliProgress from 'cli-progress';
+import pino from 'pino';
 
 // In earlier versions this was used to set the number of docs to index in a
 // single bulk request. Since we switched to use the helpers.bulk() method from
@@ -27,9 +32,9 @@ function createMappingFactory({
 inferredIngestPipeline,
 mappingsOverride,
 indexMappingTotalFieldsLimit,
-verbose,
 deleteIndex,
-pipeline
+pipeline,
+logger
 }) {
 return async () => {
 let targetMappings = mappingsOverride ? undefined : mappings;
@@ -48,7 +53,10 @@ function createMappingFactory({
 }
 }
 } catch (err) {
-
+logger.error({
+err,
+sourceIndexName
+}, 'Error reading source mapping');
 return;
 }
 }
@@ -80,9 +88,14 @@ function createMappingFactory({
 ...inferredIngestPipeline
 });
 defaultPipeline = inferredPipelineName;
-
+logger.info({
+inferredPipelineName
+}, 'Created inferred ingest pipeline');
 } catch (err) {
-
+logger.error({
+err,
+inferredPipelineName
+}, 'Error creating inferred ingest pipeline');
 }
 }
 const settings = {
@@ -95,22 +108,54 @@ function createMappingFactory({
 'index.number_of_replicas': 0
 } : {})
 };
-const
+const response = await targetClient.indices.create({
 index: targetIndexName,
 mappings: targetMappings,
 ...(Object.keys(settings).length > 0 ? {
 settings
 } : {})
 });
-
+logger.info({
+targetIndexName,
+response
+}, 'Created target mapping');
 }
 } catch (err) {
-
+logger.error({
+err,
+targetIndexName
+}, 'Error creating target mapping');
 }
 }
 };
 }
 
+function registerZstdCompression() {
+if (PARQUET_COMPRESSION_METHODS.ZSTD) {
+return;
+}
+if (typeof zlib.zstdCompressSync !== 'function' || typeof zlib.zstdDecompressSync !== 'function') {
+PARQUET_COMPRESSION_METHODS.ZSTD = {
+deflate() {
+throw new Error('ZSTD compression requires Node.js with zstd support.');
+},
+inflate() {
+throw new Error('ZSTD compression requires Node.js with zstd support.');
+}
+};
+return;
+}
+PARQUET_COMPRESSION_METHODS.ZSTD = {
+deflate(value) {
+return zlib.zstdCompressSync(value);
+},
+inflate(value) {
+return zlib.zstdDecompressSync(value);
+}
+};
+}
+registerZstdCompression();
+
 function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
 const options = {
 bom: true,
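The registerZstdCompression helper above fills in the ZSTD entry of @dsnp/parquetjs's compression table with Node's built-in zstd bindings and falls back to a throwing stub when the running Node build lacks them. A minimal sketch of the same capability check, assuming a recent Node release (zlib.zstdCompressSync/zstdDecompressSync are not present in older Node versions):

```js
import zlib from 'zlib';

// Report whether this Node build can round-trip zstd, which is what reading
// ZSTD-compressed Parquet column chunks needs.
const hasZstd =
  typeof zlib.zstdCompressSync === 'function' &&
  typeof zlib.zstdDecompressSync === 'function';

if (hasZstd) {
  const roundTrip = zlib.zstdDecompressSync(zlib.zstdCompressSync(Buffer.from('hello')));
  console.log('zstd available:', roundTrip.toString()); // "hello"
} else {
  console.log('zstd not available; ZSTD-compressed Parquet files will fail to read');
}
```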
@@ -126,8 +171,36 @@ function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
 return options;
 }
 
-function
-
+function createPauseWaiter$1(queueEmitter) {
+let paused = false;
+let waiters = [];
+const onPause = () => {
+paused = true;
+};
+const onResume = () => {
+paused = false;
+waiters.forEach(resolve => resolve());
+waiters = [];
+};
+queueEmitter.on('pause', onPause);
+queueEmitter.on('resume', onResume);
+return {
+async waitIfPaused() {
+if (!paused) return;
+await new Promise(resolve => {
+waiters.push(resolve);
+});
+},
+cleanup() {
+queueEmitter.removeListener('pause', onPause);
+queueEmitter.removeListener('resume', onResume);
+waiters.forEach(resolve => resolve());
+waiters = [];
+}
+};
+}
+function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+function addParsedDoc(parsed, file) {
 const context = {
 fileName: file
 };
@@ -135,7 +208,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 
 // if doc is null/undefined we'll skip indexing it
 if (doc === null || typeof doc === 'undefined') {
-streamRef.resume();
 return;
 }
 
@@ -150,9 +222,101 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 }
 indexer.add(doc);
 }
-function
+async function processParquetFile(file) {
+const {
+waitIfPaused,
+cleanup
+} = createPauseWaiter$1(indexer.queueEmitter);
+const reader = await parquet.ParquetReader.openFile(file);
+try {
+const cursor = reader.getCursor();
+while (true) {
+// eslint-disable-next-line no-await-in-loop
+const row = await cursor.next();
+if (row === null || typeof row === 'undefined') {
+break;
+}
+addParsedDoc(row, file);
+// eslint-disable-next-line no-await-in-loop
+await waitIfPaused();
+}
+logger.info({
+file
+}, 'Read entire file');
+} finally {
+cleanup();
+await reader.close();
+}
+}
+async function processArrowFile(file) {
+const {
+waitIfPaused,
+cleanup
+} = createPauseWaiter$1(indexer.queueEmitter);
+try {
+const reader = await arrow.RecordBatchReader.from(fs.createReadStream(file));
+for await (const recordBatch of reader) {
+const {
+fields
+} = recordBatch.schema;
+for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+const row = {};
+fields.forEach(field => {
+const vector = recordBatch.getChild(field.name);
+row[field.name] = vector ? vector.get(rowIndex) : undefined;
+});
+addParsedDoc(row, file);
+// eslint-disable-next-line no-await-in-loop
+await waitIfPaused();
+}
+}
+logger.info({
+file
+}, 'Read entire file');
+} finally {
+cleanup();
+}
+}
+function processStreamFile(file, buildStream, errorMessage) {
+return new Promise((resolve, reject) => {
+let finished = false;
+const s = buildStream();
+const onPause = () => {
+if (finished) return;
+s.pause();
+};
+const onResume = () => {
+if (finished) return;
+s.resume();
+};
+function cleanup() {
+indexer.queueEmitter.removeListener('pause', onPause);
+indexer.queueEmitter.removeListener('resume', onResume);
+}
+indexer.queueEmitter.on('pause', onPause);
+indexer.queueEmitter.on('resume', onResume);
+s.on('end', () => {
+finished = true;
+cleanup();
+logger.info({
+file
+}, 'Read entire file');
+resolve();
+});
+s.on('error', err => {
+finished = true;
+cleanup();
+logger.error({
+err,
+file
+}, errorMessage);
+reject(err);
+});
+});
+}
+function processNdjsonFile(file) {
 let skippedHeader = false;
-
+return processStreamFile(file, () => fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
 try {
 // skip empty lines
 if (line === '') {
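createPauseWaiter$1 turns the indexer's 'pause'/'resume' events into something the pull-based Parquet and Arrow readers can await between rows, giving them the same backpressure the streaming readers get from s.pause()/s.resume(). A stripped-down sketch of that pattern against a plain EventEmitter (illustrative only, not the package's API):

```js
import { EventEmitter } from 'events';

// Same idea as createPauseWaiter$1: turn pause/resume events into a promise
// a producer loop can await between rows.
function createPauseWaiter(emitter) {
  let paused = false;
  let waiters = [];
  emitter.on('pause', () => {
    paused = true;
  });
  emitter.on('resume', () => {
    paused = false;
    waiters.forEach(resolve => resolve());
    waiters = [];
  });
  return async function waitIfPaused() {
    if (!paused) return;
    await new Promise(resolve => waiters.push(resolve));
  };
}

const queue = new EventEmitter();
const waitIfPaused = createPauseWaiter(queue);

(async () => {
  for (let row = 0; row < 3; row++) {
    console.log('producing row', row);
    await waitIfPaused(); // parks here while the queue is paused
  }
})();

queue.emit('pause');
setTimeout(() => queue.emit('resume'), 100);
```

Collecting resolvers in an array means any number of readers can wait on the same emitter and all of them wake up on a single 'resume'.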
@@ -163,72 +327,115 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, sk
 return;
 }
 const parsed = JSON.parse(line);
-addParsedDoc(parsed, file
-} catch (
-
+addParsedDoc(parsed, file);
+} catch (err) {
+logger.error({
+err,
+file
+}, 'Failed to process NDJSON line');
 }
 }).on('error', err => {
-
-
-
+logger.error({
+err,
+file
+}, 'Error while reading file');
+})), 'Error while reading file');
 }
-function
+function processCsvFile(file) {
 const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
-
+return processStreamFile(file, () => fs.createReadStream(file).pipe(parse(parserOptions)).pipe(es.mapSync(record => {
 try {
-addParsedDoc(record, file
-} catch (
-
+addParsedDoc(record, file);
+} catch (err) {
+logger.error({
+err,
+file
+}, 'Failed to process CSV record');
 }
 }).on('error', err => {
-
-
-
+logger.error({
+err,
+file
+}, 'Error while reading CSV file');
+})), 'Error while reading CSV file');
 }
-function
-
+async function processFile(file) {
+if (sourceFormat === 'csv') {
+await processCsvFile(file);
+return;
+}
+if (sourceFormat === 'ndjson') {
+await processNdjsonFile(file);
+return;
+}
+if (sourceFormat === 'parquet') {
+await processParquetFile(file);
+return;
+}
+if (sourceFormat === 'arrow') {
+await processArrowFile(file);
+return;
+}
+throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+}
+async function startIndex(files) {
 if (files.length === 0) {
 indexer.finish();
 return;
 }
-
-
-
-
-if (files.length > 0) {
-startIndex(files);
-return;
+try {
+for (const file of files) {
+// eslint-disable-next-line no-await-in-loop
+await processFile(file);
 }
+} catch (err) {
+logger.error({
+err,
+files
+}, 'Error while processing files');
+} finally {
 indexer.finish();
-
-});
-indexer.queueEmitter.on('pause', () => {
-if (finished) return;
-s.pause();
-});
-indexer.queueEmitter.on('resume', () => {
-if (finished) return;
-s.resume();
-});
+}
 }
 return () => {
 try {
 const files = globSync(fileName);
 startIndex(files);
-} catch (
-
+} catch (err) {
+logger.error({
+err,
+fileName
+}, 'Error matching files');
+indexer.finish();
 }
 };
 }
 
 const EventEmitter = require('events');
 const parallelCalls = 5;
+const MAX_SAFE_BIGINT = BigInt(Number.MAX_SAFE_INTEGER);
+const MIN_SAFE_BIGINT = BigInt(Number.MIN_SAFE_INTEGER);
+function coerceBigInt(value) {
+if (value >= MIN_SAFE_BIGINT && value <= MAX_SAFE_BIGINT) {
+return Number(value);
+}
+return value.toString();
+}
+function safeStringify(doc) {
+return JSON.stringify(doc, (_key, value) => {
+if (typeof value === 'bigint') {
+return coerceBigInt(value);
+}
+return value;
+});
+}
 
 // a simple helper queue to bulk index documents
 function indexQueueFactory({
 targetClient: client,
 targetIndexName,
-bufferSize = DEFAULT_BUFFER_SIZE
+bufferSize = DEFAULT_BUFFER_SIZE,
+logger
 }) {
 const queueEmitter = new EventEmitter();
 let docsPerSecond = 0;
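safeStringify exists because plain JSON.stringify throws on BigInt, and the Parquet and Arrow readers surface 64-bit integers as BigInt values; within Number's safe integer range they are downgraded to numbers, anything larger is emitted as a string. The replacer's behaviour shown standalone (a sketch mirroring the helper above, not an import from the package):

```js
// Mirrors coerceBigInt/safeStringify above: BigInts within Number's safe range
// become numbers, larger ones become strings, everything else passes through.
const MAX_SAFE_BIGINT = BigInt(Number.MAX_SAFE_INTEGER);
const MIN_SAFE_BIGINT = BigInt(Number.MIN_SAFE_INTEGER);

function safeStringify(doc) {
  return JSON.stringify(doc, (_key, value) => {
    if (typeof value === 'bigint') {
      return value >= MIN_SAFE_BIGINT && value <= MAX_SAFE_BIGINT
        ? Number(value)
        : value.toString();
    }
    return value;
  });
}

console.log(safeStringify({ id: 42n, total: 9007199254740993n }));
// {"id":42,"total":"9007199254740993"}
```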
@@ -261,8 +468,9 @@ function indexQueueFactory({
 try {
 yield JSON.parse(line); // Parse and yield the JSON object
 } catch (err) {
-
-
+logger.error({
+err
+}, 'Failed to parse JSON from NDJSON stream');
 }
 }
 }
@@ -272,7 +480,9 @@ function indexQueueFactory({
 try {
 yield JSON.parse(buffer);
 } catch (err) {
-
+logger.error({
+err
+}, 'Failed to parse final JSON from NDJSON stream');
 }
 }
 } finally {
@@ -298,7 +508,7 @@ function indexQueueFactory({
 flushInterval: 1000,
 refreshOnCompletion: true,
 datasource: ndjsonStreamIterator(stream),
-onDocument(
+onDocument() {
 docsPerSecond++;
 return {
 index: {
@@ -307,9 +517,13 @@ function indexQueueFactory({
 };
 }
 });
-} catch (
-
-
+} catch (err) {
+logger.error({
+err,
+targetIndexName
+}, 'Error during bulk indexing');
+queueEmitter.emit('error', err);
+throw err;
 } finally {
 // Clean up interval
 clearInterval(interval);
@@ -338,7 +552,7 @@ function indexQueueFactory({
 if (finished) {
 throw new Error('Unexpected doc added after indexer should finish.');
 }
-const canContinue = stream.write(`${
+const canContinue = stream.write(`${safeStringify(doc)}\n`);
 if (!canContinue) {
 queueEmitter.emit('pause');
 
@@ -359,7 +573,7 @@ function indexQueueFactory({
 
 // create a new progress bar instance and use shades_classic theme
 const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
-function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
+function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
 return async function indexReader() {
 let docsNum = 0;
 let scrollId;
@@ -378,8 +592,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
 maxRetries: 0
 });
 return Object.keys(response.fields);
-} catch (
-
+} catch (err) {
+logger.error({
+err,
+sourceIndexName
+}, 'Failed to fetch populated fields');
 }
 }
 function search(fields) {
@@ -423,8 +640,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
 return;
 }
 indexer.add(doc);
-} catch (
-
+} catch (err) {
+logger.error({
+err
+}, 'Failed to process source index document');
 }
 }
 async function fetchNextResponse() {
@@ -495,17 +714,25 @@ async function inferMappingsFromSource({
 mappings,
 inferMappings,
 inferMappingsOptions,
-
+logger
 }) {
 if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
 return emptyInferenceResult(mappings);
 }
+if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+logger.info({
+sourceFormat
+}, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
+return emptyInferenceResult(mappings);
+}
 if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
 return emptyInferenceResult(mappings);
 }
 const files = globSync(fileName);
 if (files.length === 0) {
-
+logger.info({
+fileName
+}, 'No files matched for mapping inference');
 return emptyInferenceResult(mappings);
 }
 const {
@@ -514,7 +741,7 @@ async function inferMappingsFromSource({
 } = inferMappingsOptions || {};
 const sampleText = readSample(files[0], sampleBytes);
 if (!sampleText || sampleText.trim() === '') {
-
+logger.info('Skipping mapping inference because the sample text is empty');
 return emptyInferenceResult(mappings);
 }
 const params = {
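Because the new guard above limits find_structure-based mapping inference to ndjson and csv, Parquet and Arrow sources either rely on Elasticsearch's dynamic mapping or on an explicit mappings option. A sketch of the latter, assuming the package's default export is the transformer function from this file and that the option names match the destructured parameters visible in this diff:

```js
import transformer from 'node-es-transformer';

// Explicit mappings, since inference is skipped for parquet/arrow sources.
await transformer({
  fileName: 'events-*.parquet',
  sourceFormat: 'parquet',
  targetIndexName: 'events',
  mappings: {
    properties: {
      '@timestamp': { type: 'date' },
      user_id: { type: 'keyword' },
      bytes: { type: 'long' }
    }
  }
});
```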
@@ -541,31 +768,98 @@ async function inferMappingsFromSource({
 }
 try {
 const response = await targetClient.textStructure.findStructure(params);
-if (response?.mappings
-
+if (response?.mappings) {
+logger.info({
+file: files[0]
+}, 'Inferred mappings via _text_structure/find_structure');
 }
-if (response?.ingest_pipeline
-
+if (response?.ingest_pipeline) {
+logger.info('Inferred ingest pipeline via _text_structure/find_structure');
 }
 return {
 mappings: response?.mappings || mappings,
 ingestPipeline: response?.ingest_pipeline
 };
-} catch (
-
-
-}
+} catch (err) {
+logger.warn({
+err
+}, 'Could not infer mappings via _text_structure/find_structure');
 return emptyInferenceResult(mappings);
 }
 }
 
-
-
+const DEFAULT_LOG_LEVEL = 'info';
+function resolveLogLevel(verbose = true) {
+if (typeof process.env.LOG_LEVEL === 'string' && process.env.LOG_LEVEL.trim() !== '') {
+return process.env.LOG_LEVEL;
+}
+return verbose ? DEFAULT_LOG_LEVEL : 'error';
+}
+function createLogger({
+logger,
+verbose = true
+} = {}) {
+if (logger && typeof logger === 'object') {
+return logger;
+}
+return pino({
+name: 'node-es-transformer',
+level: resolveLogLevel(verbose),
+timestamp: pino.stdTimeFunctions.isoTime,
+serializers: {
+err: pino.stdSerializers.err,
+error: pino.stdSerializers.err
+}
+});
+}
+function createChildLogger(logger, bindings) {
+if (!logger || typeof logger.child !== 'function') {
+return logger;
+}
+return logger.child(bindings);
+}
+
+function createPauseWaiter(queueEmitter) {
+let paused = false;
+let waiters = [];
+const onPause = () => {
+paused = true;
+};
+const onResume = () => {
+paused = false;
+waiters.forEach(resolve => resolve());
+waiters = [];
+};
+queueEmitter.on('pause', onPause);
+queueEmitter.on('resume', onResume);
+return {
+async waitIfPaused() {
+if (!paused) return;
+await new Promise(resolve => {
+waiters.push(resolve);
+});
+},
+cleanup() {
+queueEmitter.removeListener('pause', onPause);
+queueEmitter.removeListener('resume', onResume);
+waiters.forEach(resolve => resolve());
+waiters = [];
+}
+};
+}
+async function readStreamToBuffer(stream) {
+const chunks = [];
+for await (const chunk of stream) {
+chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
+}
+return Buffer.concat(chunks);
+}
+function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
+function addParsedDoc(parsed) {
 const doc = typeof transform === 'function' ? transform(parsed) : parsed;
 
 // if doc is null/undefined we'll skip indexing it
 if (doc === null || typeof doc === 'undefined') {
-streamRef.resume();
 return;
 }
 
@@ -580,50 +874,152 @@ function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, sk
 }
 indexer.add(doc);
 }
-function
-
-
+async function processParquetStream() {
+const {
+waitIfPaused,
+cleanup
+} = createPauseWaiter(indexer.queueEmitter);
+const parquetBuffer = await readStreamToBuffer(stream);
+const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
+try {
+const cursor = reader.getCursor();
+while (true) {
+// eslint-disable-next-line no-await-in-loop
+const row = await cursor.next();
+if (row === null || typeof row === 'undefined') {
+break;
+}
+addParsedDoc(row);
+// eslint-disable-next-line no-await-in-loop
+await waitIfPaused();
+}
+logger.info('Read entire stream');
+} finally {
+cleanup();
+await reader.close();
+}
+}
+async function processArrowStream() {
+const {
+waitIfPaused,
+cleanup
+} = createPauseWaiter(indexer.queueEmitter);
+try {
+const reader = await arrow.RecordBatchReader.from(stream);
+for await (const recordBatch of reader) {
+const {
+fields
+} = recordBatch.schema;
+for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
+const row = {};
+fields.forEach(field => {
+const vector = recordBatch.getChild(field.name);
+row[field.name] = vector ? vector.get(rowIndex) : undefined;
+});
+addParsedDoc(row);
+// eslint-disable-next-line no-await-in-loop
+await waitIfPaused();
+}
+}
+logger.info('Read entire stream');
+} finally {
+cleanup();
+}
+}
+function processPipeline(buildPipeline, errorMessage) {
+return new Promise((resolve, reject) => {
+let finished = false;
+const s = buildPipeline();
+const onPause = () => {
+if (finished) return;
+s.pause();
+};
+const onResume = () => {
+if (finished) return;
+s.resume();
+};
+function cleanup() {
+indexer.queueEmitter.removeListener('pause', onPause);
+indexer.queueEmitter.removeListener('resume', onResume);
+}
+indexer.queueEmitter.on('pause', onPause);
+indexer.queueEmitter.on('resume', onResume);
+s.on('end', () => {
+finished = true;
+cleanup();
+logger.info('Read entire stream');
+resolve();
+});
+s.on('error', err => {
+finished = true;
+cleanup();
+logger.error({
+err
+}, errorMessage);
+reject(err);
+});
+});
+}
+function processCsvStream() {
+return processPipeline(() => stream.pipe(parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
 try {
-addParsedDoc(record
-} catch (
-
+addParsedDoc(record);
+} catch (err) {
+logger.error({
+err
+}, 'Failed to process CSV stream record');
 }
 }).on('error', err => {
-
-
-
-
-
-
-
-
-
-
-
-
-}
-const parsed = JSON.parse(line);
-addParsedDoc(parsed, s);
-} catch (e) {
-console.log('error', e);
+logger.error({
+err
+}, 'Error while reading CSV stream');
+})), 'Error while reading CSV stream');
+}
+function processNdjsonStream() {
+let skippedHeader = false;
+return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+try {
+// skip empty lines
+if (line === '') {
+return;
 }
-
-
-
-
-
-
+if (skipHeader && !skippedHeader) {
+skippedHeader = true;
+return;
+}
+const parsed = JSON.parse(line);
+addParsedDoc(parsed);
+} catch (err) {
+logger.error({
+err
+}, 'Failed to process NDJSON stream line');
+}
+}).on('error', err => {
+logger.error({
+err
+}, 'Error while reading stream');
+})), 'Error while reading stream');
+}
+async function startIndex() {
+try {
+if (sourceFormat === 'csv') {
+await processCsvStream();
+} else if (sourceFormat === 'ndjson') {
+await processNdjsonStream();
+} else if (sourceFormat === 'parquet') {
+await processParquetStream();
+} else if (sourceFormat === 'arrow') {
+await processArrowStream();
+} else {
+throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
+}
+} catch (err) {
+logger.error({
+err
+}, 'Error while reading stream');
+} finally {
 indexer.finish();
-
-});
-indexer.queueEmitter.on('pause', () => {
-if (finished) return;
-s.pause();
-});
-indexer.queueEmitter.on('resume', () => {
-if (finished) return;
-s.resume();
-});
+}
 }
 return () => {
 startIndex();
@@ -719,11 +1115,16 @@ async function transformer({
 query,
 skipHeader = false,
 transform,
-verbose = true
+verbose = true,
+logger: loggerInput
 }) {
 if (typeof targetIndexName === 'undefined') {
 throw Error('targetIndexName must be specified.');
 }
+const logger = createLogger({
+logger: loggerInput,
+verbose
+});
 const defaultClientConfig = {
 node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
 };
@@ -740,7 +1141,9 @@ async function transformer({
 mappings,
 inferMappings,
 inferMappingsOptions,
-
+logger: createChildLogger(logger, {
+component: 'mapping-inference'
+})
 });
 const createMapping = createMappingFactory({
 sourceClient,
@@ -751,17 +1154,23 @@ async function transformer({
 inferredIngestPipeline: inferenceResult.ingestPipeline,
 mappingsOverride,
 indexMappingTotalFieldsLimit,
-verbose,
 deleteIndex,
-pipeline
+pipeline,
+logger: createChildLogger(logger, {
+component: 'create-mapping'
+})
 });
 const indexer = indexQueueFactory({
 targetClient,
 targetIndexName,
-bufferSize
+bufferSize,
+logger: createChildLogger(logger, {
+component: 'index-queue'
+})
+});
 function validateSourceFormat() {
-if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
-throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "
+if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
+throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
 }
 }
 function getReader() {
@@ -773,18 +1182,27 @@ async function transformer({
 }
 if (typeof fileName !== 'undefined') {
 validateSourceFormat();
-return fileReaderFactory(indexer, fileName, transform, splitRegex,
+return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+component: 'file-reader'
+}));
 }
 if (typeof sourceIndexName !== 'undefined') {
-return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields
+return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
+component: 'index-reader'
+}));
 }
 if (typeof stream !== 'undefined') {
 validateSourceFormat();
-return streamReaderFactory(indexer, stream, transform, splitRegex,
+return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
+component: 'stream-reader'
+}));
 }
 return null;
 }
 const reader = getReader();
+if (typeof reader !== 'function') {
+throw Error('One of fileName, sourceIndexName, or stream must be specified.');
+}
 try {
 const indexExists = await targetClient.indices.exists({
 index: targetIndexName
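The transformer() options replace the old verbose-only switch with an injectable logger, and child loggers tagged with a component binding are handed to the mapping-inference, create-mapping, index-queue, and reader stages. A usage sketch, assuming the package's default export is the transformer function above and that any pino-compatible object (with child/info/error methods) is accepted for the logger option:

```js
import pino from 'pino';
import transformer from 'node-es-transformer';

// Bring your own pino instance; without one the library creates a logger named
// 'node-es-transformer' whose level follows LOG_LEVEL or the verbose flag.
const logger = pino({ level: process.env.LOG_LEVEL || 'info' });

await transformer({
  fileName: 'logs.ndjson',
  sourceFormat: 'ndjson',
  targetIndexName: 'logs',
  verbose: true,
  logger
});
```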
@@ -801,8 +1219,11 @@ async function transformer({
 } else {
 reader();
 }
-} catch (
-
+} catch (err) {
+logger.error({
+err,
+targetIndexName
+}, 'Error checking index existence');
 } finally {
 // targetClient.close();
 }