node-es-transformer 1.0.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -13
- package/dist/node-es-transformer.cjs.js +722 -128
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +704 -129
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +81 -2
- package/package.json +12 -6
|
@@ -2,12 +2,35 @@
|
|
|
2
2
|
|
|
3
3
|
var elasticsearch9 = require('es9');
|
|
4
4
|
var elasticsearch8 = require('es8');
|
|
5
|
+
var parquet = require('@dsnp/parquetjs');
|
|
6
|
+
var arrow = require('apache-arrow');
|
|
5
7
|
var fs = require('fs');
|
|
8
|
+
var csvParse = require('csv-parse');
|
|
6
9
|
var es = require('event-stream');
|
|
7
10
|
var glob = require('glob');
|
|
8
11
|
var split = require('split2');
|
|
9
12
|
var stream = require('stream');
|
|
10
13
|
var cliProgress = require('cli-progress');
|
|
14
|
+
var pino = require('pino');
|
|
15
|
+
|
|
16
|
+
/**
 * Wraps a CommonJS module object in a frozen, prototype-less namespace object.
 *
 * Every own enumerable key of `e` except `default` is exposed as a live
 * binding: an existing getter descriptor is reused, otherwise a getter that
 * reads through to `e` is installed. The module itself is exposed as `default`.
 *
 * @param {*} e - The module object to wrap (may be falsy).
 * @returns {object} Frozen namespace object whose `default` is `e`.
 */
function _interopNamespaceDefault(e) {
  const namespace = Object.create(null);
  const exportNames = e ? Object.keys(e) : [];
  for (const key of exportNames) {
    if (key === 'default') {
      continue;
    }
    const descriptor = Object.getOwnPropertyDescriptor(e, key);
    // Reuse accessor descriptors as-is; otherwise read through to the module
    // on every access so later reassignments remain visible.
    const binding = descriptor.get ? descriptor : {
      enumerable: true,
      get: () => e[key]
    };
    Object.defineProperty(namespace, key, binding);
  }
  namespace.default = e;
  return Object.freeze(namespace);
}
|
|
32
|
+
|
|
33
|
+
var arrow__namespace = /*#__PURE__*/_interopNamespaceDefault(arrow);
|
|
11
34
|
|
|
12
35
|
// In earlier versions this was used to set the number of docs to index in a
|
|
13
36
|
// single bulk request. Since we switched to use the helpers.bulk() method from
|
|
@@ -25,14 +48,16 @@ function createMappingFactory({
|
|
|
25
48
|
targetClient,
|
|
26
49
|
targetIndexName,
|
|
27
50
|
mappings,
|
|
51
|
+
inferredIngestPipeline,
|
|
28
52
|
mappingsOverride,
|
|
29
53
|
indexMappingTotalFieldsLimit,
|
|
30
|
-
verbose,
|
|
31
54
|
deleteIndex,
|
|
32
|
-
pipeline
|
|
55
|
+
pipeline,
|
|
56
|
+
logger
|
|
33
57
|
}) {
|
|
34
58
|
return async () => {
|
|
35
59
|
let targetMappings = mappingsOverride ? undefined : mappings;
|
|
60
|
+
let defaultPipeline = pipeline;
|
|
36
61
|
if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
|
|
37
62
|
try {
|
|
38
63
|
const mapping = await sourceClient.indices.getMapping({
|
|
@@ -47,7 +72,10 @@ function createMappingFactory({
|
|
|
47
72
|
}
|
|
48
73
|
}
|
|
49
74
|
} catch (err) {
|
|
50
|
-
|
|
75
|
+
logger.error({
|
|
76
|
+
err,
|
|
77
|
+
sourceIndexName
|
|
78
|
+
}, 'Error reading source mapping');
|
|
51
79
|
return;
|
|
52
80
|
}
|
|
53
81
|
}
|
|
@@ -71,93 +99,312 @@ function createMappingFactory({
|
|
|
71
99
|
});
|
|
72
100
|
}
|
|
73
101
|
if (indexExists === false || deleteIndex === true) {
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
102
|
+
if (typeof defaultPipeline === 'undefined' && typeof inferredIngestPipeline === 'object' && inferredIngestPipeline !== null && typeof targetClient?.ingest?.putPipeline === 'function') {
|
|
103
|
+
const inferredPipelineName = `${targetIndexName}-inferred-pipeline`;
|
|
104
|
+
try {
|
|
105
|
+
await targetClient.ingest.putPipeline({
|
|
106
|
+
id: inferredPipelineName,
|
|
107
|
+
...inferredIngestPipeline
|
|
108
|
+
});
|
|
109
|
+
defaultPipeline = inferredPipelineName;
|
|
110
|
+
logger.info({
|
|
111
|
+
inferredPipelineName
|
|
112
|
+
}, 'Created inferred ingest pipeline');
|
|
113
|
+
} catch (err) {
|
|
114
|
+
logger.error({
|
|
115
|
+
err,
|
|
116
|
+
inferredPipelineName
|
|
117
|
+
}, 'Error creating inferred ingest pipeline');
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
const settings = {
|
|
121
|
+
...(defaultPipeline !== undefined ? {
|
|
122
|
+
'index.default_pipeline': defaultPipeline
|
|
83
123
|
} : {}),
|
|
84
124
|
...(indexMappingTotalFieldsLimit !== undefined ? {
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
125
|
+
'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
|
|
126
|
+
'index.number_of_shards': 1,
|
|
127
|
+
'index.number_of_replicas': 0
|
|
128
|
+
} : {})
|
|
129
|
+
};
|
|
130
|
+
const response = await targetClient.indices.create({
|
|
131
|
+
index: targetIndexName,
|
|
132
|
+
mappings: targetMappings,
|
|
133
|
+
...(Object.keys(settings).length > 0 ? {
|
|
134
|
+
settings
|
|
90
135
|
} : {})
|
|
91
136
|
});
|
|
92
|
-
|
|
137
|
+
logger.info({
|
|
138
|
+
targetIndexName,
|
|
139
|
+
response
|
|
140
|
+
}, 'Created target mapping');
|
|
93
141
|
}
|
|
94
142
|
} catch (err) {
|
|
95
|
-
|
|
143
|
+
logger.error({
|
|
144
|
+
err,
|
|
145
|
+
targetIndexName
|
|
146
|
+
}, 'Error creating target mapping');
|
|
96
147
|
}
|
|
97
148
|
}
|
|
98
149
|
};
|
|
99
150
|
}
|
|
100
151
|
|
|
101
|
-
function
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
152
|
+
/**
 * Builds the csv-parse option object used when reading CSV sources.
 *
 * Caller-supplied options override the defaults (`bom`, `columns`, `trim`,
 * `skip_empty_lines` all `true`). When `skipHeader` is set and the parser is
 * not already consuming the header row via `columns`, parsing starts at line 2
 * unless the caller chose a `from_line` themselves.
 *
 * @param {object} [csvOptions={}] - User overrides for csv-parse.
 * @param {boolean} [skipHeader=false] - Whether the first line should be skipped.
 * @returns {object} A new options object.
 */
function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
  const defaults = {
    bom: true,
    columns: true,
    trim: true,
    skip_empty_lines: true
  };
  const merged = {
    ...defaults,
    ...csvOptions
  };
  if (!skipHeader) {
    return merged;
  }
  // `columns: true` or a column-mapping function already consumes the header row.
  const headerHandledByParser = merged.columns === true || typeof merged.columns === 'function';
  if (headerHandledByParser) {
    return merged;
  }
  if (typeof merged.from_line === 'undefined') {
    merged.from_line = 2;
  }
  return merged;
}
|
|
166
|
+
|
|
167
|
+
/**
 * Tracks the pause state of an indexing queue so pull-based readers can
 * suspend themselves while the queue applies backpressure.
 *
 * @param {EventEmitter} queueEmitter - Emitter that fires `pause` / `resume`.
 * @returns {{waitIfPaused: function(): Promise<void>, cleanup: function(): void}}
 *   `waitIfPaused` resolves immediately when not paused, otherwise on the next
 *   `resume` (or on `cleanup`). `cleanup` detaches the listeners and releases
 *   anyone still waiting.
 */
function createPauseWaiter$1(queueEmitter) {
  let isPaused = false;
  let pendingResolvers = [];

  // Resolve every outstanding waiter and start over with an empty list.
  function releaseAll() {
    pendingResolvers.forEach(release => release());
    pendingResolvers = [];
  }
  function handlePause() {
    isPaused = true;
  }
  function handleResume() {
    isPaused = false;
    releaseAll();
  }
  queueEmitter.on('pause', handlePause);
  queueEmitter.on('resume', handleResume);

  return {
    async waitIfPaused() {
      if (isPaused) {
        await new Promise(resolve => {
          pendingResolvers.push(resolve);
        });
      }
    },
    cleanup() {
      queueEmitter.removeListener('pause', handlePause);
      queueEmitter.removeListener('resume', handleResume);
      releaseAll();
    }
  };
}
|
|
195
|
+
/**
 * Creates a reader that feeds every file matching a glob pattern into the
 * indexer, one file after another.
 *
 * @param {object} indexer - Queue exposing `add(doc)`, `finish()` and a
 *   `queueEmitter` that emits `pause` / `resume` for backpressure.
 * @param {string} fileName - Glob pattern passed to `glob.globSync`.
 * @param {Function} [transform] - Optional `(parsed, { fileName })` callback; may
 *   return a doc, an array of docs, or null/undefined to skip the record.
 * @param {RegExp|string} splitRegex - Line separator for NDJSON input.
 * @param {boolean} [skipHeader=false] - Skip the first line / header row.
 * @param {string} [sourceFormat='ndjson'] - One of `ndjson`, `csv`, `parquet`, `arrow`.
 * @param {object} [csvOptions={}] - csv-parse overrides (see getCsvParserOptions).
 * @param {object} logger - Logger with `info` and `error` methods.
 * @returns {Function} Zero-argument function that starts reading. Errors are
 *   logged, and `indexer.finish()` is always called.
 */
function fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
  // Applies the optional transform to one parsed record and queues the result(s).
  function addParsedDoc(parsed, file) {
    const context = {
      fileName: file
    };
    const doc = typeof transform === 'function' ? transform(parsed, context) : parsed;

    // if doc is null/undefined we'll skip indexing it
    if (doc === null || typeof doc === 'undefined') {
      return;
    }

    // the transform callback may return an array of docs so we can emit
    // multiple docs from a single line
    if (Array.isArray(doc)) {
      doc.forEach(d => {
        if (d === null || typeof d === 'undefined') return;
        indexer.add(d);
      });
      return;
    }
    indexer.add(doc);
  }

  // Reads a Parquet file row by row, waiting whenever the queue is paused.
  async function processParquetFile(file) {
    const {
      waitIfPaused,
      cleanup
    } = createPauseWaiter$1(indexer.queueEmitter);
    let reader;
    try {
      // Opened inside the try so the pause/resume listeners registered above
      // are removed even when the file cannot be opened.
      reader = await parquet.ParquetReader.openFile(file);
      const cursor = reader.getCursor();
      while (true) {
        // eslint-disable-next-line no-await-in-loop
        const row = await cursor.next();
        if (row === null || typeof row === 'undefined') {
          break;
        }
        addParsedDoc(row, file);
        // eslint-disable-next-line no-await-in-loop
        await waitIfPaused();
      }
      logger.info({
        file
      }, 'Read entire file');
    } finally {
      cleanup();
      if (reader) {
        await reader.close();
      }
    }
  }

  // Reads an Arrow IPC file batch by batch and rebuilds one plain object per row.
  async function processArrowFile(file) {
    const {
      waitIfPaused,
      cleanup
    } = createPauseWaiter$1(indexer.queueEmitter);
    try {
      const reader = await arrow__namespace.RecordBatchReader.from(fs.createReadStream(file));
      for await (const recordBatch of reader) {
        const {
          fields
        } = recordBatch.schema;
        for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
          const row = {};
          fields.forEach(field => {
            const vector = recordBatch.getChild(field.name);
            row[field.name] = vector ? vector.get(rowIndex) : undefined;
          });
          addParsedDoc(row, file);
          // eslint-disable-next-line no-await-in-loop
          await waitIfPaused();
        }
      }
      logger.info({
        file
      }, 'Read entire file');
    } finally {
      cleanup();
    }
  }

  // pipe() does not propagate 'error' events to its destination, so an error on
  // an upstream stage would otherwise be an unhandled 'error' event and the
  // promise in processStreamFile would never settle.
  function forwardErrors(upstream, downstream) {
    upstream.on('error', err => {
      downstream.emit('error', err);
    });
  }

  // Drives a push-based stream to completion, pausing/resuming it in step with
  // the indexer queue. Resolves on 'end', rejects (after logging) on 'error'.
  function processStreamFile(file, buildStream, errorMessage) {
    return new Promise((resolve, reject) => {
      let finished = false;
      const s = buildStream();
      const onPause = () => {
        if (finished) return;
        s.pause();
      };
      const onResume = () => {
        if (finished) return;
        s.resume();
      };
      function cleanup() {
        indexer.queueEmitter.removeListener('pause', onPause);
        indexer.queueEmitter.removeListener('resume', onResume);
      }
      indexer.queueEmitter.on('pause', onPause);
      indexer.queueEmitter.on('resume', onResume);
      s.on('end', () => {
        finished = true;
        cleanup();
        logger.info({
          file
        }, 'Read entire file');
        resolve();
      });
      s.on('error', err => {
        finished = true;
        cleanup();
        logger.error({
          err,
          file
        }, errorMessage);
        reject(err);
      });
    });
  }

  // Streams an NDJSON file line by line; unparsable lines are logged and skipped.
  function processNdjsonFile(file) {
    let skippedHeader = false;
    return processStreamFile(file, () => {
      const source = fs.createReadStream(file);
      const lines = source.pipe(split(splitRegex));
      const mapper = lines.pipe(es.mapSync(line => {
        try {
          // skip empty lines
          if (line === '') {
            return;
          }
          if (skipHeader && !skippedHeader) {
            skippedHeader = true;
            return;
          }
          const parsed = JSON.parse(line);
          addParsedDoc(parsed, file);
        } catch (err) {
          logger.error({
            err,
            file
          }, 'Failed to process NDJSON line');
        }
      }));
      // processStreamFile logs and rejects on errors of the final stage, so
      // route read and split failures there.
      forwardErrors(source, mapper);
      forwardErrors(lines, mapper);
      return mapper;
    }, 'Error while reading file');
  }

  // Streams a CSV file record by record through csv-parse.
  function processCsvFile(file) {
    const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
    return processStreamFile(file, () => {
      const source = fs.createReadStream(file);
      const parser = source.pipe(csvParse.parse(parserOptions));
      const mapper = parser.pipe(es.mapSync(record => {
        try {
          addParsedDoc(record, file);
        } catch (err) {
          logger.error({
            err,
            file
          }, 'Failed to process CSV record');
        }
      }));
      // Route read failures and CSV parse errors to the final stage.
      forwardErrors(source, mapper);
      forwardErrors(parser, mapper);
      return mapper;
    }, 'Error while reading CSV file');
  }

  // Dispatches one file to the reader matching `sourceFormat`.
  async function processFile(file) {
    switch (sourceFormat) {
      case 'csv':
        await processCsvFile(file);
        return;
      case 'ndjson':
        await processNdjsonFile(file);
        return;
      case 'parquet':
        await processParquetFile(file);
        return;
      case 'arrow':
        await processArrowFile(file);
        return;
      default:
        throw new Error(`Unsupported sourceFormat: ${sourceFormat}`);
    }
  }

  // Processes the files sequentially; the indexer is always finished, even
  // when a file fails (remaining files are then not read).
  async function startIndex(files) {
    if (files.length === 0) {
      indexer.finish();
      return;
    }
    try {
      for (const file of files) {
        // eslint-disable-next-line no-await-in-loop
        await processFile(file);
      }
    } catch (err) {
      logger.error({
        err,
        files
      }, 'Error while processing files');
    } finally {
      indexer.finish();
    }
  }
  return () => {
    try {
      const files = glob.globSync(fileName);
      startIndex(files);
    } catch (err) {
      logger.error({
        err,
        fileName
      }, 'Error matching files');
      indexer.finish();
    }
  };
}
|
|
158
406
|
|
|
159
407
|
const EventEmitter = require('events');
|
|
160
|
-
const queueEmitter = new EventEmitter();
|
|
161
408
|
const parallelCalls = 5;
|
|
162
409
|
|
|
163
410
|
// a simple helper queue to bulk index documents
|
|
@@ -165,21 +412,20 @@ function indexQueueFactory({
|
|
|
165
412
|
targetClient: client,
|
|
166
413
|
targetIndexName,
|
|
167
414
|
bufferSize = DEFAULT_BUFFER_SIZE,
|
|
168
|
-
|
|
415
|
+
logger
|
|
169
416
|
}) {
|
|
417
|
+
const queueEmitter = new EventEmitter();
|
|
170
418
|
let docsPerSecond = 0;
|
|
171
419
|
const flushBytes = bufferSize * 1024; // Convert KB to Bytes
|
|
172
420
|
const highWaterMark = flushBytes * parallelCalls;
|
|
173
421
|
|
|
174
|
-
// Create a
|
|
175
|
-
const stream$1 = new stream.
|
|
176
|
-
read() {},
|
|
177
|
-
// Implement read but we manage pushing manually
|
|
422
|
+
// Create a PassThrough stream (readable + writable) for proper backpressure
|
|
423
|
+
const stream$1 = new stream.PassThrough({
|
|
178
424
|
highWaterMark // Buffer size for backpressure management
|
|
179
425
|
});
|
|
180
426
|
async function* ndjsonStreamIterator(readableStream) {
|
|
181
427
|
let buffer = ''; // To hold the incomplete data
|
|
182
|
-
|
|
428
|
+
|
|
183
429
|
try {
|
|
184
430
|
// Iterate over the stream using async iteration
|
|
185
431
|
for await (const chunk of readableStream) {
|
|
@@ -193,16 +439,15 @@ function indexQueueFactory({
|
|
|
193
439
|
|
|
194
440
|
// Yield each complete JSON object
|
|
195
441
|
for (const line of lines) {
|
|
196
|
-
if (line.trim()) {
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
}
|
|
442
|
+
if (!line.trim()) {
|
|
443
|
+
continue;
|
|
444
|
+
}
|
|
445
|
+
try {
|
|
446
|
+
yield JSON.parse(line); // Parse and yield the JSON object
|
|
447
|
+
} catch (err) {
|
|
448
|
+
logger.error({
|
|
449
|
+
err
|
|
450
|
+
}, 'Failed to parse JSON from NDJSON stream');
|
|
206
451
|
}
|
|
207
452
|
}
|
|
208
453
|
}
|
|
@@ -212,7 +457,9 @@ function indexQueueFactory({
|
|
|
212
457
|
try {
|
|
213
458
|
yield JSON.parse(buffer);
|
|
214
459
|
} catch (err) {
|
|
215
|
-
|
|
460
|
+
logger.error({
|
|
461
|
+
err
|
|
462
|
+
}, 'Failed to parse final JSON from NDJSON stream');
|
|
216
463
|
}
|
|
217
464
|
}
|
|
218
465
|
} finally {
|
|
@@ -238,7 +485,7 @@ function indexQueueFactory({
|
|
|
238
485
|
flushInterval: 1000,
|
|
239
486
|
refreshOnCompletion: true,
|
|
240
487
|
datasource: ndjsonStreamIterator(stream$1),
|
|
241
|
-
onDocument(
|
|
488
|
+
onDocument() {
|
|
242
489
|
docsPerSecond++;
|
|
243
490
|
return {
|
|
244
491
|
index: {
|
|
@@ -247,9 +494,13 @@ function indexQueueFactory({
|
|
|
247
494
|
};
|
|
248
495
|
}
|
|
249
496
|
});
|
|
250
|
-
} catch (
|
|
251
|
-
|
|
252
|
-
|
|
497
|
+
} catch (err) {
|
|
498
|
+
logger.error({
|
|
499
|
+
err,
|
|
500
|
+
targetIndexName
|
|
501
|
+
}, 'Error during bulk indexing');
|
|
502
|
+
queueEmitter.emit('error', err);
|
|
503
|
+
throw err;
|
|
253
504
|
} finally {
|
|
254
505
|
// Clean up interval
|
|
255
506
|
clearInterval(interval);
|
|
@@ -278,7 +529,7 @@ function indexQueueFactory({
|
|
|
278
529
|
if (finished) {
|
|
279
530
|
throw new Error('Unexpected doc added after indexer should finish.');
|
|
280
531
|
}
|
|
281
|
-
const canContinue = stream$1.
|
|
532
|
+
const canContinue = stream$1.write(`${JSON.stringify(doc)}\n`);
|
|
282
533
|
if (!canContinue) {
|
|
283
534
|
queueEmitter.emit('pause');
|
|
284
535
|
|
|
@@ -291,7 +542,7 @@ function indexQueueFactory({
|
|
|
291
542
|
},
|
|
292
543
|
finish: () => {
|
|
293
544
|
finished = true;
|
|
294
|
-
stream$1.
|
|
545
|
+
stream$1.end();
|
|
295
546
|
},
|
|
296
547
|
queueEmitter
|
|
297
548
|
};
|
|
@@ -299,7 +550,7 @@ function indexQueueFactory({
|
|
|
299
550
|
|
|
300
551
|
// create a new progress bar instance and use shades_classic theme
|
|
301
552
|
const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
|
|
302
|
-
function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false) {
|
|
553
|
+
function indexReaderFactory(indexer, sourceIndexName, transform, client, query, searchSize = DEFAULT_SEARCH_SIZE, populatedFields = false, logger) {
|
|
303
554
|
return async function indexReader() {
|
|
304
555
|
let docsNum = 0;
|
|
305
556
|
let scrollId;
|
|
@@ -318,8 +569,11 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
|
|
|
318
569
|
maxRetries: 0
|
|
319
570
|
});
|
|
320
571
|
return Object.keys(response.fields);
|
|
321
|
-
} catch (
|
|
322
|
-
|
|
572
|
+
} catch (err) {
|
|
573
|
+
logger.error({
|
|
574
|
+
err,
|
|
575
|
+
sourceIndexName
|
|
576
|
+
}, 'Failed to fetch populated fields');
|
|
323
577
|
}
|
|
324
578
|
}
|
|
325
579
|
function search(fields) {
|
|
@@ -363,8 +617,10 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
|
|
|
363
617
|
return;
|
|
364
618
|
}
|
|
365
619
|
indexer.add(doc);
|
|
366
|
-
} catch (
|
|
367
|
-
|
|
620
|
+
} catch (err) {
|
|
621
|
+
logger.error({
|
|
622
|
+
err
|
|
623
|
+
}, 'Failed to process source index document');
|
|
368
624
|
}
|
|
369
625
|
}
|
|
370
626
|
async function fetchNextResponse() {
|
|
@@ -408,48 +664,339 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
|
|
|
408
664
|
};
|
|
409
665
|
}
|
|
410
666
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
667
|
+
// Default number of bytes read from the start of the first matched file when
// inferring mappings.
const DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES = 100000;
// Default `lines_to_sample` value sent with the find_structure request.
const DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE = 1000;

/**
 * Reads up to `sampleBytes` bytes from the start of a file and decodes them as
 * UTF-8 text.
 *
 * When the sample fills the whole buffer it may end in the middle of a
 * multi-byte character; that incomplete trailing sequence is dropped instead
 * of being decoded to U+FFFD. A leading byte order mark is kept in the result.
 *
 * @param {string} filePath - Path of the file to sample.
 * @param {number} sampleBytes - Maximum number of bytes to read.
 * @returns {string} The decoded sample text.
 * @throws {Error} If the file cannot be opened or read.
 */
function readSample(filePath, sampleBytes) {
  const fd = fs.openSync(filePath, 'r');
  try {
    const buffer = Buffer.alloc(sampleBytes);
    const bytesRead = fs.readSync(fd, buffer, 0, sampleBytes, 0);
    // Only a sample cut off by the byte limit can end mid-character. A shorter
    // read reached the end of the file and is decoded as-is.
    const cutByLimit = bytesRead === sampleBytes;
    const decoder = new TextDecoder('utf-8', {
      ignoreBOM: true
    });
    // In streaming mode the decoder holds back an incomplete trailing sequence
    // rather than emitting a replacement character for it.
    return decoder.decode(buffer.subarray(0, bytesRead), {
      stream: cutByLimit
    });
  } finally {
    fs.closeSync(fd);
  }
}
|
|
679
|
+
/**
 * Builds the result returned when no inference took place: the caller's
 * mappings are passed through and no ingest pipeline is suggested.
 *
 * @param {object} [mappings] - Mappings supplied by the caller, if any.
 * @returns {{mappings: (object|undefined), ingestPipeline: undefined}}
 */
function emptyInferenceResult(mappings) {
  const ingestPipeline = undefined;
  return {
    mappings,
    ingestPipeline
  };
}
|
|
685
|
+
/**
 * Tries to infer index mappings (and an ingest pipeline) for a file source by
 * sending a sample of the first matched file to the target cluster's
 * `textStructure.findStructure` API.
 *
 * Inference is skipped, and the caller's `mappings` are returned unchanged with
 * no pipeline, when any of the following holds:
 * - `inferMappings` is falsy, `mappings` is already defined, or `fileName` is undefined;
 * - `sourceFormat` is neither `ndjson` nor `csv`;
 * - the client has no `textStructure.findStructure` function;
 * - no file matches `fileName`, or the sample is empty;
 * - the findStructure request fails (logged as a warning).
 *
 * @param {object} options
 * @param {object} options.targetClient - Elasticsearch client of the target cluster.
 * @param {string} [options.fileName] - Glob pattern of the source files.
 * @param {string} options.sourceFormat - Source format identifier.
 * @param {object} [options.csvOptions] - csv-parse options; `delimiter`, `quote`
 *   and a boolean `columns` are forwarded as request defaults for CSV sources.
 * @param {boolean} [options.skipHeader] - Implies `has_header_row` for CSV
 *   when nothing else decided it.
 * @param {object} [options.mappings] - Explicit mappings; disables inference.
 * @param {boolean} [options.inferMappings] - Enables inference.
 * @param {object} [options.inferMappingsOptions] - `sampleBytes` plus any extra
 *   findStructure request parameters (these override the computed defaults).
 * @param {object} options.logger - Logger with `info` and `warn` methods.
 * @returns {Promise<{mappings: (object|undefined), ingestPipeline: (object|undefined)}>}
 */
async function inferMappingsFromSource({
  targetClient,
  fileName,
  sourceFormat,
  csvOptions,
  skipHeader,
  mappings,
  inferMappings,
  inferMappingsOptions,
  logger
}) {
  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
    return emptyInferenceResult(mappings);
  }
  if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
    logger.info({
      sourceFormat
    }, 'Skipping mapping inference. Inference is only supported for ndjson and csv.');
    return emptyInferenceResult(mappings);
  }
  // Only ndjson and csv reach this point, so the capability check is all that
  // is needed here.
  if (typeof targetClient?.textStructure?.findStructure !== 'function') {
    return emptyInferenceResult(mappings);
  }
  const files = glob.globSync(fileName);
  if (files.length === 0) {
    logger.info({
      fileName
    }, 'No files matched for mapping inference');
    return emptyInferenceResult(mappings);
  }
  const {
    sampleBytes = DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES,
    ...requestParams
  } = inferMappingsOptions || {};
  const sampleText = readSample(files[0], sampleBytes);
  if (!sampleText || sampleText.trim() === '') {
    logger.info('Skipping mapping inference because the sample text is empty');
    return emptyInferenceResult(mappings);
  }

  // Caller-supplied request parameters win over the defaults computed here.
  const params = {
    body: sampleText,
    lines_to_sample: DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE,
    ...requestParams
  };
  if (typeof params.format === 'undefined') {
    params.format = sourceFormat === 'csv' ? 'delimited' : 'ndjson';
  }
  if (sourceFormat === 'csv') {
    // Mirror the CSV parser configuration so the sample is analysed the same
    // way the file will later be parsed.
    if (typeof params.delimiter === 'undefined' && typeof csvOptions?.delimiter === 'string') {
      params.delimiter = csvOptions.delimiter;
    }
    if (typeof params.quote === 'undefined' && typeof csvOptions?.quote === 'string') {
      params.quote = csvOptions.quote;
    }
    if (typeof params.has_header_row === 'undefined' && typeof csvOptions?.columns === 'boolean') {
      params.has_header_row = csvOptions.columns;
    }
    if (typeof params.has_header_row === 'undefined' && skipHeader) {
      params.has_header_row = true;
    }
  }
  try {
    const response = await targetClient.textStructure.findStructure(params);
    if (response?.mappings) {
      logger.info({
        file: files[0]
      }, 'Inferred mappings via _text_structure/find_structure');
    }
    if (response?.ingest_pipeline) {
      logger.info('Inferred ingest pipeline via _text_structure/find_structure');
    }
    return {
      mappings: response?.mappings || mappings,
      ingestPipeline: response?.ingest_pipeline
    };
  } catch (err) {
    // Inference is best-effort: a failed request only produces a warning.
    logger.warn({
      err
    }, 'Could not infer mappings via _text_structure/find_structure');
    return emptyInferenceResult(mappings);
  }
}
|
|
767
|
+
|
|
768
|
+
// Level used when logging is verbose and LOG_LEVEL does not override it.
const DEFAULT_LOG_LEVEL = 'info';

/**
 * Determines the log level for the default logger.
 *
 * A non-blank `LOG_LEVEL` environment variable always wins and is returned
 * with surrounding whitespace removed, so a value such as `" debug "` yields a
 * usable level name. Otherwise the level is `info` when `verbose` is truthy
 * and `error` when it is not.
 *
 * @param {boolean} [verbose=true] - Whether informational logging is wanted.
 * @returns {string} The log level name.
 */
function resolveLogLevel(verbose = true) {
  const envLevel = typeof process.env.LOG_LEVEL === 'string' ? process.env.LOG_LEVEL.trim() : '';
  if (envLevel !== '') {
    return envLevel;
  }
  return verbose ? DEFAULT_LOG_LEVEL : 'error';
}
|
|
775
|
+
/**
 * Returns the logger to use for a run.
 *
 * A caller-supplied logger object is returned untouched. Otherwise a pino
 * logger named `node-es-transformer` is created with ISO timestamps, error
 * serializers for the `err` and `error` keys, and a level taken from
 * `resolveLogLevel(verbose)`.
 *
 * @param {object} [options]
 * @param {object} [options.logger] - Custom logger instance.
 * @param {boolean} [options.verbose=true] - Forwarded to `resolveLogLevel`.
 * @returns {object} The logger.
 */
function createLogger({
  logger,
  verbose = true
} = {}) {
  const hasCustomLogger = Boolean(logger) && typeof logger === 'object';
  if (hasCustomLogger) {
    return logger;
  }
  const serializeError = pino.stdSerializers.err;
  const options = {
    name: 'node-es-transformer',
    level: resolveLogLevel(verbose),
    timestamp: pino.stdTimeFunctions.isoTime,
    serializers: {
      err: serializeError,
      error: serializeError
    }
  };
  return pino(options);
}
|
|
792
|
+
/**
 * Derives a child logger carrying extra bindings when the logger supports it.
 *
 * @param {object} [logger] - Parent logger; may be missing or lack `child()`.
 * @param {object} bindings - Fields to attach to the child logger.
 * @returns {*} `logger.child(bindings)` when `child` is a function, otherwise
 *   the `logger` argument unchanged.
 */
function createChildLogger(logger, bindings) {
  const supportsChild = Boolean(logger) && typeof logger.child === 'function';
  return supportsChild ? logger.child(bindings) : logger;
}
|
|
798
|
+
|
|
799
|
+
/**
 * Mirrors the pause state of the indexing queue so that pull-based readers
 * can hold off while the queue is applying backpressure.
 *
 * @param {EventEmitter} queueEmitter - Emitter that fires `pause` / `resume`.
 * @returns {{waitIfPaused: function(): Promise<void>, cleanup: function(): void}}
 *   `waitIfPaused` resolves at once when the queue is not paused, otherwise on
 *   the next `resume` or on `cleanup`. `cleanup` removes both listeners and
 *   releases every pending waiter.
 */
function createPauseWaiter(queueEmitter) {
  let queuePaused = false;
  let wakeups = [];

  // Wake every waiter currently parked, then reset the list.
  const wakeAll = () => {
    for (const wake of wakeups) {
      wake();
    }
    wakeups = [];
  };
  const markPaused = () => {
    queuePaused = true;
  };
  const markResumed = () => {
    queuePaused = false;
    wakeAll();
  };
  queueEmitter.on('pause', markPaused);
  queueEmitter.on('resume', markResumed);

  async function waitIfPaused() {
    if (!queuePaused) {
      return;
    }
    await new Promise(resolve => {
      wakeups.push(resolve);
    });
  }
  function cleanup() {
    queueEmitter.removeListener('pause', markPaused);
    queueEmitter.removeListener('resume', markResumed);
    wakeAll();
  }
  return {
    waitIfPaused,
    cleanup
  };
}
|
|
827
|
+
/**
 * Drains an async-iterable stream into a single Buffer.
 *
 * Chunks that are not already Buffers are converted with `Buffer.from`.
 * The whole stream is held in memory before the result is returned.
 *
 * @param {AsyncIterable<Buffer|string|Uint8Array>} stream - Source to drain.
 * @returns {Promise<Buffer>} All chunks concatenated in arrival order.
 */
async function readStreamToBuffer(stream) {
  const parts = [];
  for await (const piece of stream) {
    if (Buffer.isBuffer(piece)) {
      parts.push(piece);
    } else {
      parts.push(Buffer.from(piece));
    }
  }
  return Buffer.concat(parts);
}
|
|
834
|
+
function streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}, logger) {
|
|
835
|
+
function addParsedDoc(parsed) {
|
|
836
|
+
const doc = typeof transform === 'function' ? transform(parsed) : parsed;
|
|
837
|
+
|
|
838
|
+
// if doc is null/undefined we'll skip indexing it
|
|
839
|
+
if (doc === null || typeof doc === 'undefined') {
|
|
840
|
+
return;
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
// the transform callback may return an array of docs so we can emit
|
|
844
|
+
// multiple docs from a single line
|
|
845
|
+
if (Array.isArray(doc)) {
|
|
846
|
+
doc.forEach(d => {
|
|
847
|
+
if (d === null || typeof d === 'undefined') return;
|
|
848
|
+
indexer.add(d);
|
|
849
|
+
});
|
|
850
|
+
return;
|
|
851
|
+
}
|
|
852
|
+
indexer.add(doc);
|
|
853
|
+
}
|
|
854
|
+
async function processParquetStream() {
|
|
855
|
+
const {
|
|
856
|
+
waitIfPaused,
|
|
857
|
+
cleanup
|
|
858
|
+
} = createPauseWaiter(indexer.queueEmitter);
|
|
859
|
+
const parquetBuffer = await readStreamToBuffer(stream);
|
|
860
|
+
const reader = await parquet.ParquetReader.openBuffer(parquetBuffer);
|
|
861
|
+
try {
|
|
862
|
+
const cursor = reader.getCursor();
|
|
863
|
+
while (true) {
|
|
864
|
+
// eslint-disable-next-line no-await-in-loop
|
|
865
|
+
const row = await cursor.next();
|
|
866
|
+
if (row === null || typeof row === 'undefined') {
|
|
867
|
+
break;
|
|
868
|
+
}
|
|
869
|
+
addParsedDoc(row);
|
|
870
|
+
// eslint-disable-next-line no-await-in-loop
|
|
871
|
+
await waitIfPaused();
|
|
872
|
+
}
|
|
873
|
+
logger.info('Read entire stream');
|
|
874
|
+
} finally {
|
|
875
|
+
cleanup();
|
|
876
|
+
await reader.close();
|
|
877
|
+
}
|
|
878
|
+
}
|
|
879
|
+
async function processArrowStream() {
|
|
880
|
+
const {
|
|
881
|
+
waitIfPaused,
|
|
882
|
+
cleanup
|
|
883
|
+
} = createPauseWaiter(indexer.queueEmitter);
|
|
884
|
+
try {
|
|
885
|
+
const reader = await arrow__namespace.RecordBatchReader.from(stream);
|
|
886
|
+
for await (const recordBatch of reader) {
|
|
887
|
+
const {
|
|
888
|
+
fields
|
|
889
|
+
} = recordBatch.schema;
|
|
890
|
+
for (let rowIndex = 0; rowIndex < recordBatch.numRows; rowIndex++) {
|
|
891
|
+
const row = {};
|
|
892
|
+
fields.forEach(field => {
|
|
893
|
+
const vector = recordBatch.getChild(field.name);
|
|
894
|
+
row[field.name] = vector ? vector.get(rowIndex) : undefined;
|
|
895
|
+
});
|
|
896
|
+
addParsedDoc(row);
|
|
897
|
+
// eslint-disable-next-line no-await-in-loop
|
|
898
|
+
await waitIfPaused();
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
logger.info('Read entire stream');
|
|
902
|
+
} finally {
|
|
903
|
+
cleanup();
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
function processPipeline(buildPipeline, errorMessage) {
|
|
907
|
+
return new Promise((resolve, reject) => {
|
|
908
|
+
let finished = false;
|
|
909
|
+
const s = buildPipeline();
|
|
910
|
+
const onPause = () => {
|
|
911
|
+
if (finished) return;
|
|
912
|
+
s.pause();
|
|
913
|
+
};
|
|
914
|
+
const onResume = () => {
|
|
915
|
+
if (finished) return;
|
|
916
|
+
s.resume();
|
|
917
|
+
};
|
|
918
|
+
function cleanup() {
|
|
919
|
+
indexer.queueEmitter.removeListener('pause', onPause);
|
|
920
|
+
indexer.queueEmitter.removeListener('resume', onResume);
|
|
921
|
+
}
|
|
922
|
+
indexer.queueEmitter.on('pause', onPause);
|
|
923
|
+
indexer.queueEmitter.on('resume', onResume);
|
|
924
|
+
s.on('end', () => {
|
|
925
|
+
finished = true;
|
|
926
|
+
cleanup();
|
|
927
|
+
logger.info('Read entire stream');
|
|
928
|
+
resolve();
|
|
929
|
+
});
|
|
930
|
+
s.on('error', err => {
|
|
931
|
+
finished = true;
|
|
932
|
+
cleanup();
|
|
933
|
+
logger.error({
|
|
934
|
+
err
|
|
935
|
+
}, errorMessage);
|
|
936
|
+
reject(err);
|
|
937
|
+
});
|
|
938
|
+
});
|
|
939
|
+
}
|
|
940
|
+
function processCsvStream() {
|
|
941
|
+
return processPipeline(() => stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
|
|
942
|
+
try {
|
|
943
|
+
addParsedDoc(record);
|
|
944
|
+
} catch (err) {
|
|
945
|
+
logger.error({
|
|
946
|
+
err
|
|
947
|
+
}, 'Failed to process CSV stream record');
|
|
948
|
+
}
|
|
949
|
+
}).on('error', err => {
|
|
950
|
+
logger.error({
|
|
951
|
+
err
|
|
952
|
+
}, 'Error while reading CSV stream');
|
|
953
|
+
})), 'Error while reading CSV stream');
|
|
954
|
+
}
|
|
955
|
+
function processNdjsonStream() {
|
|
956
|
+
let skippedHeader = false;
|
|
957
|
+
return processPipeline(() => stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
|
|
415
958
|
try {
|
|
416
959
|
// skip empty lines
|
|
417
960
|
if (line === '') {
|
|
418
961
|
return;
|
|
419
962
|
}
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
// if doc is undefined we'll skip indexing it
|
|
423
|
-
if (typeof doc === 'undefined') {
|
|
424
|
-
s.resume();
|
|
425
|
-
return;
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
// the transform callback may return an array of docs so we can emit
|
|
429
|
-
// multiple docs from a single line
|
|
430
|
-
if (Array.isArray(doc)) {
|
|
431
|
-
doc.forEach(d => indexer.add(d));
|
|
963
|
+
if (skipHeader && !skippedHeader) {
|
|
964
|
+
skippedHeader = true;
|
|
432
965
|
return;
|
|
433
966
|
}
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
967
|
+
const parsed = JSON.parse(line);
|
|
968
|
+
addParsedDoc(parsed);
|
|
969
|
+
} catch (err) {
|
|
970
|
+
logger.error({
|
|
971
|
+
err
|
|
972
|
+
}, 'Failed to process NDJSON stream line');
|
|
437
973
|
}
|
|
438
974
|
}).on('error', err => {
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
975
|
+
logger.error({
|
|
976
|
+
err
|
|
977
|
+
}, 'Error while reading stream');
|
|
978
|
+
})), 'Error while reading stream');
|
|
979
|
+
}
|
|
980
|
+
async function startIndex() {
|
|
981
|
+
try {
|
|
982
|
+
if (sourceFormat === 'csv') {
|
|
983
|
+
await processCsvStream();
|
|
984
|
+
} else if (sourceFormat === 'ndjson') {
|
|
985
|
+
await processNdjsonStream();
|
|
986
|
+
} else if (sourceFormat === 'parquet') {
|
|
987
|
+
await processParquetStream();
|
|
988
|
+
} else if (sourceFormat === 'arrow') {
|
|
989
|
+
await processArrowStream();
|
|
990
|
+
} else {
|
|
991
|
+
throw Error(`Unsupported sourceFormat: ${sourceFormat}`);
|
|
992
|
+
}
|
|
993
|
+
} catch (err) {
|
|
994
|
+
logger.error({
|
|
995
|
+
err
|
|
996
|
+
}, 'Error while reading stream');
|
|
997
|
+
} finally {
|
|
442
998
|
indexer.finish();
|
|
443
|
-
|
|
444
|
-
}));
|
|
445
|
-
indexer.queueEmitter.on('pause', () => {
|
|
446
|
-
if (finished) return;
|
|
447
|
-
s.pause();
|
|
448
|
-
});
|
|
449
|
-
indexer.queueEmitter.on('resume', () => {
|
|
450
|
-
if (finished) return;
|
|
451
|
-
s.resume();
|
|
452
|
-
});
|
|
999
|
+
}
|
|
453
1000
|
}
|
|
454
1001
|
return () => {
|
|
455
1002
|
startIndex();
|
|
@@ -530,22 +1077,31 @@ async function transformer({
|
|
|
530
1077
|
searchSize = DEFAULT_SEARCH_SIZE,
|
|
531
1078
|
stream,
|
|
532
1079
|
fileName,
|
|
1080
|
+
sourceFormat = 'ndjson',
|
|
1081
|
+
csvOptions = {},
|
|
533
1082
|
splitRegex = /\n/,
|
|
534
1083
|
sourceIndexName,
|
|
535
1084
|
targetIndexName,
|
|
536
1085
|
mappings,
|
|
537
1086
|
mappingsOverride = false,
|
|
1087
|
+
inferMappings = false,
|
|
1088
|
+
inferMappingsOptions = {},
|
|
538
1089
|
indexMappingTotalFieldsLimit,
|
|
539
1090
|
pipeline,
|
|
540
1091
|
populatedFields = false,
|
|
541
1092
|
query,
|
|
542
1093
|
skipHeader = false,
|
|
543
1094
|
transform,
|
|
544
|
-
verbose = true
|
|
1095
|
+
verbose = true,
|
|
1096
|
+
logger: loggerInput
|
|
545
1097
|
}) {
|
|
546
1098
|
if (typeof targetIndexName === 'undefined') {
|
|
547
1099
|
throw Error('targetIndexName must be specified.');
|
|
548
1100
|
}
|
|
1101
|
+
const logger = createLogger({
|
|
1102
|
+
logger: loggerInput,
|
|
1103
|
+
verbose
|
|
1104
|
+
});
|
|
549
1105
|
const defaultClientConfig = {
|
|
550
1106
|
node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200'
|
|
551
1107
|
};
|
|
@@ -553,23 +1109,47 @@ async function transformer({
|
|
|
553
1109
|
// Support both old (config) and new (client instance) patterns
|
|
554
1110
|
const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
|
|
555
1111
|
const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
|
|
1112
|
+
const inferenceResult = await inferMappingsFromSource({
|
|
1113
|
+
targetClient,
|
|
1114
|
+
fileName,
|
|
1115
|
+
sourceFormat,
|
|
1116
|
+
csvOptions,
|
|
1117
|
+
skipHeader,
|
|
1118
|
+
mappings,
|
|
1119
|
+
inferMappings,
|
|
1120
|
+
inferMappingsOptions,
|
|
1121
|
+
logger: createChildLogger(logger, {
|
|
1122
|
+
component: 'mapping-inference'
|
|
1123
|
+
})
|
|
1124
|
+
});
|
|
556
1125
|
const createMapping = createMappingFactory({
|
|
557
1126
|
sourceClient,
|
|
558
1127
|
sourceIndexName,
|
|
559
1128
|
targetClient,
|
|
560
1129
|
targetIndexName,
|
|
561
|
-
mappings,
|
|
1130
|
+
mappings: inferenceResult.mappings,
|
|
1131
|
+
inferredIngestPipeline: inferenceResult.ingestPipeline,
|
|
562
1132
|
mappingsOverride,
|
|
563
1133
|
indexMappingTotalFieldsLimit,
|
|
564
|
-
verbose,
|
|
565
1134
|
deleteIndex,
|
|
566
|
-
pipeline
|
|
1135
|
+
pipeline,
|
|
1136
|
+
logger: createChildLogger(logger, {
|
|
1137
|
+
component: 'create-mapping'
|
|
1138
|
+
})
|
|
567
1139
|
});
|
|
568
1140
|
const indexer = indexQueueFactory({
|
|
569
1141
|
targetClient,
|
|
570
1142
|
targetIndexName,
|
|
571
1143
|
bufferSize,
|
|
572
|
-
|
|
1144
|
+
logger: createChildLogger(logger, {
|
|
1145
|
+
component: 'index-queue'
|
|
1146
|
+
})
|
|
1147
|
+
});
|
|
1148
|
+
function validateSourceFormat() {
|
|
1149
|
+
if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv' && sourceFormat !== 'parquet' && sourceFormat !== 'arrow') {
|
|
1150
|
+
throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson", "csv", "parquet", or "arrow".`);
|
|
1151
|
+
}
|
|
1152
|
+
}
|
|
573
1153
|
function getReader() {
|
|
574
1154
|
if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
|
|
575
1155
|
throw Error('Only either one of fileName or sourceIndexName can be specified.');
|
|
@@ -578,17 +1158,28 @@ async function transformer({
|
|
|
578
1158
|
throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
|
|
579
1159
|
}
|
|
580
1160
|
if (typeof fileName !== 'undefined') {
|
|
581
|
-
|
|
1161
|
+
validateSourceFormat();
|
|
1162
|
+
return fileReaderFactory(indexer, fileName, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
|
|
1163
|
+
component: 'file-reader'
|
|
1164
|
+
}));
|
|
582
1165
|
}
|
|
583
1166
|
if (typeof sourceIndexName !== 'undefined') {
|
|
584
|
-
return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields
|
|
1167
|
+
return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields, createChildLogger(logger, {
|
|
1168
|
+
component: 'index-reader'
|
|
1169
|
+
}));
|
|
585
1170
|
}
|
|
586
1171
|
if (typeof stream !== 'undefined') {
|
|
587
|
-
|
|
1172
|
+
validateSourceFormat();
|
|
1173
|
+
return streamReaderFactory(indexer, stream, transform, splitRegex, skipHeader, sourceFormat, csvOptions, createChildLogger(logger, {
|
|
1174
|
+
component: 'stream-reader'
|
|
1175
|
+
}));
|
|
588
1176
|
}
|
|
589
1177
|
return null;
|
|
590
1178
|
}
|
|
591
1179
|
const reader = getReader();
|
|
1180
|
+
if (typeof reader !== 'function') {
|
|
1181
|
+
throw Error('One of fileName, sourceIndexName, or stream must be specified.');
|
|
1182
|
+
}
|
|
592
1183
|
try {
|
|
593
1184
|
const indexExists = await targetClient.indices.exists({
|
|
594
1185
|
index: targetIndexName
|
|
@@ -605,8 +1196,11 @@ async function transformer({
|
|
|
605
1196
|
} else {
|
|
606
1197
|
reader();
|
|
607
1198
|
}
|
|
608
|
-
} catch (
|
|
609
|
-
|
|
1199
|
+
} catch (err) {
|
|
1200
|
+
logger.error({
|
|
1201
|
+
err,
|
|
1202
|
+
targetIndexName
|
|
1203
|
+
}, 'Error checking index existence');
|
|
610
1204
|
} finally {
|
|
611
1205
|
// targetClient.close();
|
|
612
1206
|
}
|