node-es-transformer 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -4
- package/dist/node-es-transformer.cjs.js +279 -81
- package/dist/node-es-transformer.cjs.js.map +1 -1
- package/dist/node-es-transformer.esm.js +280 -82
- package/dist/node-es-transformer.esm.js.map +1 -1
- package/index.d.ts +58 -1
- package/package.json +2 -1
package/README.md
CHANGED
@@ -96,7 +96,7 @@ yarn add node-es-transformer
 
 ## Usage
 
-### Read from a file
+### Read NDJSON from a file
 
 ```javascript
 const transformer = require('node-es-transformer');
@@ -129,6 +129,50 @@ transformer({
 });
 ```
 
+### Read CSV from a file
+
+```javascript
+const transformer = require('node-es-transformer');
+
+transformer({
+  fileName: 'users.csv',
+  sourceFormat: 'csv',
+  targetIndexName: 'users-index',
+  mappings: {
+    properties: {
+      id: { type: 'integer' },
+      first_name: { type: 'keyword' },
+      last_name: { type: 'keyword' },
+      full_name: { type: 'keyword' },
+    },
+  },
+  transform(row) {
+    return {
+      ...row,
+      id: Number(row.id),
+      full_name: `${row.first_name} ${row.last_name}`,
+    };
+  },
+});
+```
+
+### Infer mappings from CSV sample
+
+```javascript
+const transformer = require('node-es-transformer');
+
+transformer({
+  fileName: 'users.csv',
+  sourceFormat: 'csv',
+  targetIndexName: 'users-index',
+  inferMappings: true,
+  inferMappingsOptions: {
+    sampleBytes: 200000,
+    lines_to_sample: 2000,
+  },
+});
+```
+
 ### Read from another index
 
 ```javascript
@@ -242,9 +286,11 @@ All options are passed to the main `transformer()` function.
 
 Choose **one** of these sources:
 
-- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json`).
+- **`fileName`** (string): Source filename to ingest. Supports wildcards (e.g., `logs/*.json` or `data/*.csv`).
 - **`sourceIndexName`** (string): Source Elasticsearch index to reindex from.
 - **`stream`** (Readable): Node.js readable stream to ingest from.
+- **`sourceFormat`** (`'ndjson' | 'csv'`): Format for file/stream sources. Default: `'ndjson'`.
+- **`csvOptions`** (object): CSV parser options (delimiter, quote, columns, etc.) used when `sourceFormat: 'csv'`.
 
 #### Client Configuration
 
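The new `csvOptions` object is handed straight to the `csv-parse` parser, so its standard options (delimiter, quote, columns, ...) apply. A minimal sketch, assuming a semicolon-delimited export; the delimiter, quote character and index name below are illustrative, not defaults:

```javascript
const transformer = require('node-es-transformer');

transformer({
  fileName: 'data/*.csv',
  sourceFormat: 'csv',
  targetIndexName: 'exports-index', // illustrative name
  csvOptions: {
    delimiter: ';', // csv-parse option; the parser default is ','
    quote: '"',
  },
});
```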
@@ -259,10 +305,14 @@ Choose **one** of these sources:
 
 - **`mappings`** (object): Elasticsearch document mappings for target index. If reindexing and not provided, mappings are copied from source index.
 - **`mappingsOverride`** (boolean): When reindexing, apply `mappings` on top of source index mappings. Default: `false`.
+- **`inferMappings`** (boolean): Infer mappings for `fileName` sources via `/_text_structure/find_structure`. Ignored when `mappings` is provided. If inference returns `ingest_pipeline`, it is created as `<targetIndexName>-inferred-pipeline` and applied as the index default pipeline (unless `pipeline` is explicitly set). Default: `false`.
+- **`inferMappingsOptions`** (object): Options for `/_text_structure/find_structure` (for example `sampleBytes`, `lines_to_sample`, `delimiter`, `quote`, `has_header_row`, `timeout`).
 - **`deleteIndex`** (boolean): Delete target index if it exists before starting. Default: `false`.
 - **`indexMappingTotalFieldsLimit`** (number): Field limit for target index (`index.mapping.total_fields.limit` setting).
 - **`pipeline`** (string): Elasticsearch ingest pipeline name to use during indexing.
 
+When `inferMappings` is enabled, the target cluster must allow `/_text_structure/find_structure` (cluster privilege: `monitor_text_structure`). If inferred ingest pipelines are used, the target cluster must also allow creating ingest pipelines (`_ingest/pipeline`).
+
 #### Performance Options
 
 - **`bufferSize`** (number): Buffer size threshold in KBytes for bulk indexing. Default: `5120` (5 MB).
@@ -276,8 +326,12 @@ Choose **one** of these sources:
   - Return array of documents to split one source into multiple targets
   - Return `null`/`undefined` to skip document
 - **`query`** (object): Elasticsearch [DSL query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) to filter source documents.
-- **`splitRegex`** (RegExp): Line split regex for file/stream sources
-- **`skipHeader`** (boolean):
+- **`splitRegex`** (RegExp): Line split regex for file/stream sources when `sourceFormat` is `'ndjson'`. Default: `/\n/`.
+- **`skipHeader`** (boolean): Header skipping for file/stream sources.
+  - NDJSON: skips the first non-empty line
+  - CSV: skips the first data line only when `csvOptions.columns` does not consume headers
+  - Default: `false`
+  - Applies only to `fileName`/`stream` sources
 - **`verbose`** (boolean): Enable logging and progress bars. Default: `true`.
 
 ### Return Value
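The CSV branch of `skipHeader` is easiest to see with explicit column names: when `csvOptions.columns` is an array, `csv-parse` does not consume the header row itself, so `skipHeader: true` is what drops it. A minimal sketch (file and column names are illustrative):

```javascript
const transformer = require('node-es-transformer');

transformer({
  fileName: 'users.csv', // first line: "id,first_name,last_name"
  sourceFormat: 'csv',
  targetIndexName: 'users-index',
  // With an explicit columns array the header row would otherwise be parsed as data,
  // so skipHeader makes the reader start at the second line.
  csvOptions: { columns: ['id', 'first_name', 'last_name'] },
  skipHeader: true,
});
```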
package/dist/node-es-transformer.cjs.js
CHANGED
@@ -3,6 +3,7 @@
 var elasticsearch9 = require('es9');
 var elasticsearch8 = require('es8');
 var fs = require('fs');
+var csvParse = require('csv-parse');
 var es = require('event-stream');
 var glob = require('glob');
 var split = require('split2');
@@ -25,6 +26,7 @@ function createMappingFactory({
   targetClient,
   targetIndexName,
   mappings,
+  inferredIngestPipeline,
   mappingsOverride,
   indexMappingTotalFieldsLimit,
   verbose,
@@ -33,6 +35,7 @@ function createMappingFactory({
 }) {
   return async () => {
     let targetMappings = mappingsOverride ? undefined : mappings;
+    let defaultPipeline = pipeline;
     if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
       try {
         const mapping = await sourceClient.indices.getMapping({
@@ -71,22 +74,34 @@ function createMappingFactory({
       });
     }
     if (indexExists === false || deleteIndex === true) {
+      if (typeof defaultPipeline === 'undefined' && typeof inferredIngestPipeline === 'object' && inferredIngestPipeline !== null && typeof targetClient?.ingest?.putPipeline === 'function') {
+        const inferredPipelineName = `${targetIndexName}-inferred-pipeline`;
+        try {
+          await targetClient.ingest.putPipeline({
+            id: inferredPipelineName,
+            ...inferredIngestPipeline
+          });
+          defaultPipeline = inferredPipelineName;
+          if (verbose) console.log(`Created inferred ingest pipeline ${inferredPipelineName}`);
+        } catch (err) {
+          console.log('Error creating inferred ingest pipeline', err);
+        }
+      }
+      const settings = {
+        ...(defaultPipeline !== undefined ? {
+          'index.default_pipeline': defaultPipeline
+        } : {}),
+        ...(indexMappingTotalFieldsLimit !== undefined ? {
+          'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
+          'index.number_of_shards': 1,
+          'index.number_of_replicas': 0
+        } : {})
+      };
       const resp = await targetClient.indices.create({
         index: targetIndexName,
         mappings: targetMappings,
-        ...(pipeline !== undefined ? {
-          settings: {
-            index: {
-              default_pipeline: pipeline
-            }
-          }
-        } : {}),
-        ...(indexMappingTotalFieldsLimit !== undefined ? {
-          settings: {
-            'index.mapping.total_fields.limit': indexMappingTotalFieldsLimit,
-            'index.number_of_shards': 1,
-            'index.number_of_replicas': 0
-          }
+        ...(Object.keys(settings).length > 0 ? {
+          settings
         } : {})
       });
       if (verbose) console.log('Created target mapping', resp);
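For reference, when no explicit `pipeline` option is set and inference produced an `ingest_pipeline`, the index created above ends up with a default pipeline named after the target index. A rough sketch of the resulting create-index request, assuming a target index called `users-index` (the name is illustrative):

```javascript
// Approximate shape of the targetClient.indices.create() call in that case:
const createRequest = {
  index: 'users-index',
  mappings: { properties: { /* inferred or user-provided field mappings */ } },
  settings: {
    'index.default_pipeline': 'users-index-inferred-pipeline',
  },
};
```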
@@ -98,37 +113,89 @@ function createMappingFactory({
   };
 }
 
-function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
-  function startIndex(files) {
-    let finished = false;
-    const file = files.shift();
+function getCsvParserOptions(csvOptions = {}, skipHeader = false) {
+  const options = {
+    bom: true,
+    columns: true,
+    trim: true,
+    skip_empty_lines: true,
+    ...csvOptions
+  };
+  const consumesHeader = options.columns === true || typeof options.columns === 'function';
+  if (skipHeader && !consumesHeader && typeof options.from_line === 'undefined') {
+    options.from_line = 2;
+  }
+  return options;
+}
+
+function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
+  function addParsedDoc(parsed, file, streamRef) {
+    const context = {
+      fileName: file
+    };
+    const doc = typeof transform === 'function' ? transform(parsed, context) : parsed;
+
+    // if doc is null/undefined we'll skip indexing it
+    if (doc === null || typeof doc === 'undefined') {
+      streamRef.resume();
+      return;
+    }
+
+    // the transform callback may return an array of docs so we can emit
+    // multiple docs from a single line
+    if (Array.isArray(doc)) {
+      doc.forEach(d => {
+        if (d === null || typeof d === 'undefined') return;
+        indexer.add(d);
+      });
+      return;
+    }
+    indexer.add(doc);
+  }
+  function createNdjsonReader(file) {
+    let skippedHeader = false;
     const s = fs.createReadStream(file).pipe(split(splitRegex)).pipe(es.mapSync(line => {
       try {
         // skip empty lines
         if (line === '') {
           return;
         }
-
-
-        // if doc is undefined we'll skip indexing it
-        if (typeof doc === 'undefined') {
-          s.resume();
-          return;
-        }
-
-        // the transform callback may return an array of docs so we can emit
-        // multiple docs from a single line
-        if (Array.isArray(doc)) {
-          doc.forEach(d => indexer.add(d));
+        if (skipHeader && !skippedHeader) {
+          skippedHeader = true;
           return;
         }
-
+        const parsed = JSON.parse(line);
+        addParsedDoc(parsed, file, s);
       } catch (e) {
         console.log('error', e);
       }
     }).on('error', err => {
       console.log('Error while reading file.', err);
-    })
+    }));
+    return s;
+  }
+  function createCsvReader(file) {
+    const parserOptions = getCsvParserOptions(csvOptions, skipHeader);
+    const s = fs.createReadStream(file).pipe(csvParse.parse(parserOptions)).pipe(es.mapSync(record => {
+      try {
+        addParsedDoc(record, file, s);
+      } catch (e) {
+        console.log('error', e);
+      }
+    }).on('error', err => {
+      console.log('Error while reading CSV file.', err);
+    }));
+    return s;
+  }
+  function startIndex(files) {
+    let finished = false;
+    if (files.length === 0) {
+      indexer.finish();
+      return;
+    }
+    const file = files.shift();
+    const s = sourceFormat === 'csv' ? createCsvReader(file) : createNdjsonReader(file);
+    s.on('end', () => {
       if (verbose) console.log('Read entire file: ', file);
       if (files.length > 0) {
         startIndex(files);
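In short, user-supplied `csvOptions` are layered over opinionated defaults, and `skipHeader` only adds `from_line: 2` when the configured `columns` value will not consume the header row itself. `getCsvParserOptions` is internal to the bundle; the calls below are only a sketch of the values it computes:

```javascript
// Defaults only: the header row becomes the column names, so nothing extra is skipped.
getCsvParserOptions({}, true);
// => { bom: true, columns: true, trim: true, skip_empty_lines: true }

// Explicit column names: the header row is plain data to csv-parse,
// so skipHeader translates into from_line: 2.
getCsvParserOptions({ columns: ['id', 'name'] }, true);
// => { bom: true, columns: ['id', 'name'], trim: true, skip_empty_lines: true, from_line: 2 }
```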
@@ -136,7 +203,7 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
       }
       indexer.finish();
       finished = true;
-    })
+    });
     indexer.queueEmitter.on('pause', () => {
       if (finished) return;
       s.pause();
@@ -157,29 +224,26 @@
 }
 
 const EventEmitter = require('events');
-const queueEmitter = new EventEmitter();
 const parallelCalls = 5;
 
 // a simple helper queue to bulk index documents
 function indexQueueFactory({
   targetClient: client,
   targetIndexName,
-  bufferSize = DEFAULT_BUFFER_SIZE
-  skipHeader = false
+  bufferSize = DEFAULT_BUFFER_SIZE
 }) {
+  const queueEmitter = new EventEmitter();
   let docsPerSecond = 0;
   const flushBytes = bufferSize * 1024; // Convert KB to Bytes
   const highWaterMark = flushBytes * parallelCalls;
 
-  // Create a Readable stream
-  const stream$1 = new stream.Readable({
-    read() {},
-    // Implement read but we manage pushing manually
+  // Create a PassThrough stream (readable + writable) for proper backpressure
+  const stream$1 = new stream.PassThrough({
     highWaterMark // Buffer size for backpressure management
   });
   async function* ndjsonStreamIterator(readableStream) {
     let buffer = ''; // To hold the incomplete data
-
+
     try {
       // Iterate over the stream using async iteration
       for await (const chunk of readableStream) {
@@ -193,16 +257,14 @@ function indexQueueFactory({
 
       // Yield each complete JSON object
       for (const line of lines) {
-        if (line.trim()) {
-
-
-
-
-
-
-
-          console.error('Failed to parse JSON:', err);
-        }
+        if (!line.trim()) {
+          continue;
+        }
+        try {
+          yield JSON.parse(line); // Parse and yield the JSON object
+        } catch (err) {
+          // Handle JSON parse errors if necessary
+          console.error('Failed to parse JSON:', err);
         }
       }
     }
@@ -278,7 +340,7 @@ function indexQueueFactory({
       if (finished) {
         throw new Error('Unexpected doc added after indexer should finish.');
       }
-      const canContinue = stream$1.push(`${JSON.stringify(doc)}\n`);
+      const canContinue = stream$1.write(`${JSON.stringify(doc)}\n`);
      if (!canContinue) {
        queueEmitter.emit('pause');
 
@@ -291,7 +353,7 @@ function indexQueueFactory({
     },
     finish: () => {
       finished = true;
-      stream$1.push(null);
+      stream$1.end();
     },
     queueEmitter
   };
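The switch from a manually pushed `Readable` to a `PassThrough` lets the queue lean on Node's standard backpressure contract: `write()` returns `false` once the internal buffer passes `highWaterMark`, which is what triggers the `pause` event the readers listen for, and `end()` signals that no more documents will arrive. A standalone sketch of that contract (not the package's own code):

```javascript
const { PassThrough } = require('stream');

// A deliberately tiny highWaterMark forces backpressure on the first write.
const pass = new PassThrough({ highWaterMark: 16 });

const canContinue = pass.write(JSON.stringify({ hello: 'world' }) + '\n');
if (!canContinue) {
  // Mirrors the diff: the indexer emits 'pause' so readers stop feeding docs
  // until the bulk consumer has drained the stream.
  console.log('backpressure: pause the readers');
}
pass.end(); // equivalent of indexer.finish()
```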
@@ -408,40 +470,154 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client, query,
   };
 }
 
-
+const DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES = 100000;
+const DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE = 1000;
+function readSample(filePath, sampleBytes) {
+  const fd = fs.openSync(filePath, 'r');
+  try {
+    const buffer = Buffer.alloc(sampleBytes);
+    const bytesRead = fs.readSync(fd, buffer, 0, sampleBytes, 0);
+    return buffer.subarray(0, bytesRead).toString('utf8');
+  } finally {
+    fs.closeSync(fd);
+  }
+}
+function emptyInferenceResult(mappings) {
+  return {
+    mappings,
+    ingestPipeline: undefined
+  };
+}
+async function inferMappingsFromSource({
+  targetClient,
+  fileName,
+  sourceFormat,
+  csvOptions,
+  skipHeader,
+  mappings,
+  inferMappings,
+  inferMappingsOptions,
+  verbose
+}) {
+  if (!inferMappings || typeof mappings !== 'undefined' || typeof fileName === 'undefined') {
+    return emptyInferenceResult(mappings);
+  }
+  if (typeof targetClient?.textStructure?.findStructure !== 'function' || sourceFormat === 'xml' || sourceFormat === 'semi_structured_text') {
+    return emptyInferenceResult(mappings);
+  }
+  const files = glob.globSync(fileName);
+  if (files.length === 0) {
+    if (verbose) console.log(`No files matched for mapping inference: ${fileName}`);
+    return emptyInferenceResult(mappings);
+  }
+  const {
+    sampleBytes = DEFAULT_INFER_MAPPINGS_SAMPLE_BYTES,
+    ...requestParams
+  } = inferMappingsOptions || {};
+  const sampleText = readSample(files[0], sampleBytes);
+  if (!sampleText || sampleText.trim() === '') {
+    if (verbose) console.log('Skipping mapping inference because the sample text is empty.');
+    return emptyInferenceResult(mappings);
+  }
+  const params = {
+    body: sampleText,
+    lines_to_sample: DEFAULT_INFER_MAPPINGS_LINES_TO_SAMPLE,
+    ...requestParams
+  };
+  if (typeof params.format === 'undefined') {
+    params.format = sourceFormat === 'csv' ? 'delimited' : 'ndjson';
+  }
+  if (sourceFormat === 'csv') {
+    if (typeof params.delimiter === 'undefined' && typeof csvOptions?.delimiter === 'string') {
+      params.delimiter = csvOptions.delimiter;
+    }
+    if (typeof params.quote === 'undefined' && typeof csvOptions?.quote === 'string') {
+      params.quote = csvOptions.quote;
+    }
+    if (typeof params.has_header_row === 'undefined' && typeof csvOptions?.columns === 'boolean') {
+      params.has_header_row = csvOptions.columns;
+    }
+    if (typeof params.has_header_row === 'undefined' && skipHeader) {
+      params.has_header_row = true;
+    }
+  }
+  try {
+    const response = await targetClient.textStructure.findStructure(params);
+    if (response?.mappings && verbose) {
+      console.log(`Inferred mappings via _text_structure/find_structure from ${files[0]}`);
+    }
+    if (response?.ingest_pipeline && verbose) {
+      console.log('Inferred ingest pipeline via _text_structure/find_structure');
+    }
+    return {
+      mappings: response?.mappings || mappings,
+      ingestPipeline: response?.ingest_pipeline
+    };
+  } catch (error) {
+    if (verbose) {
+      console.log('Could not infer mappings via _text_structure/find_structure:', error.message);
+    }
+    return emptyInferenceResult(mappings);
+  }
+}
+
+function streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader = false, sourceFormat = 'ndjson', csvOptions = {}) {
+  function addParsedDoc(parsed, streamRef) {
+    const doc = typeof transform === 'function' ? transform(parsed) : parsed;
+
+    // if doc is null/undefined we'll skip indexing it
+    if (doc === null || typeof doc === 'undefined') {
+      streamRef.resume();
+      return;
+    }
+
+    // the transform callback may return an array of docs so we can emit
+    // multiple docs from a single line
+    if (Array.isArray(doc)) {
+      doc.forEach(d => {
+        if (d === null || typeof d === 'undefined') return;
+        indexer.add(d);
+      });
+      return;
+    }
+    indexer.add(doc);
+  }
   function startIndex() {
     let finished = false;
-    const s = stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+    const s = sourceFormat === 'csv' ? stream.pipe(csvParse.parse(getCsvParserOptions(csvOptions, skipHeader))).pipe(es.mapSync(record => {
       try {
-
-        if (line === '') {
-          return;
-        }
-        const doc = typeof transform === 'function' ? JSON.stringify(transform(JSON.parse(line))) : line;
-
-        // if doc is undefined we'll skip indexing it
-        if (typeof doc === 'undefined') {
-          s.resume();
-          return;
-        }
-
-        // the transform callback may return an array of docs so we can emit
-        // multiple docs from a single line
-        if (Array.isArray(doc)) {
-          doc.forEach(d => indexer.add(d));
-          return;
-        }
-        indexer.add(doc);
+        addParsedDoc(record, s);
       } catch (e) {
         console.log('error', e);
       }
     }).on('error', err => {
-      console.log('Error while reading stream.', err);
-    })
+      console.log('Error while reading CSV stream.', err);
+    })) : (() => {
+      let skippedHeader = false;
+      return stream.pipe(split(splitRegex)).pipe(es.mapSync(line => {
+        try {
+          // skip empty lines
+          if (line === '') {
+            return;
+          }
+          if (skipHeader && !skippedHeader) {
+            skippedHeader = true;
+            return;
+          }
+          const parsed = JSON.parse(line);
+          addParsedDoc(parsed, s);
+        } catch (e) {
+          console.log('error', e);
+        }
+      }).on('error', err => {
+        console.log('Error while reading stream.', err);
+      }));
+    })();
+    s.on('end', () => {
       if (verbose) console.log('Read entire stream.');
       indexer.finish();
       finished = true;
-    })
+    });
     indexer.queueEmitter.on('pause', () => {
       if (finished) return;
       s.pause();
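Because `streamReaderFactory` now takes `sourceFormat` and `csvOptions`, CSV data can come from any readable stream, not only from `fileName`. A minimal sketch; the file stream here is just a stand-in for whatever produces the data:

```javascript
const fs = require('fs');
const transformer = require('node-es-transformer');

transformer({
  stream: fs.createReadStream('users.csv'), // any Readable works
  sourceFormat: 'csv',
  targetIndexName: 'users-index',
  csvOptions: { columns: true },
});
```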
@@ -530,11 +706,15 @@ async function transformer({
   searchSize = DEFAULT_SEARCH_SIZE,
   stream,
   fileName,
+  sourceFormat = 'ndjson',
+  csvOptions = {},
   splitRegex = /\n/,
   sourceIndexName,
   targetIndexName,
   mappings,
   mappingsOverride = false,
+  inferMappings = false,
+  inferMappingsOptions = {},
   indexMappingTotalFieldsLimit,
   pipeline,
   populatedFields = false,
@@ -553,12 +733,24 @@ async function transformer({
   // Support both old (config) and new (client instance) patterns
   const sourceClient = await getOrCreateClient(sourceClientInput || sourceClientConfig, defaultClientConfig, sourceClientVersion);
   const targetClient = await getOrCreateClient(targetClientInput || targetClientConfig || sourceClientInput || sourceClientConfig, defaultClientConfig, targetClientVersion);
+  const inferenceResult = await inferMappingsFromSource({
+    targetClient,
+    fileName,
+    sourceFormat,
+    csvOptions,
+    skipHeader,
+    mappings,
+    inferMappings,
+    inferMappingsOptions,
+    verbose
+  });
   const createMapping = createMappingFactory({
     sourceClient,
     sourceIndexName,
     targetClient,
     targetIndexName,
-    mappings,
+    mappings: inferenceResult.mappings,
+    inferredIngestPipeline: inferenceResult.ingestPipeline,
     mappingsOverride,
     indexMappingTotalFieldsLimit,
     verbose,
@@ -568,8 +760,12 @@ async function transformer({
   const indexer = indexQueueFactory({
     targetClient,
     targetIndexName,
-    bufferSize
-
+    bufferSize});
+  function validateSourceFormat() {
+    if (sourceFormat !== 'ndjson' && sourceFormat !== 'csv') {
+      throw Error(`Unsupported sourceFormat: ${sourceFormat}. Use "ndjson" or "csv".`);
+    }
+  }
   function getReader() {
     if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
       throw Error('Only either one of fileName or sourceIndexName can be specified.');
@@ -578,13 +774,15 @@ async function transformer({
       throw Error('Only one of fileName, sourceIndexName, or stream can be specified.');
     }
     if (typeof fileName !== 'undefined') {
-
+      validateSourceFormat();
+      return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
     }
     if (typeof sourceIndexName !== 'undefined') {
       return indexReaderFactory(indexer, sourceIndexName, transform, sourceClient, query, searchSize, populatedFields);
     }
     if (typeof stream !== 'undefined') {
-
+      validateSourceFormat();
+      return streamReaderFactory(indexer, stream, transform, splitRegex, verbose, skipHeader, sourceFormat, csvOptions);
     }
     return null;
   }