node-es-transformer 1.0.0-beta1 → 1.0.0-beta2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/node-es-transformer.cjs.js +55 -11
- package/dist/node-es-transformer.esm.js +55 -11
- package/package.json +8 -3
package/README.md
CHANGED
|
@@ -118,6 +118,7 @@ transformer({
|
|
|
118
118
|
- `mappings`: Optional Elasticsearch document mappings. If not set and you're reindexing from another index, the mappings from the existing index will be used.
|
|
119
119
|
- `mappingsOverride`: If you're reindexing and this is set to `true`, `mappings` will be applied on top of the source index's mappings. Defaults to `false`.
|
|
120
120
|
- `indexMappingTotalFieldsLimit`: Optional field limit for the target index to be created that will be passed on as the `index.mapping.total_fields.limit` setting.
|
|
121
|
+
- `populatedFields`: If `true`, fetches a random sample of documents from the source index to identify which fields are actually populated, and requests only those fields when reading the source documents. Useful for indices with many field mappings, where it can improve query/reindex performance. Defaults to `false`.
|
|
121
122
|
- `query`: Optional Elasticsearch [DSL query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) to filter documents from the source index.
|
|
122
123
|
- `skipHeader`: If true, skips the first line of the source file. Defaults to `false`.
|
|
123
124
|
- `transform(line)`: A callback function which allows the transformation of a source line into one or several documents.
|
|
@@ -152,6 +153,8 @@ docker pull docker.elastic.co/elasticsearch/elasticsearch:8.10.4
|
|
|
152
153
|
docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.10.4
|
|
153
154
|
```
|
|
154
155
|
|
|
156
|
+
To commit, use `cz`. To prepare a release, use e.g. `yarn release -- --release-as 1.0.0-beta2`.
|
|
157
|
+
|
|
155
158
|
## License
|
|
156
159
|
|
|
157
160
|
[Apache 2.0](LICENSE).
|
|
@@ -77,7 +77,16 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
|
|
|
77
77
|
es
|
|
78
78
|
.mapSync(function (line) {
|
|
79
79
|
try {
|
|
80
|
-
|
|
80
|
+
// skip empty lines
|
|
81
|
+
if (line === '') {
|
|
82
|
+
return;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
var doc =
|
|
86
|
+
typeof transform === 'function'
|
|
87
|
+
? JSON.stringify(transform(JSON.parse(line)))
|
|
88
|
+
: line;
|
|
89
|
+
|
|
81
90
|
// if doc is undefined we'll skip indexing it
|
|
82
91
|
if (typeof doc === 'undefined') {
|
|
83
92
|
s.resume();
|
|
@@ -269,40 +278,73 @@ function indexReaderFactory(
|
|
|
269
278
|
transform,
|
|
270
279
|
client,
|
|
271
280
|
query,
|
|
272
|
-
bufferSize
|
|
281
|
+
bufferSize,
|
|
282
|
+
populatedFields
|
|
273
283
|
) {
|
|
274
284
|
if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
|
|
285
|
+
if ( populatedFields === void 0 ) populatedFields = false;
|
|
275
286
|
|
|
276
287
|
return async function indexReader() {
|
|
277
288
|
var responseQueue = [];
|
|
278
289
|
var docsNum = 0;
|
|
279
290
|
|
|
280
|
-
function
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
291
|
+
async function fetchPopulatedFields() {
|
|
292
|
+
try {
|
|
293
|
+
var response = await client.search({
|
|
294
|
+
index: sourceIndexName,
|
|
295
|
+
size: bufferSize,
|
|
296
|
+
query: {
|
|
297
|
+
function_score: {
|
|
298
|
+
query: query,
|
|
299
|
+
random_score: {},
|
|
300
|
+
},
|
|
301
|
+
},
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
// Get all field names for each returned doc and flatten it
|
|
305
|
+
// to a list of unique field names used across all docs.
|
|
306
|
+
return new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1));
|
|
307
|
+
} catch (e) {
|
|
308
|
+
console.log('error', e);
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
function search(fields) {
|
|
313
|
+
return client.search(Object.assign({}, {index: sourceIndexName,
|
|
314
|
+
scroll: '600s',
|
|
284
315
|
size: bufferSize,
|
|
285
|
-
query: query,
|
|
286
|
-
|
|
316
|
+
query: query},
|
|
317
|
+
(fields ? { _source: fields } : {})));
|
|
287
318
|
}
|
|
288
319
|
|
|
289
320
|
function scroll(id) {
|
|
290
321
|
return client.scroll({
|
|
291
322
|
scroll_id: id,
|
|
292
|
-
scroll: '
|
|
323
|
+
scroll: '600s',
|
|
293
324
|
});
|
|
294
325
|
}
|
|
295
326
|
|
|
327
|
+
var fieldsWithData;
|
|
328
|
+
|
|
329
|
+
// identify populated fields
|
|
330
|
+
if (populatedFields) {
|
|
331
|
+
fieldsWithData = await fetchPopulatedFields();
|
|
332
|
+
console.log('fieldsWithData', fieldsWithData);
|
|
333
|
+
}
|
|
334
|
+
|
|
296
335
|
// start things off by searching, setting a scroll timeout, and pushing
|
|
297
336
|
// our first response into the queue to be processed
|
|
298
|
-
var se = await search();
|
|
337
|
+
var se = await search(fieldsWithData);
|
|
299
338
|
responseQueue.push(se);
|
|
300
339
|
progressBar.start(se.hits.total.value, 0);
|
|
340
|
+
console.log('se', se.hits.hits[0]);
|
|
301
341
|
|
|
302
342
|
function processHit(hit) {
|
|
303
343
|
docsNum += 1;
|
|
304
344
|
try {
|
|
305
345
|
var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
|
|
346
|
+
// console.log('doc', doc);
|
|
347
|
+
|
|
306
348
|
// if doc is undefined we'll skip indexing it
|
|
307
349
|
if (typeof doc === 'undefined') {
|
|
308
350
|
return;
|
|
@@ -398,6 +440,7 @@ async function transformer(ref) {
|
|
|
398
440
|
var mappings = ref.mappings;
|
|
399
441
|
var mappingsOverride = ref.mappingsOverride; if ( mappingsOverride === void 0 ) mappingsOverride = false;
|
|
400
442
|
var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
|
|
443
|
+
var populatedFields = ref.populatedFields; if ( populatedFields === void 0 ) populatedFields = false;
|
|
401
444
|
var query = ref.query;
|
|
402
445
|
var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
|
|
403
446
|
var transform = ref.transform;
|
|
@@ -454,7 +497,8 @@ async function transformer(ref) {
|
|
|
454
497
|
transform,
|
|
455
498
|
sourceClient,
|
|
456
499
|
query,
|
|
457
|
-
bufferSize
|
|
500
|
+
bufferSize,
|
|
501
|
+
populatedFields
|
|
458
502
|
);
|
|
459
503
|
}
|
|
460
504
|
|
|
@@ -73,7 +73,16 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
|
|
|
73
73
|
es
|
|
74
74
|
.mapSync(function (line) {
|
|
75
75
|
try {
|
|
76
|
-
|
|
76
|
+
// skip empty lines
|
|
77
|
+
if (line === '') {
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
var doc =
|
|
82
|
+
typeof transform === 'function'
|
|
83
|
+
? JSON.stringify(transform(JSON.parse(line)))
|
|
84
|
+
: line;
|
|
85
|
+
|
|
77
86
|
// if doc is undefined we'll skip indexing it
|
|
78
87
|
if (typeof doc === 'undefined') {
|
|
79
88
|
s.resume();
|
|
@@ -265,40 +274,73 @@ function indexReaderFactory(
|
|
|
265
274
|
transform,
|
|
266
275
|
client,
|
|
267
276
|
query,
|
|
268
|
-
bufferSize
|
|
277
|
+
bufferSize,
|
|
278
|
+
populatedFields
|
|
269
279
|
) {
|
|
270
280
|
if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
|
|
281
|
+
if ( populatedFields === void 0 ) populatedFields = false;
|
|
271
282
|
|
|
272
283
|
return async function indexReader() {
|
|
273
284
|
var responseQueue = [];
|
|
274
285
|
var docsNum = 0;
|
|
275
286
|
|
|
276
|
-
function
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
287
|
+
async function fetchPopulatedFields() {
|
|
288
|
+
try {
|
|
289
|
+
var response = await client.search({
|
|
290
|
+
index: sourceIndexName,
|
|
291
|
+
size: bufferSize,
|
|
292
|
+
query: {
|
|
293
|
+
function_score: {
|
|
294
|
+
query: query,
|
|
295
|
+
random_score: {},
|
|
296
|
+
},
|
|
297
|
+
},
|
|
298
|
+
});
|
|
299
|
+
|
|
300
|
+
// Get all field names for each returned doc and flatten it
|
|
301
|
+
// to a list of unique field names used across all docs.
|
|
302
|
+
return new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1));
|
|
303
|
+
} catch (e) {
|
|
304
|
+
console.log('error', e);
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
function search(fields) {
|
|
309
|
+
return client.search(Object.assign({}, {index: sourceIndexName,
|
|
310
|
+
scroll: '600s',
|
|
280
311
|
size: bufferSize,
|
|
281
|
-
query: query,
|
|
282
|
-
|
|
312
|
+
query: query},
|
|
313
|
+
(fields ? { _source: fields } : {})));
|
|
283
314
|
}
|
|
284
315
|
|
|
285
316
|
function scroll(id) {
|
|
286
317
|
return client.scroll({
|
|
287
318
|
scroll_id: id,
|
|
288
|
-
scroll: '
|
|
319
|
+
scroll: '600s',
|
|
289
320
|
});
|
|
290
321
|
}
|
|
291
322
|
|
|
323
|
+
var fieldsWithData;
|
|
324
|
+
|
|
325
|
+
// identify populated fields
|
|
326
|
+
if (populatedFields) {
|
|
327
|
+
fieldsWithData = await fetchPopulatedFields();
|
|
328
|
+
console.log('fieldsWithData', fieldsWithData);
|
|
329
|
+
}
|
|
330
|
+
|
|
292
331
|
// start things off by searching, setting a scroll timeout, and pushing
|
|
293
332
|
// our first response into the queue to be processed
|
|
294
|
-
var se = await search();
|
|
333
|
+
var se = await search(fieldsWithData);
|
|
295
334
|
responseQueue.push(se);
|
|
296
335
|
progressBar.start(se.hits.total.value, 0);
|
|
336
|
+
console.log('se', se.hits.hits[0]);
|
|
297
337
|
|
|
298
338
|
function processHit(hit) {
|
|
299
339
|
docsNum += 1;
|
|
300
340
|
try {
|
|
301
341
|
var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
|
|
342
|
+
// console.log('doc', doc);
|
|
343
|
+
|
|
302
344
|
// if doc is undefined we'll skip indexing it
|
|
303
345
|
if (typeof doc === 'undefined') {
|
|
304
346
|
return;
|
|
@@ -394,6 +436,7 @@ async function transformer(ref) {
|
|
|
394
436
|
var mappings = ref.mappings;
|
|
395
437
|
var mappingsOverride = ref.mappingsOverride; if ( mappingsOverride === void 0 ) mappingsOverride = false;
|
|
396
438
|
var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
|
|
439
|
+
var populatedFields = ref.populatedFields; if ( populatedFields === void 0 ) populatedFields = false;
|
|
397
440
|
var query = ref.query;
|
|
398
441
|
var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
|
|
399
442
|
var transform = ref.transform;
|
|
@@ -450,7 +493,8 @@ async function transformer(ref) {
|
|
|
450
493
|
transform,
|
|
451
494
|
sourceClient,
|
|
452
495
|
query,
|
|
453
|
-
bufferSize
|
|
496
|
+
bufferSize,
|
|
497
|
+
populatedFields
|
|
454
498
|
);
|
|
455
499
|
}
|
|
456
500
|
|
package/package.json
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"license": "Apache-2.0",
|
|
15
15
|
"author": "Walter Rafelsberger <walter@rafelsberger.at>",
|
|
16
16
|
"contributors": [],
|
|
17
|
-
"version": "1.0.0-beta1",
|
|
17
|
+
"version": "1.0.0-beta2",
|
|
18
18
|
"main": "dist/node-es-transformer.cjs.js",
|
|
19
19
|
"module": "dist/node-es-transformer.esm.js",
|
|
20
20
|
"dependencies": {
|
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
},
|
|
26
26
|
"devDependencies": {
|
|
27
27
|
"acorn": "^6.4.2",
|
|
28
|
+
"async-retry": "^1.3.3",
|
|
28
29
|
"commit-and-tag-version": "^11.3.0",
|
|
29
30
|
"cz-conventional-changelog": "^3.3.0",
|
|
30
31
|
"eslint": "^8.51.0",
|
|
@@ -35,7 +36,6 @@
|
|
|
35
36
|
"eslint-plugin-jsx-a11y": "6.7.1",
|
|
36
37
|
"eslint-plugin-prettier": "^3.3.1",
|
|
37
38
|
"eslint-plugin-react": "7.32.2",
|
|
38
|
-
"frisby": "^2.1.3",
|
|
39
39
|
"jest": "^29.7.0",
|
|
40
40
|
"prettier": "^2.2.1",
|
|
41
41
|
"rollup": "0.66.6",
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
"scripts": {
|
|
47
47
|
"build": "rollup -c",
|
|
48
48
|
"dev": "rollup -c -w",
|
|
49
|
-
"test": "jest",
|
|
49
|
+
"test": "jest --runInBand --detectOpenHandles --forceExit",
|
|
50
50
|
"pretest": "npm run build",
|
|
51
51
|
"release": "commit-and-tag-version",
|
|
52
52
|
"create-sample-data-10000": "node scripts/create_sample_data_10000",
|
|
@@ -59,5 +59,10 @@
|
|
|
59
59
|
"commitizen": {
|
|
60
60
|
"path": "./node_modules/cz-conventional-changelog"
|
|
61
61
|
}
|
|
62
|
+
},
|
|
63
|
+
"jest": {
|
|
64
|
+
"testMatch": [
|
|
65
|
+
"**/__tests__/**/*.test.js"
|
|
66
|
+
]
|
|
62
67
|
}
|
|
63
68
|
}
|