node-es-transformer 1.0.0-beta1 → 1.0.0-beta2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -118,6 +118,7 @@ transformer({
118
118
  - `mappings`: Optional Elasticsearch document mappings. If not set and you're reindexing from another index, the mappings from the existing index will be used.
119
119
  - `mappingsOverride`: If you're reindexing and this is set to `true`, `mappings` will be applied on top of the source index's mappings. Defaults to `false`.
120
120
  - `indexMappingTotalFieldsLimit`: Optional field limit for the target index to be created that will be passed on as the `index.mapping.total_fields.limit` setting.
121
+ - `populatedFields`: If `true`, fetches a random sample of documents from the source index to identify which fields actually contain data, and requests only those fields when reading from the source index. This can improve query/reindex performance for indices with a large number of field mappings. Defaults to `false`.
121
122
  - `query`: Optional Elasticsearch [DSL query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html) to filter documents from the source index.
122
123
  - `skipHeader`: If `true`, skips the first line of the source file. Defaults to `false`.
123
124
  - `transform(line)`: A callback function which allows the transformation of a source line into one or several documents.
@@ -152,6 +153,8 @@ docker pull docker.elastic.co/elasticsearch/elasticsearch:8.10.4
152
153
  docker run --name es01 --net elastic -p 9200:9200 -it -m 1GB -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.10.4
153
154
  ```
154
155
 
156
+ To commit, use `cz`. To prepare a release, run, for example, `yarn release -- --release-as 1.0.0-beta2`.
157
+
155
158
  ## License
156
159
 
157
160
  [Apache 2.0](LICENSE).
@@ -77,7 +77,16 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
77
77
  es
78
78
  .mapSync(function (line) {
79
79
  try {
80
- var doc = typeof transform === 'function' ? transform(line) : line;
80
+ // skip empty lines
81
+ if (line === '') {
82
+ return;
83
+ }
84
+
85
+ var doc =
86
+ typeof transform === 'function'
87
+ ? JSON.stringify(transform(JSON.parse(line)))
88
+ : line;
89
+
81
90
  // if doc is undefined we'll skip indexing it
82
91
  if (typeof doc === 'undefined') {
83
92
  s.resume();
@@ -269,40 +278,73 @@ function indexReaderFactory(
269
278
  transform,
270
279
  client,
271
280
  query,
272
- bufferSize
281
+ bufferSize,
282
+ populatedFields
273
283
  ) {
274
284
  if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
285
+ if ( populatedFields === void 0 ) populatedFields = false;
275
286
 
276
287
  return async function indexReader() {
277
288
  var responseQueue = [];
278
289
  var docsNum = 0;
279
290
 
280
- function search() {
281
- return client.search({
282
- index: sourceIndexName,
283
- scroll: '30s',
291
+ async function fetchPopulatedFields() {
292
+ try {
293
+ var response = await client.search({
294
+ index: sourceIndexName,
295
+ size: bufferSize,
296
+ query: {
297
+ function_score: {
298
+ query: query,
299
+ random_score: {},
300
+ },
301
+ },
302
+ });
303
+
304
+ // Get all field names for each returned doc and flatten it
305
+ // to a list of unique field names used across all docs.
306
+ return new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1));
307
+ } catch (e) {
308
+ console.log('error', e);
309
+ }
310
+ }
311
+
312
+ function search(fields) {
313
+ return client.search(Object.assign({}, {index: sourceIndexName,
314
+ scroll: '600s',
284
315
  size: bufferSize,
285
- query: query,
286
- });
316
+ query: query},
317
+ (fields ? { _source: fields } : {})));
287
318
  }
288
319
 
289
320
  function scroll(id) {
290
321
  return client.scroll({
291
322
  scroll_id: id,
292
- scroll: '30s',
323
+ scroll: '600s',
293
324
  });
294
325
  }
295
326
 
327
+ var fieldsWithData;
328
+
329
+ // identify populated fields
330
+ if (populatedFields) {
331
+ fieldsWithData = await fetchPopulatedFields();
332
+ console.log('fieldsWithData', fieldsWithData);
333
+ }
334
+
296
335
  // start things off by searching, setting a scroll timeout, and pushing
297
336
  // our first response into the queue to be processed
298
- var se = await search();
337
+ var se = await search(fieldsWithData);
299
338
  responseQueue.push(se);
300
339
  progressBar.start(se.hits.total.value, 0);
340
+ console.log('se', se.hits.hits[0]);
301
341
 
302
342
  function processHit(hit) {
303
343
  docsNum += 1;
304
344
  try {
305
345
  var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
346
+ // console.log('doc', doc);
347
+
306
348
  // if doc is undefined we'll skip indexing it
307
349
  if (typeof doc === 'undefined') {
308
350
  return;
@@ -398,6 +440,7 @@ async function transformer(ref) {
398
440
  var mappings = ref.mappings;
399
441
  var mappingsOverride = ref.mappingsOverride; if ( mappingsOverride === void 0 ) mappingsOverride = false;
400
442
  var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
443
+ var populatedFields = ref.populatedFields; if ( populatedFields === void 0 ) populatedFields = false;
401
444
  var query = ref.query;
402
445
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
403
446
  var transform = ref.transform;
@@ -454,7 +497,8 @@ async function transformer(ref) {
454
497
  transform,
455
498
  sourceClient,
456
499
  query,
457
- bufferSize
500
+ bufferSize,
501
+ populatedFields
458
502
  );
459
503
  }
460
504
 
@@ -73,7 +73,16 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
73
73
  es
74
74
  .mapSync(function (line) {
75
75
  try {
76
- var doc = typeof transform === 'function' ? transform(line) : line;
76
+ // skip empty lines
77
+ if (line === '') {
78
+ return;
79
+ }
80
+
81
+ var doc =
82
+ typeof transform === 'function'
83
+ ? JSON.stringify(transform(JSON.parse(line)))
84
+ : line;
85
+
77
86
  // if doc is undefined we'll skip indexing it
78
87
  if (typeof doc === 'undefined') {
79
88
  s.resume();
@@ -265,40 +274,73 @@ function indexReaderFactory(
265
274
  transform,
266
275
  client,
267
276
  query,
268
- bufferSize
277
+ bufferSize,
278
+ populatedFields
269
279
  ) {
270
280
  if ( bufferSize === void 0 ) bufferSize = DEFAULT_BUFFER_SIZE;
281
+ if ( populatedFields === void 0 ) populatedFields = false;
271
282
 
272
283
  return async function indexReader() {
273
284
  var responseQueue = [];
274
285
  var docsNum = 0;
275
286
 
276
- function search() {
277
- return client.search({
278
- index: sourceIndexName,
279
- scroll: '30s',
287
+ async function fetchPopulatedFields() {
288
+ try {
289
+ var response = await client.search({
290
+ index: sourceIndexName,
291
+ size: bufferSize,
292
+ query: {
293
+ function_score: {
294
+ query: query,
295
+ random_score: {},
296
+ },
297
+ },
298
+ });
299
+
300
+ // Get all field names for each returned doc and flatten it
301
+ // to a list of unique field names used across all docs.
302
+ return new Set(response.hits.hits.map(function (d) { return Object.keys(d._source); }).flat(1));
303
+ } catch (e) {
304
+ console.log('error', e);
305
+ }
306
+ }
307
+
308
+ function search(fields) {
309
+ return client.search(Object.assign({}, {index: sourceIndexName,
310
+ scroll: '600s',
280
311
  size: bufferSize,
281
- query: query,
282
- });
312
+ query: query},
313
+ (fields ? { _source: fields } : {})));
283
314
  }
284
315
 
285
316
  function scroll(id) {
286
317
  return client.scroll({
287
318
  scroll_id: id,
288
- scroll: '30s',
319
+ scroll: '600s',
289
320
  });
290
321
  }
291
322
 
323
+ var fieldsWithData;
324
+
325
+ // identify populated fields
326
+ if (populatedFields) {
327
+ fieldsWithData = await fetchPopulatedFields();
328
+ console.log('fieldsWithData', fieldsWithData);
329
+ }
330
+
292
331
  // start things off by searching, setting a scroll timeout, and pushing
293
332
  // our first response into the queue to be processed
294
- var se = await search();
333
+ var se = await search(fieldsWithData);
295
334
  responseQueue.push(se);
296
335
  progressBar.start(se.hits.total.value, 0);
336
+ console.log('se', se.hits.hits[0]);
297
337
 
298
338
  function processHit(hit) {
299
339
  docsNum += 1;
300
340
  try {
301
341
  var doc = typeof transform === 'function' ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
342
+ // console.log('doc', doc);
343
+
302
344
  // if doc is undefined we'll skip indexing it
303
345
  if (typeof doc === 'undefined') {
304
346
  return;
@@ -394,6 +436,7 @@ async function transformer(ref) {
394
436
  var mappings = ref.mappings;
395
437
  var mappingsOverride = ref.mappingsOverride; if ( mappingsOverride === void 0 ) mappingsOverride = false;
396
438
  var indexMappingTotalFieldsLimit = ref.indexMappingTotalFieldsLimit;
439
+ var populatedFields = ref.populatedFields; if ( populatedFields === void 0 ) populatedFields = false;
397
440
  var query = ref.query;
398
441
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
399
442
  var transform = ref.transform;
@@ -450,7 +493,8 @@ async function transformer(ref) {
450
493
  transform,
451
494
  sourceClient,
452
495
  query,
453
- bufferSize
496
+ bufferSize,
497
+ populatedFields
454
498
  );
455
499
  }
456
500
 
package/package.json CHANGED
@@ -14,7 +14,7 @@
14
14
  "license": "Apache-2.0",
15
15
  "author": "Walter Rafelsberger <walter@rafelsberger.at>",
16
16
  "contributors": [],
17
- "version": "1.0.0-beta1",
17
+ "version": "1.0.0-beta2",
18
18
  "main": "dist/node-es-transformer.cjs.js",
19
19
  "module": "dist/node-es-transformer.esm.js",
20
20
  "dependencies": {
@@ -25,6 +25,7 @@
25
25
  },
26
26
  "devDependencies": {
27
27
  "acorn": "^6.4.2",
28
+ "async-retry": "^1.3.3",
28
29
  "commit-and-tag-version": "^11.3.0",
29
30
  "cz-conventional-changelog": "^3.3.0",
30
31
  "eslint": "^8.51.0",
@@ -35,7 +36,6 @@
35
36
  "eslint-plugin-jsx-a11y": "6.7.1",
36
37
  "eslint-plugin-prettier": "^3.3.1",
37
38
  "eslint-plugin-react": "7.32.2",
38
- "frisby": "^2.1.3",
39
39
  "jest": "^29.7.0",
40
40
  "prettier": "^2.2.1",
41
41
  "rollup": "0.66.6",
@@ -46,7 +46,7 @@
46
46
  "scripts": {
47
47
  "build": "rollup -c",
48
48
  "dev": "rollup -c -w",
49
- "test": "jest",
49
+ "test": "jest --runInBand --detectOpenHandles --forceExit",
50
50
  "pretest": "npm run build",
51
51
  "release": "commit-and-tag-version",
52
52
  "create-sample-data-10000": "node scripts/create_sample_data_10000",
@@ -59,5 +59,10 @@
59
59
  "commitizen": {
60
60
  "path": "./node_modules/cz-conventional-changelog"
61
61
  }
62
+ },
63
+ "jest": {
64
+ "testMatch": [
65
+ "**/__tests__/**/*.test.js"
66
+ ]
62
67
  }
63
68
  }