node-es-transformer 1.0.0-alpha7 → 1.0.0-alpha9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,7 +12,6 @@ If you're looking for a nodejs based tool which allows you to ingest large CSV/J
12
12
 
13
13
  While I'd generally recommend using [Logstash](https://www.elastic.co/products/logstash), [filebeat](https://www.elastic.co/products/beats/filebeat) or [Ingest Nodes](https://www.elastic.co/guide/en/elasticsearch/reference/master/ingest.html) for established use cases, this tool may be of help especially if you feel more at home in the JavaScript/nodejs universe and have use cases with customized ingestion and data transformation needs.
14
14
 
15
-
16
15
  **This is experimental code, use at your own risk. Nonetheless, I encourage you to give it a try so I can gather some feedback.**
17
16
 
18
17
  ### So why is this still _alpha_?
@@ -21,13 +20,13 @@ While I'd generally recommend using [Logstash](https://www.elastic.co/products/l
21
20
  - The code needs some more safety measures to avoid some possible accidental data loss scenarios.
22
21
  - No test coverage yet.
23
22
 
24
- ----
23
+ ---
25
24
 
26
25
  Now that we've talked about the caveats, let's have a look at what you actually get with this tool:
27
26
 
28
27
  ## Features
29
28
 
30
- - Buffering/Streaming for both reading and indexing. Files are read using streaming and Elasticsearch ingestion is done using buffered bulk indexing. This is tailored towards ingestion of large files. Successfully tested so far with JSON and CSV files in the range of 20-30 GBytes. On a single machine running both `node-es-transformer` and Elasticsearch ingestion rates up to 20k documents/second were achieved (2,9 GHz Intel Core i7, 16GByte RAM, SSD).
29
+ - Buffering/Streaming for both reading and indexing. Files are read using streaming and Elasticsearch ingestion is done using buffered bulk indexing. This is tailored towards ingestion of large files. Successfully tested so far with JSON and CSV files in the range of 20-30 GBytes. On a single machine running both `node-es-transformer` and Elasticsearch, ingestion rates of up to 20k documents/second were achieved (2.9 GHz Intel Core i7, 16 GByte RAM, SSD).
31
30
  - Supports wildcards to ingest/transform a range of files in one go.
32
31
  - Supports fetching documents from existing indices using search/scroll. This allows you to reindex with custom data transformations just using JavaScript in the `transform` callback.
33
32
  - The `transform` callback gives you each source document, but you can split it up into multiple ones and return an array of documents. An example use case for this: Each source document is a Tweet and you want to transform that into an entity-centric index based on Hashtags.
@@ -46,22 +45,19 @@ const transformer = require('node-es-transformer');
46
45
  transformer({
47
46
  fileName: 'filename.json',
48
47
  targetIndexName: 'my-index',
49
- typeName: 'doc',
50
48
  mappings: {
51
- doc: {
52
- properties: {
53
- '@timestamp': {
54
- type: 'date'
55
- },
56
- 'first_name': {
57
- type: 'keyword'
58
- },
59
- 'last_name': {
60
- type: 'keyword'
61
- }
62
- 'full_name': {
63
- type: 'keyword'
64
- }
49
+ properties: {
50
+ '@timestamp': {
51
+ type: 'date'
52
+ },
53
+ 'first_name': {
54
+ type: 'keyword'
55
+ },
56
+ 'last_name': {
57
+ type: 'keyword'
58
+ },
59
+ 'full_name': {
60
+ type: 'keyword'
65
61
  }
66
62
  }
67
63
  },
@@ -82,22 +78,20 @@ const transformer = require('node-es-transformer');
82
78
  transformer({
83
79
  sourceIndexName: 'my-source-index',
84
80
  targetIndexName: 'my-target-index',
85
- typeName: 'doc',
81
+ // optional, if you skip mappings, they will be fetched from the source index.
86
82
  mappings: {
87
- doc: {
88
- properties: {
89
- '@timestamp': {
90
- type: 'date'
91
- },
92
- 'first_name': {
93
- type: 'keyword'
94
- },
95
- 'last_name': {
96
- type: 'keyword'
97
- }
98
- 'full_name': {
99
- type: 'keyword'
100
- }
83
+ properties: {
84
+ '@timestamp': {
85
+ type: 'date'
86
+ },
87
+ 'first_name': {
88
+ type: 'keyword'
89
+ },
90
+ 'last_name': {
91
+ type: 'keyword'
92
+ },
93
+ 'full_name': {
94
+ type: 'keyword'
101
95
  }
102
96
  }
103
97
  },
@@ -112,15 +106,13 @@ transformer({
112
106
 
113
107
  ### Options
114
108
 
115
- - `deleteIndex`: Setting to automatically delete an existing index, default is `false`.
116
- - `host`: Elasticsearch host, defaults to `localhost`.
117
- - `port`: Elasticsearch port, defaults to `9200`.
109
+ - `deleteIndex`: Setting to automatically delete an existing index, default is `false`.
110
+ - `sourceClientConfig`/`targetClientConfig`: Optional Elasticsearch client options, defaults to `{ node: 'http://localhost:9200' }`.
118
111
  - `bufferSize`: The number of documents inserted with each Elasticsearch bulk insert request, default is `1000`.
119
112
  - `fileName`: Source filename to ingest, supports wildcards. If this is set, `sourceIndexName` is not allowed.
120
113
  - `splitRegex`: Custom line split regex, defaults to `/\n/`.
121
- - `sourceIndexName`: The source Elasticsearch to reindex from. If this is set, `fileName` is not allowed.
114
+ - `sourceIndexName`: The source Elasticsearch index to reindex from. If this is set, `fileName` is not allowed.
122
115
  - `targetIndexName`: The target Elasticsearch index where documents will be indexed.
123
- - `typeName`: Elasticsearch document type name.
124
116
  - `mappings`: Elasticsearch document mapping.
125
117
  - `skipHeader`: If true, skips the first line of the source file. Defaults to `false`.
126
118
  - `transform(line)`: A callback function which allows the transformation of a source line into one or several documents.
@@ -138,10 +130,10 @@ yarn
138
130
 
139
131
  `yarn build` builds the library to `dist`, generating two files:
140
132
 
141
- * `dist/node-es-transformer.cjs.js`
142
- A CommonJS bundle, suitable for use in Node.js, that `require`s the external dependency. This corresponds to the `"main"` field in package.json
143
- * `dist/node-es-transformer.esm.js`
144
- an ES module bundle, suitable for use in other people's libraries and applications, that `import`s the external dependency. This corresponds to the `"module"` field in package.json
133
+ - `dist/node-es-transformer.cjs.js`
134
+ A CommonJS bundle, suitable for use in Node.js, that `require`s the external dependency. This corresponds to the `"main"` field in package.json
135
+ - `dist/node-es-transformer.esm.js`
136
+ an ES module bundle, suitable for use in other people's libraries and applications, that `import`s the external dependency. This corresponds to the `"module"` field in package.json
145
137
 
146
138
  `yarn dev` builds the library, then keeps rebuilding it whenever the source files change using [rollup-watch](https://github.com/rollup/rollup-watch).
147
139
 
@@ -5,40 +5,47 @@ function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'defau
5
5
  var fs = _interopDefault(require('fs'));
6
6
  var es = _interopDefault(require('event-stream'));
7
7
  var glob = _interopDefault(require('glob'));
8
- var elasticsearch = _interopDefault(require('elasticsearch'));
8
+ var cliProgress = _interopDefault(require('cli-progress'));
9
+ var elasticsearch = _interopDefault(require('@elastic/elasticsearch'));
9
10
 
10
11
  function createMappingFactory(ref) {
11
- var client = ref.client;
12
+ var sourceClient = ref.sourceClient;
13
+ var sourceIndexName = ref.sourceIndexName;
14
+ var targetClient = ref.targetClient;
12
15
  var targetIndexName = ref.targetIndexName;
13
16
  var mappings = ref.mappings;
14
17
  var verbose = ref.verbose;
15
18
 
16
- return function () { return (new Promise(function (resolve, reject) {
17
- console.log('targetIndexName', targetIndexName);
18
- if (
19
- typeof mappings === 'object'
20
- && mappings !== null
21
- ) {
22
- client.indices.create({
23
- index: targetIndexName,
24
- body: { mappings: mappings },
25
- }, function (err, resp) {
26
- if (err) {
27
- console.log('Error creating mapping', err);
28
- reject();
29
- return;
30
- }
31
- if (verbose) { console.log('Created mapping', resp); }
32
- resolve();
33
- });
34
- } else {
35
- resolve();
19
+ return async function () {
20
+ var targetMappings = mappings;
21
+
22
+ if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
23
+ try {
24
+ var mapping = await sourceClient.indices.getMapping({ index: sourceIndexName });
25
+ targetMappings = mapping[sourceIndexName].mappings;
26
+ } catch (err) {
27
+ console.log('Error reading source mapping', err);
28
+ return;
29
+ }
36
30
  }
37
- })); };
31
+
32
+ if (typeof targetMappings === 'object' && targetMappings !== null) {
33
+ try {
34
+ var resp = await targetClient.indices.create(
35
+ {
36
+ index: targetIndexName,
37
+ body: { mappings: targetMappings },
38
+ }
39
+ );
40
+ if (verbose) { console.log('Created target mapping', resp); }
41
+ } catch (err) {
42
+ console.log('Error creating target mapping', err);
43
+ }
44
+ }
45
+ };
38
46
  }
39
47
 
40
48
  function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
41
- console.log('splitRegex', splitRegex);
42
49
  function startIndex(files) {
43
50
  var file = files.shift();
44
51
  var s = fs.createReadStream(file)
@@ -48,15 +55,11 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
48
55
  try {
49
56
  var doc = (typeof transform === 'function') ? transform(line) : line;
50
57
  // if doc is undefined we'll skip indexing it
51
- if (
52
- typeof doc === 'undefined'
53
- || (Array.isArray(doc) && doc.length === 0)
54
- ) {
58
+ if (typeof doc === 'undefined') {
55
59
  s.resume();
56
60
  return;
57
61
  }
58
62
 
59
- //console.log('continue?');
60
63
  // the transform callback may return an array of docs so we can emit
61
64
  // multiple docs from a single line
62
65
  if (Array.isArray(doc)) {
@@ -81,7 +84,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
81
84
  }));
82
85
 
83
86
  indexer.queueEmitter.on('resume', function () {
84
- //console.log('on resume');
85
87
  s.resume();
86
88
  });
87
89
  }
@@ -99,9 +101,8 @@ var queueEmitter = new EventEmitter();
99
101
 
100
102
  // a simple helper queue to bulk index documents
101
103
  function indexQueueFactory(ref) {
102
- var client = ref.client;
104
+ var client = ref.targetClient;
103
105
  var targetIndexName = ref.targetIndexName;
104
- var typeName = ref.typeName; if ( typeName === void 0 ) typeName = 'doc';
105
106
  var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = 1000;
106
107
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
107
108
  var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
@@ -110,36 +111,40 @@ function indexQueueFactory(ref) {
110
111
  var queue = [];
111
112
  var ingesting = false;
112
113
 
113
- var ingest = function (b) {
114
+ var ingest = async function (b) {
114
115
  if (typeof b !== 'undefined') {
115
116
  queue.push(b);
117
+ queueEmitter.emit('queue-size', queue.length);
116
118
  }
117
119
 
118
120
  if (ingesting === false) {
119
121
  var docs = queue.shift();
122
+ queueEmitter.emit('queue-size', queue.length);
120
123
  ingesting = true;
121
124
  if (verbose) { console.log(("bulk ingest docs: " + (docs.length / 2) + ", queue length: " + (queue.length))); }
122
125
 
123
- client.bulk({ body: docs }, function () {
126
+ try {
127
+ await client.bulk({ body: docs });
124
128
  ingesting = false;
125
129
  if (queue.length > 0) {
126
130
  ingest();
127
131
  }
128
- });
132
+ } catch (err) {
133
+ console.log('bulk index error', err);
134
+ }
129
135
  }
130
136
 
131
137
  // console.log(`ingest: queue.length ${queue.length}`);
132
138
  if (queue.length === 0) {
139
+ queueEmitter.emit('queue-size', 0);
133
140
  queueEmitter.emit('resume');
134
141
  }
135
-
136
- return [];
137
142
  };
138
143
 
139
144
  return {
140
145
  add: function (doc) {
141
146
  if (!skipHeader) {
142
- var header = { index: { _index: targetIndexName, _type: typeName } };
147
+ var header = { index: { _index: targetIndexName } };
143
148
  buffer.push(header);
144
149
  }
145
150
  buffer.push(doc);
@@ -150,16 +155,24 @@ function indexQueueFactory(ref) {
150
155
  }
151
156
 
152
157
  if (buffer.length >= (bufferSize * 2)) {
153
- buffer = ingest(buffer);
158
+ ingest(buffer);
159
+ buffer = [];
154
160
  }
155
161
  },
156
- finish: function () {
157
- buffer = ingest(buffer);
162
+ finish: async function () {
163
+ await ingest(buffer);
164
+ buffer = [];
165
+ queueEmitter.emit('finish');
158
166
  },
159
167
  queueEmitter: queueEmitter,
160
168
  };
161
169
  }
162
170
 
171
+ var MAX_QUEUE_SIZE = 5;
172
+
173
+ // create a new progress bar instance and use shades_classic theme
174
+ var progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
175
+
163
176
  function indexReaderFactory(indexer, sourceIndexName, transform, client) {
164
177
  return async function indexReader() {
165
178
  var responseQueue = [];
@@ -169,12 +182,13 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client) {
169
182
  return client.search({
170
183
  index: sourceIndexName,
171
184
  scroll: '30s',
185
+ size: 10000,
172
186
  });
173
187
  }
174
188
 
175
189
  function scroll(id) {
176
190
  return client.scroll({
177
- scrollId: id,
191
+ scroll_id: id,
178
192
  scroll: '30s',
179
193
  });
180
194
  }
@@ -183,11 +197,12 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client) {
183
197
  // our first response into the queue to be processed
184
198
  var se = await search();
185
199
  responseQueue.push(se);
200
+ progressBar.start(se.hits.total.value, 0);
186
201
 
187
202
  function processHit(hit) {
188
203
  docsNum += 1;
189
204
  try {
190
- var doc = (typeof transform === 'function') ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle,max-len
205
+ var doc = (typeof transform === 'function') ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
191
206
  // if doc is undefined we'll skip indexing it
192
207
  if (typeof doc === 'undefined') {
193
208
  return;
@@ -206,36 +221,78 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client) {
206
221
  }
207
222
  }
208
223
 
209
- while (responseQueue.length) {
210
- var response = responseQueue.shift();
224
+ var ingestQueueSize = 0;
225
+ var scrollId = se._scroll_id; // eslint-disable-line no-underscore-dangle
226
+ var readActive = false;
227
+
228
+ async function processResponseQueue() {
229
+ while (responseQueue.length) {
230
+ readActive = true;
231
+ var response = responseQueue.shift();
232
+
233
+ // collect the docs from this response
234
+ response.hits.hits.forEach(processHit);
211
235
 
212
- // collect the docs from this response
213
- response.hits.hits.forEach(processHit);
236
+ progressBar.update(docsNum);
237
+
238
+ // check to see if we have collected all of the docs
239
+ // console.log('check count', response.hits.total.value, docsNum);
240
+ if (response.hits.total.value === docsNum) {
241
+ indexer.finish();
242
+ progressBar.stop();
243
+ break;
244
+ }
214
245
 
215
- // check to see if we have collected all of the docs
216
- if (response.hits.total === docsNum) {
217
- console.log('finished scrolling.');
218
- indexer.finish();
219
- break;
246
+ if (ingestQueueSize < MAX_QUEUE_SIZE) {
247
+ // get the next response if there are more docs to fetch
248
+ var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
249
+ scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
250
+ responseQueue.push(sc);
251
+ } else {
252
+ readActive = false;
253
+ }
220
254
  }
255
+ }
221
256
 
257
+ indexer.queueEmitter.on('queue-size', async function (size) {
258
+ ingestQueueSize = size;
259
+
260
+ if (!readActive && ingestQueueSize < MAX_QUEUE_SIZE) {
222
261
  // get the next response if there are more docs to fetch
223
- var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
262
+ var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
263
+ scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
264
+ responseQueue.push(sc);
265
+ processResponseQueue();
266
+ }
267
+ });
268
+
269
+ indexer.queueEmitter.on('resume', async function () {
270
+ ingestQueueSize = 0;
271
+
272
+ if (readActive) {
273
+ return;
274
+ }
275
+
276
+ // get the next response if there are more docs to fetch
277
+ var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
278
+ scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
224
279
  responseQueue.push(sc);
225
- }
280
+ processResponseQueue();
281
+ });
282
+
283
+ processResponseQueue();
226
284
  };
227
285
  }
228
286
 
229
- function transformer(ref) {
287
+ async function transformer(ref) {
230
288
  var deleteIndex = ref.deleteIndex; if ( deleteIndex === void 0 ) deleteIndex = false;
231
- var host = ref.host; if ( host === void 0 ) host = 'localhost';
232
- var port = ref.port; if ( port === void 0 ) port = '9200';
289
+ var sourceClientConfig = ref.sourceClientConfig;
290
+ var targetClientConfig = ref.targetClientConfig;
233
291
  var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = 1000;
234
292
  var fileName = ref.fileName;
235
293
  var splitRegex = ref.splitRegex; if ( splitRegex === void 0 ) splitRegex = /\n/;
236
294
  var sourceIndexName = ref.sourceIndexName;
237
295
  var targetIndexName = ref.targetIndexName;
238
- var typeName = ref.typeName;
239
296
  var mappings = ref.mappings;
240
297
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
241
298
  var transform = ref.transform;
@@ -245,30 +302,65 @@ function transformer(ref) {
245
302
  throw Error('targetIndexName must be specified.');
246
303
  }
247
304
 
248
- var client = new elasticsearch.Client({ host: (host + ":" + port) });
305
+ var defaultClientConfig = {
306
+ node: 'http://localhost:9200',
307
+ };
308
+
309
+ var sourceClient = new elasticsearch.Client(sourceClientConfig || defaultClientConfig);
310
+ var targetClient = new elasticsearch.Client(
311
+ targetClientConfig || sourceClientConfig || defaultClientConfig
312
+ );
249
313
 
250
314
  var createMapping = createMappingFactory({
251
- client: client, targetIndexName: targetIndexName, mappings: mappings, verbose: verbose,
315
+ sourceClient: sourceClient,
316
+ sourceIndexName: sourceIndexName,
317
+ targetClient: targetClient,
318
+ targetIndexName: targetIndexName,
319
+ mappings: mappings,
320
+ verbose: verbose,
252
321
  });
253
322
  var indexer = indexQueueFactory({
254
- client: client, targetIndexName: targetIndexName, typeName: typeName, bufferSize: bufferSize, skipHeader: skipHeader, verbose: verbose,
323
+ targetClient: targetClient,
324
+ targetIndexName: targetIndexName,
325
+ bufferSize: bufferSize,
326
+ skipHeader: skipHeader,
327
+ verbose: verbose,
255
328
  });
256
329
 
257
330
  function getReader() {
258
- if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
259
- throw Error('Only either one of fileName or sourceIndexName can be specified.');
331
+ if (
332
+ typeof fileName !== 'undefined'
333
+ && typeof sourceIndexName !== 'undefined'
334
+ ) {
335
+ throw Error(
336
+ 'Only either one of fileName or sourceIndexName can be specified.'
337
+ );
260
338
  }
261
339
 
262
- if (typeof fileName === 'undefined' && typeof sourceIndexName === 'undefined') {
340
+ if (
341
+ typeof fileName === 'undefined'
342
+ && typeof sourceIndexName === 'undefined'
343
+ ) {
263
344
  throw Error('Either fileName or sourceIndexName must be specified.');
264
345
  }
265
346
 
266
347
  if (typeof fileName !== 'undefined') {
267
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose);
348
+ return fileReaderFactory(
349
+ indexer,
350
+ fileName,
351
+ transform,
352
+ splitRegex,
353
+ verbose
354
+ );
268
355
  }
269
356
 
270
357
  if (typeof sourceIndexName !== 'undefined') {
271
- return indexReaderFactory(indexer, sourceIndexName, transform, client);
358
+ return indexReaderFactory(
359
+ indexer,
360
+ sourceIndexName,
361
+ transform,
362
+ sourceClient
363
+ );
272
364
  }
273
365
 
274
366
  return null;
@@ -276,17 +368,26 @@ function transformer(ref) {
276
368
 
277
369
  var reader = getReader();
278
370
 
279
- client.indices.exists({ index: targetIndexName }, function (err, resp) {
280
- if (resp === false) {
281
- createMapping().then(reader);
371
+ try {
372
+ var indexExists = await targetClient.indices.exists({ index: targetIndexName });
373
+
374
+ if (indexExists === false) {
375
+ await createMapping();
376
+ reader();
282
377
  } else if (deleteIndex === true) {
283
- client.indices.delete({ index: targetIndexName }, function () {
284
- createMapping().then(reader);
285
- });
378
+ await targetClient.indices.delete({ index: targetIndexName });
379
+ await createMapping();
380
+ reader();
286
381
  } else {
287
382
  reader();
288
383
  }
289
- });
384
+ } catch (error) {
385
+ console.error('Error checking index existence:', error);
386
+ } finally {
387
+ // targetClient.close();
388
+ }
389
+
390
+ return { events: indexer.queueEmitter };
290
391
  }
291
392
 
292
393
  module.exports = transformer;
@@ -1,40 +1,47 @@
1
1
  import fs from 'fs';
2
2
  import es from 'event-stream';
3
3
  import glob from 'glob';
4
- import elasticsearch from 'elasticsearch';
4
+ import cliProgress from 'cli-progress';
5
+ import elasticsearch from '@elastic/elasticsearch';
5
6
 
6
7
  function createMappingFactory(ref) {
7
- var client = ref.client;
8
+ var sourceClient = ref.sourceClient;
9
+ var sourceIndexName = ref.sourceIndexName;
10
+ var targetClient = ref.targetClient;
8
11
  var targetIndexName = ref.targetIndexName;
9
12
  var mappings = ref.mappings;
10
13
  var verbose = ref.verbose;
11
14
 
12
- return function () { return (new Promise(function (resolve, reject) {
13
- console.log('targetIndexName', targetIndexName);
14
- if (
15
- typeof mappings === 'object'
16
- && mappings !== null
17
- ) {
18
- client.indices.create({
19
- index: targetIndexName,
20
- body: { mappings: mappings },
21
- }, function (err, resp) {
22
- if (err) {
23
- console.log('Error creating mapping', err);
24
- reject();
25
- return;
26
- }
27
- if (verbose) { console.log('Created mapping', resp); }
28
- resolve();
29
- });
30
- } else {
31
- resolve();
15
+ return async function () {
16
+ var targetMappings = mappings;
17
+
18
+ if (sourceClient && sourceIndexName && typeof targetMappings === 'undefined') {
19
+ try {
20
+ var mapping = await sourceClient.indices.getMapping({ index: sourceIndexName });
21
+ targetMappings = mapping[sourceIndexName].mappings;
22
+ } catch (err) {
23
+ console.log('Error reading source mapping', err);
24
+ return;
25
+ }
32
26
  }
33
- })); };
27
+
28
+ if (typeof targetMappings === 'object' && targetMappings !== null) {
29
+ try {
30
+ var resp = await targetClient.indices.create(
31
+ {
32
+ index: targetIndexName,
33
+ body: { mappings: targetMappings },
34
+ }
35
+ );
36
+ if (verbose) { console.log('Created target mapping', resp); }
37
+ } catch (err) {
38
+ console.log('Error creating target mapping', err);
39
+ }
40
+ }
41
+ };
34
42
  }
35
43
 
36
44
  function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
37
- console.log('splitRegex', splitRegex);
38
45
  function startIndex(files) {
39
46
  var file = files.shift();
40
47
  var s = fs.createReadStream(file)
@@ -44,15 +51,11 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
44
51
  try {
45
52
  var doc = (typeof transform === 'function') ? transform(line) : line;
46
53
  // if doc is undefined we'll skip indexing it
47
- if (
48
- typeof doc === 'undefined'
49
- || (Array.isArray(doc) && doc.length === 0)
50
- ) {
54
+ if (typeof doc === 'undefined') {
51
55
  s.resume();
52
56
  return;
53
57
  }
54
58
 
55
- //console.log('continue?');
56
59
  // the transform callback may return an array of docs so we can emit
57
60
  // multiple docs from a single line
58
61
  if (Array.isArray(doc)) {
@@ -77,7 +80,6 @@ function fileReaderFactory(indexer, fileName, transform, splitRegex, verbose) {
77
80
  }));
78
81
 
79
82
  indexer.queueEmitter.on('resume', function () {
80
- //console.log('on resume');
81
83
  s.resume();
82
84
  });
83
85
  }
@@ -95,9 +97,8 @@ var queueEmitter = new EventEmitter();
95
97
 
96
98
  // a simple helper queue to bulk index documents
97
99
  function indexQueueFactory(ref) {
98
- var client = ref.client;
100
+ var client = ref.targetClient;
99
101
  var targetIndexName = ref.targetIndexName;
100
- var typeName = ref.typeName; if ( typeName === void 0 ) typeName = 'doc';
101
102
  var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = 1000;
102
103
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
103
104
  var verbose = ref.verbose; if ( verbose === void 0 ) verbose = true;
@@ -106,36 +107,40 @@ function indexQueueFactory(ref) {
106
107
  var queue = [];
107
108
  var ingesting = false;
108
109
 
109
- var ingest = function (b) {
110
+ var ingest = async function (b) {
110
111
  if (typeof b !== 'undefined') {
111
112
  queue.push(b);
113
+ queueEmitter.emit('queue-size', queue.length);
112
114
  }
113
115
 
114
116
  if (ingesting === false) {
115
117
  var docs = queue.shift();
118
+ queueEmitter.emit('queue-size', queue.length);
116
119
  ingesting = true;
117
120
  if (verbose) { console.log(("bulk ingest docs: " + (docs.length / 2) + ", queue length: " + (queue.length))); }
118
121
 
119
- client.bulk({ body: docs }, function () {
122
+ try {
123
+ await client.bulk({ body: docs });
120
124
  ingesting = false;
121
125
  if (queue.length > 0) {
122
126
  ingest();
123
127
  }
124
- });
128
+ } catch (err) {
129
+ console.log('bulk index error', err);
130
+ }
125
131
  }
126
132
 
127
133
  // console.log(`ingest: queue.length ${queue.length}`);
128
134
  if (queue.length === 0) {
135
+ queueEmitter.emit('queue-size', 0);
129
136
  queueEmitter.emit('resume');
130
137
  }
131
-
132
- return [];
133
138
  };
134
139
 
135
140
  return {
136
141
  add: function (doc) {
137
142
  if (!skipHeader) {
138
- var header = { index: { _index: targetIndexName, _type: typeName } };
143
+ var header = { index: { _index: targetIndexName } };
139
144
  buffer.push(header);
140
145
  }
141
146
  buffer.push(doc);
@@ -146,16 +151,24 @@ function indexQueueFactory(ref) {
146
151
  }
147
152
 
148
153
  if (buffer.length >= (bufferSize * 2)) {
149
- buffer = ingest(buffer);
154
+ ingest(buffer);
155
+ buffer = [];
150
156
  }
151
157
  },
152
- finish: function () {
153
- buffer = ingest(buffer);
158
+ finish: async function () {
159
+ await ingest(buffer);
160
+ buffer = [];
161
+ queueEmitter.emit('finish');
154
162
  },
155
163
  queueEmitter: queueEmitter,
156
164
  };
157
165
  }
158
166
 
167
+ var MAX_QUEUE_SIZE = 5;
168
+
169
+ // create a new progress bar instance and use shades_classic theme
170
+ var progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
171
+
159
172
  function indexReaderFactory(indexer, sourceIndexName, transform, client) {
160
173
  return async function indexReader() {
161
174
  var responseQueue = [];
@@ -165,12 +178,13 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client) {
165
178
  return client.search({
166
179
  index: sourceIndexName,
167
180
  scroll: '30s',
181
+ size: 10000,
168
182
  });
169
183
  }
170
184
 
171
185
  function scroll(id) {
172
186
  return client.scroll({
173
- scrollId: id,
187
+ scroll_id: id,
174
188
  scroll: '30s',
175
189
  });
176
190
  }
@@ -179,11 +193,12 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client) {
179
193
  // our first response into the queue to be processed
180
194
  var se = await search();
181
195
  responseQueue.push(se);
196
+ progressBar.start(se.hits.total.value, 0);
182
197
 
183
198
  function processHit(hit) {
184
199
  docsNum += 1;
185
200
  try {
186
- var doc = (typeof transform === 'function') ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle,max-len
201
+ var doc = (typeof transform === 'function') ? transform(hit._source) : hit._source; // eslint-disable-line no-underscore-dangle
187
202
  // if doc is undefined we'll skip indexing it
188
203
  if (typeof doc === 'undefined') {
189
204
  return;
@@ -202,36 +217,78 @@ function indexReaderFactory(indexer, sourceIndexName, transform, client) {
202
217
  }
203
218
  }
204
219
 
205
- while (responseQueue.length) {
206
- var response = responseQueue.shift();
220
+ var ingestQueueSize = 0;
221
+ var scrollId = se._scroll_id; // eslint-disable-line no-underscore-dangle
222
+ var readActive = false;
223
+
224
+ async function processResponseQueue() {
225
+ while (responseQueue.length) {
226
+ readActive = true;
227
+ var response = responseQueue.shift();
228
+
229
+ // collect the docs from this response
230
+ response.hits.hits.forEach(processHit);
207
231
 
208
- // collect the docs from this response
209
- response.hits.hits.forEach(processHit);
232
+ progressBar.update(docsNum);
233
+
234
+ // check to see if we have collected all of the docs
235
+ // console.log('check count', response.hits.total.value, docsNum);
236
+ if (response.hits.total.value === docsNum) {
237
+ indexer.finish();
238
+ progressBar.stop();
239
+ break;
240
+ }
210
241
 
211
- // check to see if we have collected all of the docs
212
- if (response.hits.total === docsNum) {
213
- console.log('finished scrolling.');
214
- indexer.finish();
215
- break;
242
+ if (ingestQueueSize < MAX_QUEUE_SIZE) {
243
+ // get the next response if there are more docs to fetch
244
+ var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
245
+ scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
246
+ responseQueue.push(sc);
247
+ } else {
248
+ readActive = false;
249
+ }
216
250
  }
251
+ }
217
252
 
253
+ indexer.queueEmitter.on('queue-size', async function (size) {
254
+ ingestQueueSize = size;
255
+
256
+ if (!readActive && ingestQueueSize < MAX_QUEUE_SIZE) {
218
257
  // get the next response if there are more docs to fetch
219
- var sc = await scroll(response._scroll_id); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
258
+ var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
259
+ scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
260
+ responseQueue.push(sc);
261
+ processResponseQueue();
262
+ }
263
+ });
264
+
265
+ indexer.queueEmitter.on('resume', async function () {
266
+ ingestQueueSize = 0;
267
+
268
+ if (readActive) {
269
+ return;
270
+ }
271
+
272
+ // get the next response if there are more docs to fetch
273
+ var sc = await scroll(scrollId); // eslint-disable-line no-await-in-loop,no-underscore-dangle,max-len
274
+ scrollId = sc._scroll_id; // eslint-disable-line no-underscore-dangle
220
275
  responseQueue.push(sc);
221
- }
276
+ processResponseQueue();
277
+ });
278
+
279
+ processResponseQueue();
222
280
  };
223
281
  }
224
282
 
225
- function transformer(ref) {
283
+ async function transformer(ref) {
226
284
  var deleteIndex = ref.deleteIndex; if ( deleteIndex === void 0 ) deleteIndex = false;
227
- var host = ref.host; if ( host === void 0 ) host = 'localhost';
228
- var port = ref.port; if ( port === void 0 ) port = '9200';
285
+ var sourceClientConfig = ref.sourceClientConfig;
286
+ var targetClientConfig = ref.targetClientConfig;
229
287
  var bufferSize = ref.bufferSize; if ( bufferSize === void 0 ) bufferSize = 1000;
230
288
  var fileName = ref.fileName;
231
289
  var splitRegex = ref.splitRegex; if ( splitRegex === void 0 ) splitRegex = /\n/;
232
290
  var sourceIndexName = ref.sourceIndexName;
233
291
  var targetIndexName = ref.targetIndexName;
234
- var typeName = ref.typeName;
235
292
  var mappings = ref.mappings;
236
293
  var skipHeader = ref.skipHeader; if ( skipHeader === void 0 ) skipHeader = false;
237
294
  var transform = ref.transform;
@@ -241,30 +298,65 @@ function transformer(ref) {
241
298
  throw Error('targetIndexName must be specified.');
242
299
  }
243
300
 
244
- var client = new elasticsearch.Client({ host: (host + ":" + port) });
301
+ var defaultClientConfig = {
302
+ node: 'http://localhost:9200',
303
+ };
304
+
305
+ var sourceClient = new elasticsearch.Client(sourceClientConfig || defaultClientConfig);
306
+ var targetClient = new elasticsearch.Client(
307
+ targetClientConfig || sourceClientConfig || defaultClientConfig
308
+ );
245
309
 
246
310
  var createMapping = createMappingFactory({
247
- client: client, targetIndexName: targetIndexName, mappings: mappings, verbose: verbose,
311
+ sourceClient: sourceClient,
312
+ sourceIndexName: sourceIndexName,
313
+ targetClient: targetClient,
314
+ targetIndexName: targetIndexName,
315
+ mappings: mappings,
316
+ verbose: verbose,
248
317
  });
249
318
  var indexer = indexQueueFactory({
250
- client: client, targetIndexName: targetIndexName, typeName: typeName, bufferSize: bufferSize, skipHeader: skipHeader, verbose: verbose,
319
+ targetClient: targetClient,
320
+ targetIndexName: targetIndexName,
321
+ bufferSize: bufferSize,
322
+ skipHeader: skipHeader,
323
+ verbose: verbose,
251
324
  });
252
325
 
253
326
  function getReader() {
254
- if (typeof fileName !== 'undefined' && typeof sourceIndexName !== 'undefined') {
255
- throw Error('Only either one of fileName or sourceIndexName can be specified.');
327
+ if (
328
+ typeof fileName !== 'undefined'
329
+ && typeof sourceIndexName !== 'undefined'
330
+ ) {
331
+ throw Error(
332
+ 'Only either one of fileName or sourceIndexName can be specified.'
333
+ );
256
334
  }
257
335
 
258
- if (typeof fileName === 'undefined' && typeof sourceIndexName === 'undefined') {
336
+ if (
337
+ typeof fileName === 'undefined'
338
+ && typeof sourceIndexName === 'undefined'
339
+ ) {
259
340
  throw Error('Either fileName or sourceIndexName must be specified.');
260
341
  }
261
342
 
262
343
  if (typeof fileName !== 'undefined') {
263
- return fileReaderFactory(indexer, fileName, transform, splitRegex, verbose);
344
+ return fileReaderFactory(
345
+ indexer,
346
+ fileName,
347
+ transform,
348
+ splitRegex,
349
+ verbose
350
+ );
264
351
  }
265
352
 
266
353
  if (typeof sourceIndexName !== 'undefined') {
267
- return indexReaderFactory(indexer, sourceIndexName, transform, client);
354
+ return indexReaderFactory(
355
+ indexer,
356
+ sourceIndexName,
357
+ transform,
358
+ sourceClient
359
+ );
268
360
  }
269
361
 
270
362
  return null;
@@ -272,17 +364,26 @@ function transformer(ref) {
272
364
 
273
365
  var reader = getReader();
274
366
 
275
- client.indices.exists({ index: targetIndexName }, function (err, resp) {
276
- if (resp === false) {
277
- createMapping().then(reader);
367
+ try {
368
+ var indexExists = await targetClient.indices.exists({ index: targetIndexName });
369
+
370
+ if (indexExists === false) {
371
+ await createMapping();
372
+ reader();
278
373
  } else if (deleteIndex === true) {
279
- client.indices.delete({ index: targetIndexName }, function () {
280
- createMapping().then(reader);
281
- });
374
+ await targetClient.indices.delete({ index: targetIndexName });
375
+ await createMapping();
376
+ reader();
282
377
  } else {
283
378
  reader();
284
379
  }
285
- });
380
+ } catch (error) {
381
+ console.error('Error checking index existence:', error);
382
+ } finally {
383
+ // targetClient.close();
384
+ }
385
+
386
+ return { events: indexer.queueEmitter };
286
387
  }
287
388
 
288
389
  export default transformer;
package/package.json CHANGED
@@ -14,23 +14,24 @@
14
14
  "license": "Apache-2.0",
15
15
  "author": "Walter Rafelsberger <walter@rafelsberger.at>",
16
16
  "contributors": [],
17
- "version": "1.0.0-alpha7",
17
+ "version": "1.0.0-alpha9",
18
18
  "main": "dist/node-es-transformer.cjs.js",
19
19
  "module": "dist/node-es-transformer.esm.js",
20
20
  "dependencies": {
21
- "elasticsearch": "15.0.0",
21
+ "@elastic/elasticsearch": "^8.8.1",
22
+ "cli-progress": "^3.12.0",
22
23
  "event-stream": "3.3.4",
23
24
  "glob": "7.1.2"
24
25
  },
25
26
  "devDependencies": {
26
- "acorn": "6.0.0",
27
- "eslint": "4.19.1",
28
- "eslint-config-airbnb": "17.1.0",
29
- "eslint-plugin-jsx-a11y": "6.1.1",
30
- "eslint-plugin-react": "7.11.0",
31
- "eslint-plugin-import": "2.12.0",
27
+ "acorn": "^6.4.2",
28
+ "eslint": "8.2.0",
29
+ "eslint-config-airbnb": "19.0.4",
30
+ "eslint-plugin-import": "2.27.5",
31
+ "eslint-plugin-jsx-a11y": "6.7.1",
32
+ "eslint-plugin-react": "7.32.2",
32
33
  "rollup": "0.66.6",
33
- "rollup-plugin-buble": "0.19.4",
34
+ "rollup-plugin-buble": "0.19.6",
34
35
  "rollup-plugin-commonjs": "8.0.2",
35
36
  "rollup-plugin-node-resolve": "3.0.0"
36
37
  },