@engine9-io/input-tools 1.7.8 → 1.8.0
- package/file/FileUtilities.js +198 -135
- package/file/tools.js +70 -40
- package/index.js +27 -33
- package/package.json +2 -1
- package/test/processing/forEach.js +55 -0
- package/test/packet/forEach.js +0 -63
- package/test/{packet → processing}/bigDataMessage.js +0 -0
- package/test/{packet → processing}/message.js +0 -0
- package/test/{packet → processing}/zip.js +0 -0
package/file/FileUtilities.js
CHANGED
@@ -1,17 +1,15 @@
-/* eslint-disable no-await-in-loop */
 const fs = require('node:fs');
 
 const fsp = fs.promises;
 const path = require('node:path');
 const zlib = require('node:zlib');
-const {
-  Readable, Transform, PassThrough, Writable,
-} = require('node:stream');
+const { Readable, Transform, PassThrough, Writable } = require('node:stream');
 const { pipeline } = require('node:stream/promises');
 const { stringify } = require('csv');
 
 const debug = require('debug')('FileWorker');
 
+const { getXlsxStream } = require('xlstream');
 const csv = require('csv');
 const JSON5 = require('json5');
 const languageEncoding = require('detect-file-encoding-and-language');
@@ -20,10 +18,18 @@ const S3Worker = require('./S3');
 const ParquetWorker = require('./Parquet');
 
 const {
-  bool,
+  bool,
+  getTempFilename,
+  getStringArray,
+  getTempDir,
+  makeStrings,
+  streamPacket,
+  relativeDate
 } = require('./tools');
 
-function Worker({ accountId }) { this.accountId = accountId; }
+function Worker({ accountId }) {
+  this.accountId = accountId;
+}
 
 class LineReaderTransform extends Transform {
   constructor(options = {}) {
@@ -31,7 +37,6 @@ class LineReaderTransform extends Transform {
     this.buffer = '';
   }
 
-  // eslint-disable-next-line no-underscore-dangle
  _transform(chunk, encoding, callback) {
    this.buffer += chunk.toString();
    const lines = this.buffer.split(/\r?\n/);
@@ -40,7 +45,6 @@ class LineReaderTransform extends Transform {
    callback();
  }
 
-  // eslint-disable-next-line no-underscore-dangle
  _flush(callback) {
    if (this.buffer) {
      this.push(this.buffer);
@@ -53,7 +57,11 @@ Worker.prototype.csvToObjectTransforms = function (options) {
   const transforms = [];
   const delimiter = options.delimiter || ',';
 
-  const headerMapping = options.headerMapping || function (d) { return d; };
+  const headerMapping =
+    options.headerMapping ||
+    function (d) {
+      return d;
+    };
   let lastLine = null;
   let head = null;
 
@@ -63,7 +71,7 @@ Worker.prototype.csvToObjectTransforms = function (options) {
     skip_empty_lines: true,
     delimiter,
     max_limit_on_data_read: 10000000,
-    skip_lines_with_error: skipLinesWithError,
+    skip_lines_with_error: skipLinesWithError
   };
   if (options.skip) parserOptions.from_line = options.skip;
   if (options.relax_column_count) parserOptions.relax_column_count = true;
@@ -101,7 +109,7 @@ Worker.prototype.csvToObjectTransforms = function (options) {
 
       lastLine = row.join(delimiter);
       return cb(null, o);
-    },
+    }
   });
 
   transforms.push(parser);
@@ -124,12 +132,15 @@ Worker.prototype.detectEncoding = async function (options) {
     // needed chunk size.
     finalBuff = await new Promise((resolve, reject) => {
       const bufferBuilder = [];
-      const decompressStream = zlib.createGunzip()
+      const decompressStream = zlib
+        .createGunzip()
         .on('data', (chunk) => {
           bufferBuilder.push(chunk);
-        }).on('close', () => {
+        })
+        .on('close', () => {
           resolve(Buffer.concat(bufferBuilder));
-        }).on('error', (err) => {
+        })
+        .on('error', (err) => {
          if (err.errno !== -5) {
            // EOF: expected
            reject(err);
@@ -145,15 +156,57 @@ Worker.prototype.detectEncoding = async function (options) {
 
 Worker.prototype.detectEncoding.metadata = {
   options: {
-    filename: { required: true },
-  },
+    filename: { required: true }
+  }
+};
+
+Worker.prototype.xlsxToObjectStream = async function (options) {
+  let { filename } = options;
+
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    // We need to copy and delete
+    let worker = null;
+    if (filename.startsWith('r2://')) {
+      worker = new R2Worker(this);
+    } else {
+      worker = new S3Worker(this);
+    }
+    const target = getTempFilename({ targetFilename: filename.split('/').pop() });
+
+    await worker.copy({ filename, target });
+    filename = target;
+  }
+  let stream = await getXlsxStream({
+    filePath: filename,
+    sheet: 0
+  });
+  let keys = null;
+  stream = stream.pipe(
+    new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        if (!keys) {
+          keys = d?.raw.arr;
+          cb();
+        } else {
+          let o = {};
+          keys.forEach((k, i) => {
+            o[k] = d?.raw?.arr?.[i];
+          });
+          cb(null, o);
+        }
+      }
+    })
+  );
+
+  return { stream };
 };
 
 /*
-
+Commonly used method to transform a file into a stream of objects.
 */
 Worker.prototype.fileToObjectStream = async function (options) {
-  const { filename, columns, limit: limitOption,format:formatOverride } = options;
+  const { filename, columns, limit: limitOption, format: formatOverride } = options;
 
   // handle stream item
   if (options.stream) {
@@ -167,6 +220,9 @@ Worker.prototype.fileToObjectStream = async function (options) {
   let limit;
   if (limitOption) limit = parseInt(limitOption, 10);
   if (!filename) throw new Error('fileToObjectStream: filename is required');
+  if (filename.split('.').pop().toLowerCase() === 'xlsx') {
+    return this.xlsxToObjectStream(options);
+  }
   let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
   if (postfix === 'zip') {
     debug('Invalid filename:', { filename });
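
The headline feature of 1.8.0 is native .xlsx support: fileToObjectStream now checks the extension (hunk above) and delegates to the new xlsxToObjectStream, which streams the first sheet via xlstream, keys each row by the header row, and copies s3:// or r2:// sources to a temp file first. A minimal usage sketch; the require path and accountId value are assumptions for illustration, not taken from the package docs:

  const FileWorker = require('@engine9-io/input-tools/file/FileUtilities');

  async function main() {
    const worker = new FileWorker({ accountId: 'engine9' }); // accountId is assumed
    // *.xlsx filenames are now routed to xlsxToObjectStream automatically
    const { stream } = await worker.fileToObjectStream({ filename: 'people.xlsx' });
    for await (const row of stream) {
      console.log(row); // one object per row, keyed by the header row
    }
  }

  main().catch(console.error);
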
@@ -176,7 +232,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
   const streamInfo = await this.stream({
     filename,
     columns,
-    limit,
+    limit
   });
   const { encoding } = streamInfo;
   let { stream } = streamInfo;
@@ -203,7 +259,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
   } else {
     stream.setEncoding(encoding);
   }
-  let format=formatOverride || postfix;
+  let format = formatOverride || postfix;
 
   if (format === 'csv') {
     const csvTransforms = this.csvToObjectTransforms({ ...options });
@@ -243,13 +299,15 @@ Worker.prototype.fileToObjectStream = async function (options) {
       }
       if (headers) {
         const mapped = {};
-        headers.forEach((name, i) => { mapped[name] = obj[i]; });
+        headers.forEach((name, i) => {
+          mapped[name] = obj[i];
+        });
         this.push(mapped);
       } else {
        this.push(obj);
      }
      return cb();
-    },
+    }
   });
 
   transforms.push(lineReader);
@@ -260,9 +318,11 @@ Worker.prototype.fileToObjectStream = async function (options) {
   const countAndDebug = new Transform({
     objectMode: true,
     transform(d, enc, cb) {
-      if (count === 0) { debug('Sample object from file:', d); }
+      if (count === 0) {
+        debug('Sample object from file:', d);
+      }
       count += 1;
-      if ((count < 5000 && count % 1000 === 0) ||
+      if ((count < 5000 && count % 1000 === 0) || count % 50000 === 0) {
        debug(`fileToObjectStream transformed ${count} lines`);
      }
      this.push(d);
@@ -279,7 +339,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
      this.push(o);
      } */
      cb();
-    },
+    }
   });
 
   transforms.push(countAndDebug);
@@ -319,14 +379,14 @@ Worker.prototype.getOutputStreams = async function (options) {
        objectMode: true,
        async transform(item, encoding, cb) {
          options.transform(item, encoding, cb);
-        },
+        }
      });
    } else {
      transform = new Transform({
        objectMode: true,
        async transform(item, encoding, cb) {
          cb(null, options.transform(item));
-        },
+        }
      });
    }
  } else if (options.transform) {
@@ -345,7 +405,7 @@ Worker.prototype.getOutputStreams = async function (options) {
        let v = item[k];
        if (!o[k]) {
          if (typeof v === 'object') {
-            while (Array.isArray(v)) [v] = v
+            while (Array.isArray(v)) [v] = v; // get first array item
            o = { ...o, ...v };
          } else {
            o[k] = v;
@@ -353,12 +413,12 @@ Worker.prototype.getOutputStreams = async function (options) {
        }
      });
      cb(null, o);
-      },
+      }
    });
  }
 
  const stats = {
-    records: 0,
+    records: 0
  };
  let stringifier;
  if (options.targetFormat === 'jsonl') {
@@ -366,7 +426,7 @@ Worker.prototype.getOutputStreams = async function (options) {
      objectMode: true,
      transform(d, encoding, cb) {
        cb(false, `${JSON.stringify(d)}\n`);
-      },
+      }
    });
  } else {
    stringifier = stringify({ header: true });
@@ -383,11 +443,11 @@ Worker.prototype.getOutputStreams = async function (options) {
      transform(d, enc, cb) {
        stats.records += 1;
        cb(null, d);
-      },
+      }
    }),
    stringifier,
    gzip,
-    fileWriterStream,
+    fileWriterStream
  ].filter(Boolean);
  return { filename, streams, stats };
};
@@ -395,9 +455,7 @@ Worker.prototype.objectStreamToFile = async function (options) {
  const { filename, streams, stats } = await this.getOutputStreams(options);
  const { stream: inStream } = options;
  streams.unshift(inStream);
-  await pipeline(
-    streams,
-  );
+  await pipeline(streams);
  return { filename, records: stats.records };
};
 
@@ -432,7 +490,7 @@ Worker.prototype.transform = async function (options) {
  if (typeof f === 'function') {
    f = new Transform({
      objectMode: true,
-      transform: f,
+      transform: f
    });
  }
 
@@ -441,7 +499,10 @@ Worker.prototype.transform = async function (options) {
 
  const { targetFormat } = options;
 
-  if (!targetFormat && (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')) {
+  if (
+    !targetFormat &&
+    (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
+  ) {
    options.targetFormat = 'csv';
  }
 
@@ -453,33 +514,34 @@ Worker.prototype.transform.metadata = {
    sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
    encoding: { description: 'Manual override of source file encoding' },
    names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
-    values: { description: "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)" },
+    values: {
+      description:
+        "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)"
+    },
    targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
    targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
    targetRowDelimiter: { description: 'Row delimiter (default \n)' },
-    targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' },
-  },
+    targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
+  }
};
Worker.prototype.testTransform = async function (options) {
  return this.transform({
    ...options,
-    transform(d, enc, cb) { d.transform_time = new Date(); cb(null, d); },
+    transform(d, enc, cb) {
+      d.transform_time = new Date();
+      cb(null, d);
+    }
  });
};
Worker.prototype.testTransform.metadata = {
  options: {
-    filename: true,
-  },
+    filename: true
+  }
};
 
/* Get a stream from an actual stream, or an array, or a file */
-Worker.prototype.stream = async function (
-  options
-) {
-  const {
-    stream: inputStream, packet, type, columns, limit,
-    filename: filenameOpt,
-  } = options;
+Worker.prototype.stream = async function (options) {
+  const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
  let filename = filenameOpt;
 
  if (inputStream) {
@@ -496,7 +558,8 @@ Worker.prototype.stream = async function (
  } else {
    // debug(`Not prepending filename:${filename}`);
  }
-  let encoding; let stream;
+  let encoding;
+  let stream;
  if (filename.slice(-8) === '.parquet') {
    const pq = new ParquetWorker(this);
    stream = (await pq.stream({ filename, columns, limit })).stream;
@@ -541,9 +604,8 @@ Worker.prototype.sample = async function (opts) {
};
Worker.prototype.sample.metadata = {
  options: {
-    filename: {},
-
-  },
+    filename: {}
+  }
};
Worker.prototype.toArray = async function (opts) {
  const { stream } = await this.fileToObjectStream(opts);
@@ -551,8 +613,8 @@ Worker.prototype.toArray = async function (opts) {
};
Worker.prototype.toArray.metadata = {
  options: {
-    filename: {},
-  },
+    filename: {}
+  }
};
 
Worker.prototype.write = async function (opts) {
@@ -566,7 +628,7 @@ Worker.prototype.write = async function (opts) {
    await worker.write({
      directory,
      file,
-      content,
+      content
    });
  } else {
    await fsp.writeFile(filename, content);
@@ -576,15 +638,14 @@ Worker.prototype.write = async function (opts) {
Worker.prototype.write.metadata = {
  options: {
    filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
-    content: {},
-  },
+    content: {}
+  }
};
 
async function streamToString(stream) {
  // lets have a ReadableStream as a stream variable
  const chunks = [];
 
-  // eslint-disable-next-line no-restricted-syntax
  for await (const chunk of stream) {
    chunks.push(Buffer.from(chunk));
  }
@@ -606,47 +667,46 @@ Worker.prototype.json = async function (opts) {
};
Worker.prototype.json.metadata = {
  options: {
-    filename: { description: 'Get a javascript object from a file' },
-  },
+    filename: { description: 'Get a javascript object from a file' }
+  }
};
 
-Worker.prototype.list = async function ({ directory, start:s, end:e }) {
+Worker.prototype.list = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
-  let start=null;
-  let end=null;
-  if (s) start=relativeDate(s);
-  if (e) end=relativeDate(e);
-
+  let start = null;
+  let end = null;
+  if (s) start = relativeDate(s);
+  if (e) end = relativeDate(e);
+
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
    return worker.list({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { withFileTypes: true });
 
-  const withModified=[];
+  const withModified = [];
  for (const file of a) {
-    … (13 old lines of the loop body are not rendered in the source diff)
+    const fullPath = path.join(directory, file.name);
+    const stats = await fsp.stat(fullPath);
+    if (start && stats.mtime < start.getTime()) {
+      //do not include
+    } else if (end && stats.mtime > end.getTime()) {
+      //do nothing
+    } else {
+      withModified.push({
+        name: file.name,
+        type: file.isDirectory() ? 'directory' : 'file',
+        modifiedAt: new Date(stats.mtime).toISOString()
+      });
+    }
  }
-
+
  return withModified;
-
};
Worker.prototype.list.metadata = {
  options: {
-    directory: { required: true },
-  },
+    directory: { required: true }
+  }
};
 
Worker.prototype.listAll = async function ({ directory }) {
@@ -661,8 +721,8 @@ Worker.prototype.listAll = async function ({ directory }) {
};
Worker.prototype.listAll.metadata = {
  options: {
-    directory: { required: true },
-  },
+    directory: { required: true }
+  }
};
 
Worker.prototype.empty = async function ({ directory }) {
@@ -672,7 +732,7 @@ Worker.prototype.empty = async function ({ directory }) {
    throw new Error('Cannot empty an s3:// or r2:// directory');
  }
  const removed = [];
-
+
  for (const file of await fsp.readdir(directory)) {
    removed.push(file);
    await fsp.unlink(path.join(directory, file));
@@ -681,8 +741,8 @@ Worker.prototype.empty = async function ({ directory }) {
};
Worker.prototype.empty.metadata = {
  options: {
-    directory: { required: true },
-  },
+    directory: { required: true }
+  }
};
 
Worker.prototype.remove = async function ({ filename }) {
@@ -705,16 +765,18 @@ Worker.prototype.remove = async function ({ filename }) {
};
Worker.prototype.remove.metadata = {
  options: {
-    filename: {},
-  },
+    filename: {}
+  }
};
 
Worker.prototype.move = async function ({ filename, target }) {
  if (!target) throw new Error('target is required');
  if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
  if (target.startsWith('s3://') || target.startsWith('r2://')) {
-    if ((target.startsWith('s3://') && filename.startsWith('r2://'))
-    || (target.startsWith('r2://') && filename.startsWith('s3://'))) {
+    if (
+      (target.startsWith('s3://') && filename.startsWith('r2://')) ||
+      (target.startsWith('r2://') && filename.startsWith('s3://'))
+    ) {
      throw new Error('Cowardly not copying between services');
    }
 
@@ -741,8 +803,8 @@ Worker.prototype.move = async function ({ filename, target }) {
Worker.prototype.move.metadata = {
  options: {
    filename: {},
-    target: {},
-  },
+    target: {}
+  }
};
 
Worker.prototype.stat = async function ({ filename }) {
@@ -751,11 +813,7 @@ Worker.prototype.stat = async function ({ filename }) {
    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
    return worker.stat({ filename });
  }
-  const {
-    ctime,
-    birthtime,
-    size,
-  } = await fsp.stat(filename);
+  const { ctime, birthtime, size } = await fsp.stat(filename);
  const modifiedAt = new Date(ctime);
  let createdAt = birthtime;
  if (createdAt === 0 || !createdAt) createdAt = ctime;
@@ -763,13 +821,13 @@ Worker.prototype.stat = async function ({ filename }) {
  return {
    createdAt,
    modifiedAt,
-    size,
+    size
  };
};
Worker.prototype.stat.metadata = {
  options: {
-    filename: {},
-  },
+    filename: {}
+  }
};
 
Worker.prototype.download = async function ({ filename }) {
@@ -782,8 +840,8 @@ Worker.prototype.download = async function ({ filename }) {
};
Worker.prototype.download.metadata = {
  options: {
-    filename: {},
-  },
+    filename: {}
+  }
};
 
Worker.prototype.head = async function (options) {
@@ -792,7 +850,7 @@ Worker.prototype.head = async function (options) {
  const chunks = [];
 
  let counter = 0;
-
+
  for await (const chunk of stream) {
    chunks.push(chunk);
    counter += 1;
@@ -804,8 +862,8 @@ Worker.prototype.head = async function (options) {
 
Worker.prototype.head.metadata = {
  options: {
-    filename: { required: true },
-  },
+    filename: { required: true }
+  }
};
 
Worker.prototype.count = async function (options) {
@@ -814,7 +872,7 @@ Worker.prototype.count = async function (options) {
 
  const limit = options.limit || 5;
  let records = 0;
-
+
  for await (const chunk of stream) {
    records += 1;
    if (records < limit) {
@@ -827,8 +885,8 @@ Worker.prototype.count = async function (options) {
 
Worker.prototype.count.metadata = {
  options: {
-    filename: { required: true },
-  },
+    filename: { required: true }
+  }
};
 
// Get a set of unique entries from a uniqueFunction
@@ -839,10 +897,10 @@ Worker.prototype.getUniqueSet = async function (options) {
 
  let { uniqueFunction } = options;
  if (!uniqueFunction) {
-    uniqueFunction = (
+    uniqueFunction = (o) => JSON.stringify(o);
  }
  const uniqueSet = new Set();
-
+
  for (const filename of existingFiles) {
    const { stream: existsStream } = await this.fileToObjectStream({ filename });
    await pipeline(
@@ -856,14 +914,14 @@ Worker.prototype.getUniqueSet = async function (options) {
          }
          uniqueSet.add(v);
          cb(null, d);
-        },
+        }
      }),
      new Writable({
        objectMode: true,
        write(d, enc, cb) {
          cb();
-        },
-      }),
+        }
+      })
    );
    debug(`Finished loading ${filename}`);
  }
@@ -875,7 +933,7 @@ Worker.prototype.getUniqueStream = async function (options) {
 
  const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
    filenames: options.existingFiles,
-    uniqueFunction: options.uniqueFunction,
+    uniqueFunction: options.uniqueFunction
  });
 
  const { stream: inStream } = await this.fileToObjectStream(options);
@@ -899,8 +957,8 @@ Worker.prototype.getUniqueStream = async function (options) {
        }
        cb(null, d);
      }
-      },
-    }),
+      }
+    })
  );
  return { stream: uniqueStream, sample };
};
@@ -912,9 +970,9 @@ Worker.prototype.getUniqueStream.metadata = {
    filename: { description: 'Specify a source filename or a stream' },
    stream: { description: 'Specify a source filename or a stream' },
    includeDuplicateSourceRecords: {
-      description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
-    },
-  },
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
};
Worker.prototype.getUniqueFile = async function (options) {
  const { stream, sample } = await this.getUniqueStream(options);
@@ -929,9 +987,9 @@ Worker.prototype.getUniqueFile.metadata = {
    filename: { description: 'Specify a source filename or a stream' },
    stream: { description: 'Specify a source filename or a stream' },
    includeDuplicateSourceRecords: {
-      description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
-    },
-  },
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
};
 
/*
@@ -940,7 +998,11 @@ Requires 2 passes of the files,
but that's a better tradeoff than trying to store huge files in memory
*/
Worker.prototype.diff = async function ({
-  fileA, fileB, uniqueFunction: ufOpt, fields, includeDuplicateSourceRecords,
+  fileA,
+  fileB,
+  uniqueFunction: ufOpt,
+  fields,
+  includeDuplicateSourceRecords
}) {
  if (ufOpt && fields) throw new Error('fields and uniqueFunction cannot both be specified');
  let uniqueFunction = ufOpt;
@@ -953,17 +1015,18 @@ Worker.prototype.diff = async function ({
    existingFiles: [fileB],
    filename: fileA,
    uniqueFunction,
-    includeDuplicateSourceRecords,
+    includeDuplicateSourceRecords
  });
  const right = await this.getUniqueFile({
    existingFiles: [fileA],
    filename: fileB,
    uniqueFunction,
-    includeDuplicateSourceRecords,
+    includeDuplicateSourceRecords
  });
 
  return {
-    left, right,
+    left,
+    right
  };
};
Worker.prototype.diff.metadata = {
@@ -973,9 +1036,9 @@ Worker.prototype.diff.metadata = {
    fields: { description: 'Fields to use for uniqueness -- aka primary key. Defaults to JSON of line' },
    uniqueFunction: {},
    includeDuplicateSourceRecords: {
-      description: 'Sometimes you want the output to include source dupes, sometimes not, default false',
-    },
-  },
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
};
 
module.exports = Worker;
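
Two of the methods reformatted above are worth a usage note. list() now accepts optional start and end bounds (resolved through relativeDate() and compared against each file's mtime), and local listings return name, type, and modifiedAt instead of bare directory entries; diff() remains a two-pass comparison of two files by a unique key. A hedged sketch, reusing the worker instance from the earlier example (the '-7d' relative syntax is inferred from relativeDate's unit codes in tools.js below):

  const recent = await worker.list({
    directory: '/tmp/engine9',
    start: '-7d' // only files modified in the last week (assumed syntax)
  });
  // -> [{ name, type: 'file' | 'directory', modifiedAt }, ...]

  const { left, right } = await worker.diff({
    fileA: 'before.csv',
    fileB: 'after.csv',
    fields: 'id' // primary-key field(s); defaults to the JSON of each line
  });
  // left / right describe the records unique to each file
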
package/file/tools.js
CHANGED
@@ -14,16 +14,9 @@ const unzipper = require('unzipper');
 
 const dayjs = require('dayjs');
 
-const {
-  S3Client,
-  HeadObjectCommand,
-  GetObjectCommand,
-} = require('@aws-sdk/client-s3');
+const { S3Client, HeadObjectCommand, GetObjectCommand } = require('@aws-sdk/client-s3');
 
-
-const {
-  v7: uuidv7,
-} = require('uuid');
+const { v7: uuidv7 } = require('uuid');
 
 async function getTempDir({ accountId = 'engine9' }) {
   const dir = [os.tmpdir(), accountId, new Date().toISOString().substring(0, 10)].join(path.sep);
@@ -52,7 +45,10 @@ async function getTempFilename(options) {
   }
 
   // make a distinct directory, so we don't overwrite the file
-  dir = `${dir}/${new Date().toISOString().slice(0, -6).replace(/[^0-9]/g, '_')}`;
+  dir = `${dir}/${new Date()
+    .toISOString()
+    .slice(0, -6)
+    .replace(/[^0-9]/g, '_')}`;
 
   const newDir = await mkdirp(dir);
 
@@ -97,8 +93,8 @@ async function getPacketFiles({ packet }) {
   const info = await s3Client.send(
     new HeadObjectCommand({
       Bucket,
-      Key,
-    }),
+      Key
+    })
   );
   size = info.ContentLength;
   progress(`Retrieving file of size ${size / (1024 * 1024)} MB`);
@@ -107,13 +103,14 @@ async function getPacketFiles({ packet }) {
 
    stream(offset, length) {
      const ptStream = new PassThrough();
-      s3Client
-      … (six more lines of the old .send(...) call are not rendered in the source diff)
+      s3Client
+        .send(
+          new GetObjectCommand({
+            Bucket,
+            Key,
+            Range: `bytes=${offset}-${length ?? ''}`
+          })
+        )
        .then((response) => {
          response.Body.pipe(ptStream);
        })
@@ -122,7 +119,7 @@ async function getPacketFiles({ packet }) {
      });
 
      return ptStream;
-    },
+    }
  });
 
  return directory;
@@ -131,7 +128,6 @@ async function getPacketFiles({ packet }) {
  return directory;
}
 
-
async function getManifest({ packet }) {
  if (!packet) throw new Error('no packet option specififed');
  const { files } = await getPacketFiles({ packet });
@@ -156,8 +152,8 @@ function getBatchTransform({ batchSize = 100 }) {
      flush(cb) {
        if (this.buffer?.length > 0) this.push(this.buffer);
        cb();
-      },
-    }),
+      }
+    })
  };
}
function getDebatchTransform() {
@@ -167,8 +163,8 @@ function getDebatchTransform() {
      transform(chunk, encoding, cb) {
        chunk.forEach((c) => this.push(c));
        cb();
-      },
-    }),
+      }
+    })
  };
}
 
@@ -218,7 +214,8 @@ async function downloadFile({ packet, type = 'person' }) {
  const filename = await getTempFilename({ targetFilename: filePath.split('/').pop() });
 
  return new Promise((resolve, reject) => {
-    fileStream.pipe(fs.createWriteStream(filename))
+    fileStream
+      .pipe(fs.createWriteStream(filename))
      .on('error', reject)
      .on('finish', () => {
        resolve({ filename });
@@ -228,12 +225,12 @@ async function downloadFile({ packet, type = 'person' }) {
 
function isValidDate(d) {
  // we WANT to use isNaN, not the Number.isNaN -- we're checking the date type
-
+
  return d instanceof Date && !isNaN(d);
}
 
function bool(x, _defaultVal) {
-  const defaultVal = (_defaultVal === undefined) ? false : _defaultVal;
+  const defaultVal = _defaultVal === undefined ? false : _defaultVal;
  if (x === undefined || x === null || x === '') return defaultVal;
  if (typeof x !== 'string') return !!x;
  if (x === '1') return true; // 0 will return false, but '1' is true
@@ -255,7 +252,7 @@ function relativeDate(s, _initialDate) {
  if (!s || s === 'none') return null;
  if (typeof s.getMonth === 'function') return s;
  // We actually want a double equals here to test strings as well
-
+
  if (parseInt(s, 10) == s) {
    const r = new Date(parseInt(s, 10));
    if (!isValidDate(r)) throw new Error(`Invalid integer date:${s}`);
@@ -274,15 +271,31 @@ function relativeDate(s, _initialDate) {
  let period = null;
  switch (r[3]) {
    case 'Y':
-    case 'y':
-    … (eight more lines of the old case handling are not rendered in the source diff)
+    case 'y':
+      period = 'years';
+      break;
+
+    case 'M':
+      period = 'months';
+      break;
+    case 'w':
+      period = 'weeks';
+      break;
+    case 'd':
+      period = 'days';
+      break;
+    case 'h':
+      period = 'hours';
+      break;
+    case 'm':
+      period = 'minutes';
+      break;
+    case 's':
+      period = 'seconds';
+      break;
+    default:
+      period = 'minutes';
+      break;
  }
 
  let d = dayjs(initialDate);
@@ -317,12 +330,29 @@ function relativeDate(s, _initialDate) {
*/
function makeStrings(o) {
  return Object.entries(o).reduce((a, [k, v]) => {
-    a[k] = (typeof v === 'object') ? JSON.stringify(v) : String(v);
+    a[k] = typeof v === 'object' ? JSON.stringify(v) : String(v);
    return a;
  }, {});
}
+function appendPostfix(filename, postfix) {
+  const filenameParts = filename.split('/');
+  const fileParts = filenameParts
+    .slice(-1)[0]
+    .split('.')
+    .filter(Boolean)
+    .filter((d) => d !== postfix);
+
+  let targetFile = null;
+  if (fileParts.slice(-1)[0] === 'gz') {
+    targetFile = fileParts.slice(0, -2).concat(postfix).concat(fileParts.slice(-2)).join('.');
+  } else {
+    targetFile = fileParts.slice(0, -1).concat(postfix).concat(fileParts.slice(-1)).join('.');
+  }
+  return filenameParts.slice(0, -1).concat(targetFile).join('/');
+}
 
module.exports = {
+  appendPostfix,
  bool,
  downloadFile,
  getTempFilename,
@@ -336,5 +366,5 @@ module.exports = {
  makeStrings,
  relativeDate,
  streamPacket,
-  writeTempFile,
+  writeTempFile
};
package/index.js
CHANGED
@@ -6,15 +6,14 @@ const dayjs = require('dayjs');
 const debug = require('debug')('@engine9/input-tools');
 
 const unzipper = require('unzipper');
-const {
-  v4: uuidv4, v5: uuidv5, v7: uuidv7, validate: uuidIsValid,
-} = require('uuid');
+const { v4: uuidv4, v5: uuidv5, v7: uuidv7, validate: uuidIsValid } = require('uuid');
 const archiver = require('archiver');
 const handlebars = require('handlebars');
 
 const FileUtilities = require('./file/FileUtilities');
 
 const {
+  appendPostfix,
   bool,
   getManifest,
   getFile,
@@ -29,7 +28,7 @@ const {
   getDebatchTransform,
   getStringArray,
   makeStrings,
-  writeTempFile,
+  writeTempFile
 } = require('./file/tools');
 
 const ForEachEntry = require('./ForEachEntry');
@@ -45,7 +44,7 @@ function getFormattedDate(dateObject, format = 'MMM DD,YYYY') {
 
 handlebars.registerHelper('date', (d, f) => {
   let format;
-  if (typeof f === 'string')format = f;
+  if (typeof f === 'string') format = f;
   return getFormattedDate(d, format);
 });
 handlebars.registerHelper('json', (d) => JSON.stringify(d));
@@ -60,11 +59,7 @@ async function list(_path) {
   const directory = await unzipper.Open.file(_path);
 
   return new Promise((resolve, reject) => {
-    directory.files[0]
-      .stream()
-      .pipe(fs.createWriteStream('firstFile'))
-      .on('error', reject)
-      .on('finish', resolve);
+    directory.files[0].stream().pipe(fs.createWriteStream('firstFile')).on('error', reject).on('finish', resolve);
   });
 }
 
@@ -74,11 +69,7 @@ async function extract(_path, _file) {
   const file = directory.files.find((d) => d.path === _file);
   const tempFilename = await getTempFilename({ source: _file });
   return new Promise((resolve, reject) => {
-    file
-      .stream()
-      .pipe(fs.createWriteStream(tempFilename))
-      .on('error', reject)
-      .on('finish', resolve);
+    file.stream().pipe(fs.createWriteStream(tempFilename)).on('error', reject).on('finish', resolve);
   });
 }
 
@@ -87,7 +78,7 @@ function appendFiles(existingFiles, _newFiles, options) {
   if (newFiles.length === 0) return;
   let { type, dateCreated } = options || {};
   if (!type) type = 'unknown';
-  if (!dateCreated)dateCreated = new Date().toISOString();
+  if (!dateCreated) dateCreated = new Date().toISOString();
   let arr = newFiles;
   if (!Array.isArray(newFiles)) arr = [arr];
 
@@ -96,7 +87,7 @@ function appendFiles(existingFiles, _newFiles, options) {
      type,
      originalFilename: '',
      isNew: true,
-      dateCreated,
+      dateCreated
    };
 
    if (typeof p === 'string') {
@@ -121,7 +112,7 @@ async function create(options) {
    messageFiles = [], // file with contents of message, used for delivery
    personFiles = [], // files with data on people
    timelineFiles = [], // activity entry
-    statisticsFiles = [],
+    statisticsFiles = [] // files with aggregate statistics
  } = options;
  if (options.peopleFiles) throw new Error('Unknown option: peopleFiles, did you mean personFiles?');
 
@@ -132,21 +123,21 @@ async function create(options) {
  appendFiles(files, timelineFiles, { type: 'timeline', dateCreated });
  appendFiles(files, statisticsFiles, { type: 'statistics', dateCreated });
 
-  const zipFilename = target || await getTempFilename({ postfix: '.packet.zip' });
+  const zipFilename = target || (await getTempFilename({ postfix: '.packet.zip' }));
 
  const manifest = {
    accountId,
    source: {
-      pluginId,
+      pluginId
    },
    dateCreated,
-    files,
+    files
  };
 
  // create a file to stream archive data to.
  const output = fs.createWriteStream(zipFilename);
  const archive = archiver('zip', {
-    zlib: { level: 9 },
+    zlib: { level: 9 } // Sets the compression level.
  });
  return new Promise((resolve, reject) => {
    debug(`Setting up write stream to ${zipFilename}`);
@@ -157,7 +148,7 @@ async function create(options) {
    debug(zipFilename);
    return resolve({
      filename: zipFilename,
-      bytes: archive.pointer(),
+      bytes: archive.pointer()
    });
  });
 
@@ -196,7 +187,6 @@ function intToByteArray(_v) {
  const byteArray = [0, 0, 0, 0, 0, 0, 0, 0];
  let v = _v;
  for (let index = 0; index < byteArray.length; index += 1) {
-    // eslint-disable-next-line no-bitwise
    const byte = v & 0xff;
    byteArray[index] = byte;
    v = (v - byte) / 256;
@@ -226,23 +216,26 @@ function getInputUUID(a, b) {
  return uuidv5(`${pluginId}:${rid}`, '3d0e5d99-6ba9-4fab-9bb2-c32304d3df8e');
}
 
-function getUUIDv7(date, inputUuid) { /* optional date and input UUID */
+function getUUIDv7(date, inputUuid) {
+  /* optional date and input UUID */
  const uuid = inputUuid || uuidv7();
  const bytes = Buffer.from(uuid.replace(/-/g, ''), 'hex');
  if (date !== undefined) {
    const d = new Date(date);
    // isNaN behaves differently than Number.isNaN -- we're actually going for the
    // attempted conversion here
-
+
    if (isNaN(d)) throw new Error(`getUUIDv7 got an invalid date:${date || '<blank>'}`);
    const dateBytes = intToByteArray(d.getTime()).reverse();
-    dateBytes.slice(2, 8).forEach((b, i) => { bytes[i] = b; });
+    dateBytes.slice(2, 8).forEach((b, i) => {
+      bytes[i] = b;
+    });
  }
  return uuidv4({ random: bytes });
}
/* Returns a date from a given uuid (assumed to be a v7, otherwise the results are ... weird */
function getUUIDTimestamp(uuid) {
-  const ts = parseInt(
+  const ts = parseInt(`${uuid}`.replace(/-/g, '').slice(0, 12), 16);
  return new Date(ts);
}
 
@@ -266,7 +259,8 @@ function getTimelineEntryUUID(inputObject, { defaults = {} } = {}) {
 
  if (o.remote_entry_id) {
    // get a temp ID
-    if (!o.input_id) throw new Error('Error generating timeline entry uuid -- remote_entry_id specified, but no input_id');
+    if (!o.input_id)
+      throw new Error('Error generating timeline entry uuid -- remote_entry_id specified, but no input_id');
    const uuid = uuidv5(o.remote_entry_id, o.input_id);
    // Change out the ts to match the v7 sorting.
    // But because outside specified remote_entry_uuid
@@ -274,14 +268,13 @@ function getTimelineEntryUUID(inputObject, { defaults = {} } = {}) {
    return getUUIDv7(o.ts, uuid);
  }
 
-  const missing = requiredTimelineEntryFields
-    .filter((d) => o[d] === undefined);// 0 could be an entry type value
+  const missing = requiredTimelineEntryFields.filter((d) => o[d] === undefined); // 0 could be an entry type value
 
  if (missing.length > 0) throw new Error(`Missing required fields to append an entry_id:${missing.join(',')}`);
  const ts = new Date(o.ts);
  // isNaN behaves differently than Number.isNaN -- we're actually going for the
  // attempted conversion here
-
+
  if (isNaN(ts)) throw new Error(`getTimelineEntryUUID got an invalid date:${o.ts || '<blank>'}`);
  const idString = `${ts.toISOString()}-${o.person_id}-${o.entry_type_id}-${o.source_code_id || 0}`;
 
@@ -308,6 +301,7 @@ function getEntryTypeId(o, { defaults = {} } = {}) {
}
 
module.exports = {
+  appendPostfix,
  bool,
  create,
  list,
@@ -339,5 +333,5 @@ module.exports = {
  uuidIsValid,
  uuidv4,
  uuidv5,
-  uuidv7,
+  uuidv7
};
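
The v7 UUID helpers reformatted above are inverses around the millisecond timestamp packed into a UUID's first six bytes: getUUIDv7(date, inputUuid) overwrites those bytes from the date, and getUUIDTimestamp(uuid) reads the first 12 hex characters back into a Date. A round-trip sketch, assuming both helpers are exported from the package index (this diff shows the export list only partially):

  const { getUUIDv7, getUUIDTimestamp } = require('@engine9-io/input-tools');

  const id = getUUIDv7(new Date('2024-01-01T00:00:00Z'));
  getUUIDTimestamp(id); // -> 2024-01-01T00:00:00.000Z
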
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@engine9-io/input-tools",
-  "version": "1.7.8",
+  "version": "1.8.0",
   "description": "Tools for dealing with Engine9 inputs",
   "main": "index.js",
   "scripts": {
@@ -30,6 +30,7 @@
     "throttle-debounce": "^5.0.2",
     "unzipper": "^0.12.1",
     "uuid": "^11.1.0",
+    "xlstream": "^2.5.5",
     "yargs": "^17.7.2"
   },
   "directories": {
package/test/processing/forEach.js
ADDED
@@ -0,0 +1,55 @@
+const { describe, it } = require('node:test');
+const assert = require('node:assert');
+const debug = require('debug')('test/forEach');
+
+const { ForEachEntry } = require('../../index');
+
+describe('Test Person File For Each', async () => {
+  it('forEachPerson Should loop through 1000 sample people', async () => {
+    let counter = 0;
+    const forEach = new ForEachEntry();
+    const result = await forEach.process({
+      packet: 'test/sample/1000_message.packet.zip',
+      batchSize: 50,
+      bindings: {
+        timelineOutputFileStream: {
+          path: 'output.timeline',
+          options: {
+            entry_type: 'ENTRY_OPTION'
+          }
+        },
+        sampleOutputFileStream: {
+          path: 'output.stream'
+        }
+      },
+      async transform(props) {
+        const { batch, timelineOutputFileStream, sampleOutputFileStream } = props;
+
+        batch.forEach((p) => {
+          if (Math.random() > 0.9) {
+            sampleOutputFileStream.push({
+              // for testing we don't need real person_ids
+              person_id: p.person_id || Math.floor(Math.random() * 1000000),
+              email: p.email,
+              entry_type: 'SAMPLE_OUTPUT'
+            });
+          }
+          timelineOutputFileStream.push({
+            // for testing we don't need real person_ids
+            person_id: p.person_id || Math.floor(Math.random() * 1000000),
+            email: p.email,
+            entry_type: 'EMAIL_DELIVERED'
+          });
+        });
+
+        batch.forEach(() => {
+          counter += 1;
+        });
+      }
+    });
+    assert(result.outputFiles?.timelineOutputFileStream?.[0]?.records);
+    assert(result.outputFiles?.sampleOutputFileStream?.[0]?.records);
+    assert.equal(counter, 1000, `Expected to loop through 1000 people, actual:${counter}`);
+  });
+  debug('Completed tests');
+});
package/test/packet/forEach.js
DELETED
@@ -1,63 +0,0 @@
-const {
-  describe, it,
-} = require('node:test');
-const assert = require('node:assert');
-const debug = require('debug')('test/forEach');
-
-const { ForEachEntry } = require('../../index');
-
-describe('Test Person Packet For Each', async () => {
-  it('forEachPerson Should loop through 1000 sample people', async () => {
-    let counter = 0;
-    const forEach = new ForEachEntry();
-    const result = await forEach.process(
-      {
-        packet: 'test/sample/1000_message.packet.zip',
-        batchSize: 50,
-        bindings: {
-          timelineOutputFileStream: {
-            path: 'output.timeline',
-            options: {
-              entry_type: 'ENTRY_OPTION',
-            },
-          },
-          sampleOutputFileStream: {
-            path: 'output.stream',
-          },
-        },
-        async transform(props) {
-          const {
-            batch,
-            timelineOutputFileStream,
-            sampleOutputFileStream,
-          } = props;
-
-          batch.forEach((p) => {
-            if (Math.random() > 0.9) {
-              sampleOutputFileStream.push({
-                // for testing we don't need real person_ids
-                person_id: p.person_id || Math.floor(Math.random() * 1000000),
-                email: p.email,
-                entry_type: 'SAMPLE_OUTPUT',
-              });
-            }
-            timelineOutputFileStream.push(
-              {
-                // for testing we don't need real person_ids
-                person_id: p.person_id || Math.floor(Math.random() * 1000000),
-                email: p.email,
-                entry_type: 'EMAIL_DELIVERED',
-              },
-            );
-          });
-
-          batch.forEach(() => { counter += 1; });
-        },
-      },
-    );
-    assert(result.outputFiles?.timelineOutputFileStream?.[0]?.records);
-    assert(result.outputFiles?.sampleOutputFileStream?.[0]?.records);
-    assert.equal(counter, 1000, `Expected to loop through 1000 people, actual:${counter}`);
-  });
-  debug('Completed tests');
-});
package/test/{packet → processing}/bigDataMessage.js
RENAMED (file without changes)

package/test/{packet → processing}/message.js
RENAMED (file without changes)

package/test/{packet → processing}/zip.js
RENAMED (file without changes)