@engine9-io/input-tools 1.7.9 → 1.8.1
This diff compares the contents of publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- package/file/FileUtilities.js +201 -135
- package/package.json +2 -1
package/file/FileUtilities.js
CHANGED

@@ -1,17 +1,15 @@
-/* eslint-disable no-await-in-loop */
 const fs = require('node:fs');
 
 const fsp = fs.promises;
 const path = require('node:path');
 const zlib = require('node:zlib');
-const {
-  Readable, Transform, PassThrough, Writable,
-} = require('node:stream');
+const { Readable, Transform, PassThrough, Writable } = require('node:stream');
 const { pipeline } = require('node:stream/promises');
 const { stringify } = require('csv');
 
 const debug = require('debug')('FileWorker');
 
+const { getXlsxStream } = require('xlstream');
 const csv = require('csv');
 const JSON5 = require('json5');
 const languageEncoding = require('detect-file-encoding-and-language');
@@ -20,10 +18,18 @@ const S3Worker = require('./S3');
 const ParquetWorker = require('./Parquet');
 
 const {
-  bool,
+  bool,
+  getTempFilename,
+  getStringArray,
+  getTempDir,
+  makeStrings,
+  streamPacket,
+  relativeDate
 } = require('./tools');
 
-function Worker({ accountId }) {
+function Worker({ accountId }) {
+  this.accountId = accountId;
+}
 
 class LineReaderTransform extends Transform {
   constructor(options = {}) {
@@ -31,7 +37,6 @@ class LineReaderTransform extends Transform {
     this.buffer = '';
   }
 
-  // eslint-disable-next-line no-underscore-dangle
   _transform(chunk, encoding, callback) {
     this.buffer += chunk.toString();
     const lines = this.buffer.split(/\r?\n/);
@@ -40,7 +45,6 @@ class LineReaderTransform extends Transform {
     callback();
   }
 
-  // eslint-disable-next-line no-underscore-dangle
   _flush(callback) {
     if (this.buffer) {
       this.push(this.buffer);
@@ -53,7 +57,11 @@ Worker.prototype.csvToObjectTransforms = function (options) {
   const transforms = [];
   const delimiter = options.delimiter || ',';
 
-  const headerMapping =
+  const headerMapping =
+    options.headerMapping ||
+    function (d) {
+      return d;
+    };
   let lastLine = null;
   let head = null;
 
@@ -63,13 +71,16 @@ Worker.prototype.csvToObjectTransforms = function (options) {
     skip_empty_lines: true,
     delimiter,
     max_limit_on_data_read: 10000000,
-    skip_lines_with_error: skipLinesWithError
+    skip_lines_with_error: skipLinesWithError
   };
   if (options.skip) parserOptions.from_line = options.skip;
   if (options.relax_column_count) parserOptions.relax_column_count = true;
   if (options.quote_escape) {
     parserOptions.escape = options.quote_escape;
   }
+  if (options.limit) {
+    parserOptions.to = options.limit;
+  }
 
   debug('Parser options=', parserOptions);
   const parser = csv.parse(parserOptions);
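Note: the limit handling added above maps the worker-level limit option onto the csv parser's "to" option, so parsing stops after that many records instead of reading the whole file. A minimal sketch of how a caller might use it (the require path and filename are assumptions, not from this diff):

    const FileWorker = require('@engine9-io/input-tools/file/FileUtilities'); // path assumed from package layout
    const worker = new FileWorker({ accountId: 'dev' });
    // Only parse the first 100 rows of a large CSV
    const { stream } = await worker.fileToObjectStream({ filename: 'people.csv', limit: 100 });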
@@ -101,7 +112,7 @@ Worker.prototype.csvToObjectTransforms = function (options) {
 
       lastLine = row.join(delimiter);
       return cb(null, o);
-    }
+    }
   });
 
   transforms.push(parser);
@@ -124,12 +135,15 @@ Worker.prototype.detectEncoding = async function (options) {
     // needed chunk size.
     finalBuff = await new Promise((resolve, reject) => {
       const bufferBuilder = [];
-      const decompressStream = zlib
+      const decompressStream = zlib
+        .createGunzip()
         .on('data', (chunk) => {
           bufferBuilder.push(chunk);
-        })
+        })
+        .on('close', () => {
          resolve(Buffer.concat(bufferBuilder));
-        })
+        })
+        .on('error', (err) => {
          if (err.errno !== -5) {
            // EOF: expected
            reject(err);
@@ -145,15 +159,57 @@ Worker.prototype.detectEncoding = async function (options) {
 
 Worker.prototype.detectEncoding.metadata = {
   options: {
-    filename: { required: true }
-  }
+    filename: { required: true }
+  }
+};
+
+Worker.prototype.xlsxToObjectStream = async function (options) {
+  let { filename } = options;
+
+  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
+    // We need to copy and delete
+    let worker = null;
+    if (filename.startsWith('r2://')) {
+      worker = new R2Worker(this);
+    } else {
+      worker = new S3Worker(this);
+    }
+    const target = getTempFilename({ targetFilename: filename.split('/').pop() });
+
+    await worker.copy({ filename, target });
+    filename = target;
+  }
+  let stream = await getXlsxStream({
+    filePath: filename,
+    sheet: 0
+  });
+  let keys = null;
+  stream = stream.pipe(
+    new Transform({
+      objectMode: true,
+      transform(d, enc, cb) {
+        if (!keys) {
+          keys = d?.raw.arr;
+          cb();
+        } else {
+          let o = {};
+          keys.forEach((k, i) => {
+            o[k] = d?.raw?.arr?.[i];
+          });
+          cb(null, o);
+        }
+      }
+    })
+  );
+
+  return { stream };
 };
 
 /*
-
+Commonly used method to transform a file into a stream of objects.
 */
 Worker.prototype.fileToObjectStream = async function (options) {
-  const { filename, columns, limit: limitOption,format:formatOverride } = options;
+  const { filename, columns, limit: limitOption, format: formatOverride } = options;
 
   // handle stream item
   if (options.stream) {
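Note: xlsxToObjectStream is new in this release. It copies s3:// or r2:// sources to a temp file first (xlstream wants a local path), streams sheet 0, treats the first row as the header, and emits one object per data row keyed by that header. A sketch of the expected call pattern (bucket and column names are illustrative):

    const { stream } = await worker.xlsxToObjectStream({ filename: 's3://some-bucket/contacts.xlsx' });
    for await (const row of stream) {
      // row is keyed by the header row, e.g. { Name: 'Ada', Email: 'ada@example.com' }
    }

fileToObjectStream also routes .xlsx files here automatically, per the next hunk.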
@@ -167,6 +223,9 @@ Worker.prototype.fileToObjectStream = async function (options) {
   let limit;
   if (limitOption) limit = parseInt(limitOption, 10);
   if (!filename) throw new Error('fileToObjectStream: filename is required');
+  if (filename.split('.').pop().toLowerCase() === 'xlsx') {
+    return this.xlsxToObjectStream(options);
+  }
   let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
   if (postfix === 'zip') {
     debug('Invalid filename:', { filename });
@@ -176,7 +235,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
   const streamInfo = await this.stream({
     filename,
     columns,
-    limit
+    limit
   });
   const { encoding } = streamInfo;
   let { stream } = streamInfo;
@@ -203,7 +262,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
   } else {
     stream.setEncoding(encoding);
   }
-  let format=formatOverride || postfix;
+  let format = formatOverride || postfix;
 
   if (format === 'csv') {
     const csvTransforms = this.csvToObjectTransforms({ ...options });
@@ -243,13 +302,15 @@ Worker.prototype.fileToObjectStream = async function (options) {
         }
         if (headers) {
           const mapped = {};
-          headers.forEach((name, i) => {
+          headers.forEach((name, i) => {
+            mapped[name] = obj[i];
+          });
           this.push(mapped);
         } else {
           this.push(obj);
         }
         return cb();
-      }
+      }
     });
 
     transforms.push(lineReader);
@@ -260,9 +321,11 @@ Worker.prototype.fileToObjectStream = async function (options) {
   const countAndDebug = new Transform({
     objectMode: true,
     transform(d, enc, cb) {
-      if (count === 0) {
+      if (count === 0) {
+        debug('Sample object from file:', d);
+      }
       count += 1;
-      if ((count < 5000 && count % 1000 === 0) ||
+      if ((count < 5000 && count % 1000 === 0) || count % 50000 === 0) {
        debug(`fileToObjectStream transformed ${count} lines`);
       }
       this.push(d);
@@ -279,7 +342,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
        this.push(o);
      } */
      cb();
-    }
+    }
   });
 
   transforms.push(countAndDebug);
@@ -319,14 +382,14 @@ Worker.prototype.getOutputStreams = async function (options) {
       objectMode: true,
       async transform(item, encoding, cb) {
         options.transform(item, encoding, cb);
-      }
+      }
     });
   } else {
     transform = new Transform({
       objectMode: true,
       async transform(item, encoding, cb) {
         cb(null, options.transform(item));
-      }
+      }
     });
   }
 } else if (options.transform) {
@@ -345,7 +408,7 @@ Worker.prototype.getOutputStreams = async function (options) {
       let v = item[k];
       if (!o[k]) {
        if (typeof v === 'object') {
-          while (Array.isArray(v)) [v] = v
+          while (Array.isArray(v)) [v] = v; // get first array item
          o = { ...o, ...v };
        } else {
          o[k] = v;
@@ -353,12 +416,12 @@ Worker.prototype.getOutputStreams = async function (options) {
       }
     });
     cb(null, o);
-    }
+    }
   });
 }
 
 const stats = {
-  records: 0
+  records: 0
 };
 let stringifier;
 if (options.targetFormat === 'jsonl') {
@@ -366,7 +429,7 @@ Worker.prototype.getOutputStreams = async function (options) {
     objectMode: true,
     transform(d, encoding, cb) {
       cb(false, `${JSON.stringify(d)}\n`);
-    }
+    }
   });
 } else {
   stringifier = stringify({ header: true });
@@ -383,11 +446,11 @@ Worker.prototype.getOutputStreams = async function (options) {
      transform(d, enc, cb) {
        stats.records += 1;
        cb(null, d);
-      }
+      }
    }),
    stringifier,
    gzip,
-    fileWriterStream
+    fileWriterStream
  ].filter(Boolean);
  return { filename, streams, stats };
};
@@ -395,9 +458,7 @@ Worker.prototype.objectStreamToFile = async function (options) {
  const { filename, streams, stats } = await this.getOutputStreams(options);
  const { stream: inStream } = options;
  streams.unshift(inStream);
-  await pipeline(
-    streams,
-  );
+  await pipeline(streams);
  return { filename, records: stats.records };
};
 
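Note: objectStreamToFile collapses the pipeline call onto one line; the promise-based pipeline from node:stream/promises accepts an array of streams, so behavior is unchanged. Usage stays the same, e.g. (input data invented for illustration):

    const { Readable } = require('node:stream');
    const { filename, records } = await worker.objectStreamToFile({
      stream: Readable.from([{ id: 1 }, { id: 2 }]),
      targetFormat: 'jsonl'
    });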
@@ -432,7 +493,7 @@ Worker.prototype.transform = async function (options) {
   if (typeof f === 'function') {
     f = new Transform({
       objectMode: true,
-      transform: f
+      transform: f
     });
   }
 
@@ -441,7 +502,10 @@ Worker.prototype.transform = async function (options) {
 
   const { targetFormat } = options;
 
-  if (
+  if (
+    !targetFormat &&
+    (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
+  ) {
     options.targetFormat = 'csv';
   }
 
@@ -453,33 +517,34 @@ Worker.prototype.transform.metadata = {
    sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
    encoding: { description: 'Manual override of source file encoding' },
    names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
-    values: {
+    values: {
+      description:
+        "Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)"
+    },
    targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
    targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
    targetRowDelimiter: { description: 'Row delimiter (default \n)' },
-    targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
-  }
+    targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
+  }
 };
 Worker.prototype.testTransform = async function (options) {
   return this.transform({
     ...options,
-    transform(d, enc, cb) {
+    transform(d, enc, cb) {
+      d.transform_time = new Date();
+      cb(null, d);
+    }
   });
 };
 Worker.prototype.testTransform.metadata = {
   options: {
-    filename: true
-  }
+    filename: true
+  }
 };
 
 /* Get a stream from an actual stream, or an array, or a file */
-Worker.prototype.stream = async function (
-  options
-) {
-  const {
-    stream: inputStream, packet, type, columns, limit,
-    filename: filenameOpt,
-  } = options;
+Worker.prototype.stream = async function (options) {
+  const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
   let filename = filenameOpt;
 
   if (inputStream) {
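Note: testTransform runs transform with a pass-through that stamps each record with transform_time, which makes it a quick smoke test for the transform pipeline; per its metadata, filename is the only required option. A sketch (local file assumed):

    const result = await worker.testTransform({ filename: 'people.csv' });
    // each output row should carry a transform_time value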
@@ -496,7 +561,8 @@ Worker.prototype.stream = async function (
   } else {
     // debug(`Not prepending filename:${filename}`);
   }
-  let encoding;
+  let encoding;
+  let stream;
   if (filename.slice(-8) === '.parquet') {
     const pq = new ParquetWorker(this);
     stream = (await pq.stream({ filename, columns, limit })).stream;
@@ -541,9 +607,8 @@ Worker.prototype.sample = async function (opts) {
 };
 Worker.prototype.sample.metadata = {
   options: {
-    filename: {}
-
-  },
+    filename: {}
+  }
 };
 Worker.prototype.toArray = async function (opts) {
   const { stream } = await this.fileToObjectStream(opts);
@@ -551,8 +616,8 @@ Worker.prototype.toArray = async function (opts) {
 };
 Worker.prototype.toArray.metadata = {
   options: {
-    filename: {}
-  }
+    filename: {}
+  }
 };
 
 Worker.prototype.write = async function (opts) {
@@ -566,7 +631,7 @@ Worker.prototype.write = async function (opts) {
     await worker.write({
       directory,
       file,
-      content
+      content
     });
   } else {
     await fsp.writeFile(filename, content);
@@ -576,15 +641,14 @@ Worker.prototype.write = async function (opts) {
 Worker.prototype.write.metadata = {
   options: {
     filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
-    content: {}
-  }
+    content: {}
+  }
 };
 
 async function streamToString(stream) {
   // lets have a ReadableStream as a stream variable
   const chunks = [];
 
-  // eslint-disable-next-line no-restricted-syntax
   for await (const chunk of stream) {
     chunks.push(Buffer.from(chunk));
   }
@@ -606,47 +670,46 @@ Worker.prototype.json = async function (opts) {
 };
 Worker.prototype.json.metadata = {
   options: {
-    filename: { description: 'Get a javascript object from a file' }
-  }
+    filename: { description: 'Get a javascript object from a file' }
+  }
 };
 
-Worker.prototype.list = async function ({ directory, start:s, end:e }) {
+Worker.prototype.list = async function ({ directory, start: s, end: e }) {
   if (!directory) throw new Error('directory is required');
-  let start=null;
-  let end=null;
-  if (s) start=relativeDate(s);
-  if (e) end=relativeDate(e);
-
+  let start = null;
+  let end = null;
+  if (s) start = relativeDate(s);
+  if (e) end = relativeDate(e);
+
   if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
     const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
     return worker.list({ directory, start, end });
   }
   const a = await fsp.readdir(directory, { withFileTypes: true });
 
-  const withModified=[];
+  const withModified = [];
   for (const file of a) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+    const fullPath = path.join(directory, file.name);
+    const stats = await fsp.stat(fullPath);
+    if (start && stats.mtime < start.getTime()) {
+      //do not include
+    } else if (end && stats.mtime > end.getTime()) {
+      //do nothing
+    } else {
+      withModified.push({
+        name: file.name,
+        type: file.isDirectory() ? 'directory' : 'file',
+        modifiedAt: new Date(stats.mtime).toISOString()
+      });
+    }
   }
-
+
   return withModified;
-
 };
 Worker.prototype.list.metadata = {
   options: {
-    directory: { required: true }
-  }
+    directory: { required: true }
+  }
 };
 
 Worker.prototype.listAll = async function ({ directory }) {
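Note: list resolves its optional start and end bounds through relativeDate and, for local directories, filters on each file's mtime via fsp.stat. Something like the following should return only recently modified entries (the exact strings relativeDate accepts are not shown in this diff, so '-7d' is a guess):

    const files = await worker.list({ directory: '/tmp/uploads', start: '-7d' });
    // -> [{ name, type: 'file' | 'directory', modifiedAt }, ...]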
@@ -661,8 +724,8 @@ Worker.prototype.listAll = async function ({ directory }) {
 };
 Worker.prototype.listAll.metadata = {
   options: {
-    directory: { required: true }
-  }
+    directory: { required: true }
+  }
 };
 
 Worker.prototype.empty = async function ({ directory }) {
@@ -672,7 +735,7 @@ Worker.prototype.empty = async function ({ directory }) {
     throw new Error('Cannot empty an s3:// or r2:// directory');
   }
   const removed = [];
-
+
   for (const file of await fsp.readdir(directory)) {
     removed.push(file);
     await fsp.unlink(path.join(directory, file));
@@ -681,8 +744,8 @@ Worker.prototype.empty = async function ({ directory }) {
 };
 Worker.prototype.empty.metadata = {
   options: {
-    directory: { required: true }
-  }
+    directory: { required: true }
+  }
 };
 
 Worker.prototype.remove = async function ({ filename }) {
@@ -705,16 +768,18 @@ Worker.prototype.remove = async function ({ filename }) {
 };
 Worker.prototype.remove.metadata = {
   options: {
-    filename: {}
-  }
+    filename: {}
+  }
 };
 
 Worker.prototype.move = async function ({ filename, target }) {
   if (!target) throw new Error('target is required');
   if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
   if (target.startsWith('s3://') || target.startsWith('r2://')) {
-    if (
-
+    if (
+      (target.startsWith('s3://') && filename.startsWith('r2://')) ||
+      (target.startsWith('r2://') && filename.startsWith('s3://'))
+    ) {
       throw new Error('Cowardly not copying between services');
     }
 
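Note: move guards against copying between the two object stores in either direction (s3:// to r2://, or r2:// to s3://); same-service and local-to-remote moves still work. For example (bucket names invented):

    await worker.move({ filename: '/tmp/out.csv.gz', target: 's3://my-bucket/out.csv.gz' }); // fine
    await worker.move({ filename: 'r2://bucket-a/x.csv', target: 's3://bucket-b/x.csv' });   // throws 'Cowardly not copying between services'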
@@ -741,8 +806,8 @@ Worker.prototype.move = async function ({ filename, target }) {
 Worker.prototype.move.metadata = {
   options: {
     filename: {},
-    target: {}
-  }
+    target: {}
+  }
 };
 
 Worker.prototype.stat = async function ({ filename }) {
@@ -751,11 +816,7 @@ Worker.prototype.stat = async function ({ filename }) {
     const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
     return worker.stat({ filename });
   }
-  const {
-    ctime,
-    birthtime,
-    size,
-  } = await fsp.stat(filename);
+  const { ctime, birthtime, size } = await fsp.stat(filename);
   const modifiedAt = new Date(ctime);
   let createdAt = birthtime;
   if (createdAt === 0 || !createdAt) createdAt = ctime;
@@ -763,13 +824,13 @@ Worker.prototype.stat = async function ({ filename }) {
   return {
     createdAt,
     modifiedAt,
-    size
+    size
   };
 };
 Worker.prototype.stat.metadata = {
   options: {
-    filename: {}
-  }
+    filename: {}
+  }
 };
 
 Worker.prototype.download = async function ({ filename }) {
@@ -782,8 +843,8 @@ Worker.prototype.download = async function ({ filename }) {
 };
 Worker.prototype.download.metadata = {
   options: {
-    filename: {}
-  }
+    filename: {}
+  }
 };
 
 Worker.prototype.head = async function (options) {
@@ -792,7 +853,7 @@ Worker.prototype.head = async function (options) {
   const chunks = [];
 
   let counter = 0;
-
+
   for await (const chunk of stream) {
     chunks.push(chunk);
     counter += 1;
@@ -804,8 +865,8 @@ Worker.prototype.head = async function (options) {
 
 Worker.prototype.head.metadata = {
   options: {
-    filename: { required: true }
-  }
+    filename: { required: true }
+  }
 };
 
 Worker.prototype.count = async function (options) {
@@ -814,7 +875,7 @@ Worker.prototype.count = async function (options) {
 
   const limit = options.limit || 5;
   let records = 0;
-
+
   for await (const chunk of stream) {
     records += 1;
     if (records < limit) {
@@ -827,8 +888,8 @@ Worker.prototype.count = async function (options) {
 
 Worker.prototype.count.metadata = {
   options: {
-    filename: { required: true }
-  }
+    filename: { required: true }
+  }
 };
 
 // Get a set of unique entries from a uniqueFunction
@@ -839,10 +900,10 @@ Worker.prototype.getUniqueSet = async function (options) {
 
   let { uniqueFunction } = options;
   if (!uniqueFunction) {
-    uniqueFunction = (
+    uniqueFunction = (o) => JSON.stringify(o);
   }
   const uniqueSet = new Set();
-
+
   for (const filename of existingFiles) {
     const { stream: existsStream } = await this.fileToObjectStream({ filename });
     await pipeline(
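Note: getUniqueSet's default uniqueFunction is JSON.stringify of the whole record, so two rows count as duplicates only when every field matches. Passing a narrower key still works, e.g. (option names taken from the getUniqueStream call site below):

    const { uniqueSet } = await worker.getUniqueSet({
      filenames: ['existing.csv'],
      uniqueFunction: (o) => o.email // dedupe on one field instead of the full JSON
    });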
@@ -856,14 +917,14 @@ Worker.prototype.getUniqueSet = async function (options) {
        }
        uniqueSet.add(v);
        cb(null, d);
-      }
+      }
      }),
      new Writable({
        objectMode: true,
        write(d, enc, cb) {
          cb();
-        }
-      })
+        }
+      })
    );
    debug(`Finished loading ${filename}`);
  }
@@ -875,7 +936,7 @@ Worker.prototype.getUniqueStream = async function (options) {
 
   const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
     filenames: options.existingFiles,
-    uniqueFunction: options.uniqueFunction
+    uniqueFunction: options.uniqueFunction
   });
 
   const { stream: inStream } = await this.fileToObjectStream(options);
@@ -899,8 +960,8 @@ Worker.prototype.getUniqueStream = async function (options) {
        }
        cb(null, d);
      }
-    }
-  })
+    }
+  })
   );
   return { stream: uniqueStream, sample };
 };
@@ -912,9 +973,9 @@ Worker.prototype.getUniqueStream.metadata = {
    filename: { description: 'Specify a source filename or a stream' },
    stream: { description: 'Specify a source filename or a stream' },
    includeDuplicateSourceRecords: {
-      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
-    }
-  }
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
 };
 Worker.prototype.getUniqueFile = async function (options) {
   const { stream, sample } = await this.getUniqueStream(options);
@@ -929,9 +990,9 @@ Worker.prototype.getUniqueFile.metadata = {
    filename: { description: 'Specify a source filename or a stream' },
    stream: { description: 'Specify a source filename or a stream' },
    includeDuplicateSourceRecords: {
-      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
-    }
-  }
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
 };
 
 /*
@@ -940,7 +1001,11 @@ Requires 2 passes of the files,
 but that's a better tradeoff than trying to store huge files in memory
 */
 Worker.prototype.diff = async function ({
-  fileA,
+  fileA,
+  fileB,
+  uniqueFunction: ufOpt,
+  fields,
+  includeDuplicateSourceRecords
 }) {
   if (ufOpt && fields) throw new Error('fields and uniqueFunction cannot both be specified');
   let uniqueFunction = ufOpt;
@@ -953,17 +1018,18 @@ Worker.prototype.diff = async function ({
     existingFiles: [fileB],
     filename: fileA,
     uniqueFunction,
-    includeDuplicateSourceRecords
+    includeDuplicateSourceRecords
   });
   const right = await this.getUniqueFile({
     existingFiles: [fileA],
     filename: fileB,
     uniqueFunction,
-    includeDuplicateSourceRecords
+    includeDuplicateSourceRecords
   });
 
   return {
-    left,
+    left,
+    right
   };
 };
 Worker.prototype.diff.metadata = {
@@ -973,9 +1039,9 @@ Worker.prototype.diff.metadata = {
    fields: { description: 'Fields to use for uniqueness -- aka primary key. Defaults to JSON of line' },
    uniqueFunction: {},
    includeDuplicateSourceRecords: {
-      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
-    }
-  }
+      description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
+    }
+  }
 };
 
 module.exports = Worker;
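Note: diff dedupes each file against the other via getUniqueFile, so it still takes two passes over the data, and it returns both sides as { left, right }. A sketch (filenames invented; per the metadata, fields and uniqueFunction are mutually exclusive):

    const { left, right } = await worker.diff({
      fileA: 'yesterday.csv',
      fileB: 'today.csv',
      fields: 'id' // primary-key style uniqueness; defaults to JSON of the whole line
    });
    // left: records only in fileA; right: records only in fileB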
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@engine9-io/input-tools",
-  "version": "1.7.9",
+  "version": "1.8.1",
   "description": "Tools for dealing with Engine9 inputs",
   "main": "index.js",
   "scripts": {
@@ -30,6 +30,7 @@
     "throttle-debounce": "^5.0.2",
     "unzipper": "^0.12.1",
     "uuid": "^11.1.0",
+    "xlstream": "^2.5.5",
     "yargs": "^17.7.2"
   },
   "directories": {