@engine9-io/input-tools 1.7.9 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/file/FileUtilities.js +198 -135
- package/package.json +2 -1
package/file/FileUtilities.js
CHANGED
|
@@ -1,17 +1,15 @@
|
|
|
1
|
-
/* eslint-disable no-await-in-loop */
|
|
2
1
|
const fs = require('node:fs');
|
|
3
2
|
|
|
4
3
|
const fsp = fs.promises;
|
|
5
4
|
const path = require('node:path');
|
|
6
5
|
const zlib = require('node:zlib');
|
|
7
|
-
const {
|
|
8
|
-
Readable, Transform, PassThrough, Writable,
|
|
9
|
-
} = require('node:stream');
|
|
6
|
+
const { Readable, Transform, PassThrough, Writable } = require('node:stream');
|
|
10
7
|
const { pipeline } = require('node:stream/promises');
|
|
11
8
|
const { stringify } = require('csv');
|
|
12
9
|
|
|
13
10
|
const debug = require('debug')('FileWorker');
|
|
14
11
|
|
|
12
|
+
const { getXlsxStream } = require('xlstream');
|
|
15
13
|
const csv = require('csv');
|
|
16
14
|
const JSON5 = require('json5');
|
|
17
15
|
const languageEncoding = require('detect-file-encoding-and-language');
|
|
@@ -20,10 +18,18 @@ const S3Worker = require('./S3');
|
|
|
20
18
|
const ParquetWorker = require('./Parquet');
|
|
21
19
|
|
|
22
20
|
const {
|
|
23
|
-
bool,
|
|
21
|
+
bool,
|
|
22
|
+
getTempFilename,
|
|
23
|
+
getStringArray,
|
|
24
|
+
getTempDir,
|
|
25
|
+
makeStrings,
|
|
26
|
+
streamPacket,
|
|
27
|
+
relativeDate
|
|
24
28
|
} = require('./tools');
|
|
25
29
|
|
|
26
|
-
function Worker({ accountId }) {
|
|
30
|
+
function Worker({ accountId }) {
|
|
31
|
+
this.accountId = accountId;
|
|
32
|
+
}
|
|
27
33
|
|
|
28
34
|
class LineReaderTransform extends Transform {
|
|
29
35
|
constructor(options = {}) {
|
|
@@ -31,7 +37,6 @@ class LineReaderTransform extends Transform {
|
|
|
31
37
|
this.buffer = '';
|
|
32
38
|
}
|
|
33
39
|
|
|
34
|
-
// eslint-disable-next-line no-underscore-dangle
|
|
35
40
|
_transform(chunk, encoding, callback) {
|
|
36
41
|
this.buffer += chunk.toString();
|
|
37
42
|
const lines = this.buffer.split(/\r?\n/);
|
|
@@ -40,7 +45,6 @@ class LineReaderTransform extends Transform {
|
|
|
40
45
|
callback();
|
|
41
46
|
}
|
|
42
47
|
|
|
43
|
-
// eslint-disable-next-line no-underscore-dangle
|
|
44
48
|
_flush(callback) {
|
|
45
49
|
if (this.buffer) {
|
|
46
50
|
this.push(this.buffer);
|
|
@@ -53,7 +57,11 @@ Worker.prototype.csvToObjectTransforms = function (options) {
|
|
|
53
57
|
const transforms = [];
|
|
54
58
|
const delimiter = options.delimiter || ',';
|
|
55
59
|
|
|
56
|
-
const headerMapping =
|
|
60
|
+
const headerMapping =
|
|
61
|
+
options.headerMapping ||
|
|
62
|
+
function (d) {
|
|
63
|
+
return d;
|
|
64
|
+
};
|
|
57
65
|
let lastLine = null;
|
|
58
66
|
let head = null;
|
|
59
67
|
|
|
@@ -63,7 +71,7 @@ Worker.prototype.csvToObjectTransforms = function (options) {
|
|
|
63
71
|
skip_empty_lines: true,
|
|
64
72
|
delimiter,
|
|
65
73
|
max_limit_on_data_read: 10000000,
|
|
66
|
-
skip_lines_with_error: skipLinesWithError
|
|
74
|
+
skip_lines_with_error: skipLinesWithError
|
|
67
75
|
};
|
|
68
76
|
if (options.skip) parserOptions.from_line = options.skip;
|
|
69
77
|
if (options.relax_column_count) parserOptions.relax_column_count = true;
|
|
@@ -101,7 +109,7 @@ Worker.prototype.csvToObjectTransforms = function (options) {
|
|
|
101
109
|
|
|
102
110
|
lastLine = row.join(delimiter);
|
|
103
111
|
return cb(null, o);
|
|
104
|
-
}
|
|
112
|
+
}
|
|
105
113
|
});
|
|
106
114
|
|
|
107
115
|
transforms.push(parser);
|
|
@@ -124,12 +132,15 @@ Worker.prototype.detectEncoding = async function (options) {
|
|
|
124
132
|
// needed chunk size.
|
|
125
133
|
finalBuff = await new Promise((resolve, reject) => {
|
|
126
134
|
const bufferBuilder = [];
|
|
127
|
-
const decompressStream = zlib
|
|
135
|
+
const decompressStream = zlib
|
|
136
|
+
.createGunzip()
|
|
128
137
|
.on('data', (chunk) => {
|
|
129
138
|
bufferBuilder.push(chunk);
|
|
130
|
-
})
|
|
139
|
+
})
|
|
140
|
+
.on('close', () => {
|
|
131
141
|
resolve(Buffer.concat(bufferBuilder));
|
|
132
|
-
})
|
|
142
|
+
})
|
|
143
|
+
.on('error', (err) => {
|
|
133
144
|
if (err.errno !== -5) {
|
|
134
145
|
// EOF: expected
|
|
135
146
|
reject(err);
|
|
@@ -145,15 +156,57 @@ Worker.prototype.detectEncoding = async function (options) {
|
|
|
145
156
|
|
|
146
157
|
Worker.prototype.detectEncoding.metadata = {
|
|
147
158
|
options: {
|
|
148
|
-
filename: { required: true }
|
|
149
|
-
}
|
|
159
|
+
filename: { required: true }
|
|
160
|
+
}
|
|
161
|
+
};
|
|
162
|
+
|
|
163
|
+
Worker.prototype.xlsxToObjectStream = async function (options) {
|
|
164
|
+
let { filename } = options;
|
|
165
|
+
|
|
166
|
+
if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
|
|
167
|
+
// We need to copy and delete
|
|
168
|
+
let worker = null;
|
|
169
|
+
if (filename.startsWith('r2://')) {
|
|
170
|
+
worker = new R2Worker(this);
|
|
171
|
+
} else {
|
|
172
|
+
worker = new S3Worker(this);
|
|
173
|
+
}
|
|
174
|
+
const target = getTempFilename({ targetFilename: filename.split('/').pop() });
|
|
175
|
+
|
|
176
|
+
await worker.copy({ filename, target });
|
|
177
|
+
filename = target;
|
|
178
|
+
}
|
|
179
|
+
let stream = await getXlsxStream({
|
|
180
|
+
filePath: filename,
|
|
181
|
+
sheet: 0
|
|
182
|
+
});
|
|
183
|
+
let keys = null;
|
|
184
|
+
stream = stream.pipe(
|
|
185
|
+
new Transform({
|
|
186
|
+
objectMode: true,
|
|
187
|
+
transform(d, enc, cb) {
|
|
188
|
+
if (!keys) {
|
|
189
|
+
keys = d?.raw.arr;
|
|
190
|
+
cb();
|
|
191
|
+
} else {
|
|
192
|
+
let o = {};
|
|
193
|
+
keys.forEach((k, i) => {
|
|
194
|
+
o[k] = d?.raw?.arr?.[i];
|
|
195
|
+
});
|
|
196
|
+
cb(null, o);
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
})
|
|
200
|
+
);
|
|
201
|
+
|
|
202
|
+
return { stream };
|
|
150
203
|
};
|
|
151
204
|
|
|
152
205
|
/*
|
|
153
|
-
|
|
206
|
+
Commonly used method to transform a file into a stream of objects.
|
|
154
207
|
*/
|
|
155
208
|
Worker.prototype.fileToObjectStream = async function (options) {
|
|
156
|
-
const { filename, columns, limit: limitOption,format:formatOverride } = options;
|
|
209
|
+
const { filename, columns, limit: limitOption, format: formatOverride } = options;
|
|
157
210
|
|
|
158
211
|
// handle stream item
|
|
159
212
|
if (options.stream) {
|
|
@@ -167,6 +220,9 @@ Worker.prototype.fileToObjectStream = async function (options) {
|
|
|
167
220
|
let limit;
|
|
168
221
|
if (limitOption) limit = parseInt(limitOption, 10);
|
|
169
222
|
if (!filename) throw new Error('fileToObjectStream: filename is required');
|
|
223
|
+
if (filename.split('.').pop().toLowerCase() === 'xlsx') {
|
|
224
|
+
return this.xlsxToObjectStream(options);
|
|
225
|
+
}
|
|
170
226
|
let postfix = options.sourcePostfix || filename.toLowerCase().split('.').pop();
|
|
171
227
|
if (postfix === 'zip') {
|
|
172
228
|
debug('Invalid filename:', { filename });
|
|
@@ -176,7 +232,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
|
|
|
176
232
|
const streamInfo = await this.stream({
|
|
177
233
|
filename,
|
|
178
234
|
columns,
|
|
179
|
-
limit
|
|
235
|
+
limit
|
|
180
236
|
});
|
|
181
237
|
const { encoding } = streamInfo;
|
|
182
238
|
let { stream } = streamInfo;
|
|
@@ -203,7 +259,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
|
|
|
203
259
|
} else {
|
|
204
260
|
stream.setEncoding(encoding);
|
|
205
261
|
}
|
|
206
|
-
let format=formatOverride || postfix;
|
|
262
|
+
let format = formatOverride || postfix;
|
|
207
263
|
|
|
208
264
|
if (format === 'csv') {
|
|
209
265
|
const csvTransforms = this.csvToObjectTransforms({ ...options });
|
|
@@ -243,13 +299,15 @@ Worker.prototype.fileToObjectStream = async function (options) {
|
|
|
243
299
|
}
|
|
244
300
|
if (headers) {
|
|
245
301
|
const mapped = {};
|
|
246
|
-
headers.forEach((name, i) => {
|
|
302
|
+
headers.forEach((name, i) => {
|
|
303
|
+
mapped[name] = obj[i];
|
|
304
|
+
});
|
|
247
305
|
this.push(mapped);
|
|
248
306
|
} else {
|
|
249
307
|
this.push(obj);
|
|
250
308
|
}
|
|
251
309
|
return cb();
|
|
252
|
-
}
|
|
310
|
+
}
|
|
253
311
|
});
|
|
254
312
|
|
|
255
313
|
transforms.push(lineReader);
|
|
@@ -260,9 +318,11 @@ Worker.prototype.fileToObjectStream = async function (options) {
|
|
|
260
318
|
const countAndDebug = new Transform({
|
|
261
319
|
objectMode: true,
|
|
262
320
|
transform(d, enc, cb) {
|
|
263
|
-
if (count === 0) {
|
|
321
|
+
if (count === 0) {
|
|
322
|
+
debug('Sample object from file:', d);
|
|
323
|
+
}
|
|
264
324
|
count += 1;
|
|
265
|
-
if ((count < 5000 && count % 1000 === 0) ||
|
|
325
|
+
if ((count < 5000 && count % 1000 === 0) || count % 50000 === 0) {
|
|
266
326
|
debug(`fileToObjectStream transformed ${count} lines`);
|
|
267
327
|
}
|
|
268
328
|
this.push(d);
|
|
@@ -279,7 +339,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
|
|
|
279
339
|
this.push(o);
|
|
280
340
|
} */
|
|
281
341
|
cb();
|
|
282
|
-
}
|
|
342
|
+
}
|
|
283
343
|
});
|
|
284
344
|
|
|
285
345
|
transforms.push(countAndDebug);
|
|
@@ -319,14 +379,14 @@ Worker.prototype.getOutputStreams = async function (options) {
|
|
|
319
379
|
objectMode: true,
|
|
320
380
|
async transform(item, encoding, cb) {
|
|
321
381
|
options.transform(item, encoding, cb);
|
|
322
|
-
}
|
|
382
|
+
}
|
|
323
383
|
});
|
|
324
384
|
} else {
|
|
325
385
|
transform = new Transform({
|
|
326
386
|
objectMode: true,
|
|
327
387
|
async transform(item, encoding, cb) {
|
|
328
388
|
cb(null, options.transform(item));
|
|
329
|
-
}
|
|
389
|
+
}
|
|
330
390
|
});
|
|
331
391
|
}
|
|
332
392
|
} else if (options.transform) {
|
|
@@ -345,7 +405,7 @@ Worker.prototype.getOutputStreams = async function (options) {
|
|
|
345
405
|
let v = item[k];
|
|
346
406
|
if (!o[k]) {
|
|
347
407
|
if (typeof v === 'object') {
|
|
348
|
-
while (Array.isArray(v)) [v] = v
|
|
408
|
+
while (Array.isArray(v)) [v] = v; // get first array item
|
|
349
409
|
o = { ...o, ...v };
|
|
350
410
|
} else {
|
|
351
411
|
o[k] = v;
|
|
@@ -353,12 +413,12 @@ Worker.prototype.getOutputStreams = async function (options) {
|
|
|
353
413
|
}
|
|
354
414
|
});
|
|
355
415
|
cb(null, o);
|
|
356
|
-
}
|
|
416
|
+
}
|
|
357
417
|
});
|
|
358
418
|
}
|
|
359
419
|
|
|
360
420
|
const stats = {
|
|
361
|
-
records: 0
|
|
421
|
+
records: 0
|
|
362
422
|
};
|
|
363
423
|
let stringifier;
|
|
364
424
|
if (options.targetFormat === 'jsonl') {
|
|
@@ -366,7 +426,7 @@ Worker.prototype.getOutputStreams = async function (options) {
|
|
|
366
426
|
objectMode: true,
|
|
367
427
|
transform(d, encoding, cb) {
|
|
368
428
|
cb(false, `${JSON.stringify(d)}\n`);
|
|
369
|
-
}
|
|
429
|
+
}
|
|
370
430
|
});
|
|
371
431
|
} else {
|
|
372
432
|
stringifier = stringify({ header: true });
|
|
@@ -383,11 +443,11 @@ Worker.prototype.getOutputStreams = async function (options) {
|
|
|
383
443
|
transform(d, enc, cb) {
|
|
384
444
|
stats.records += 1;
|
|
385
445
|
cb(null, d);
|
|
386
|
-
}
|
|
446
|
+
}
|
|
387
447
|
}),
|
|
388
448
|
stringifier,
|
|
389
449
|
gzip,
|
|
390
|
-
fileWriterStream
|
|
450
|
+
fileWriterStream
|
|
391
451
|
].filter(Boolean);
|
|
392
452
|
return { filename, streams, stats };
|
|
393
453
|
};
|
|
@@ -395,9 +455,7 @@ Worker.prototype.objectStreamToFile = async function (options) {
|
|
|
395
455
|
const { filename, streams, stats } = await this.getOutputStreams(options);
|
|
396
456
|
const { stream: inStream } = options;
|
|
397
457
|
streams.unshift(inStream);
|
|
398
|
-
await pipeline(
|
|
399
|
-
streams,
|
|
400
|
-
);
|
|
458
|
+
await pipeline(streams);
|
|
401
459
|
return { filename, records: stats.records };
|
|
402
460
|
};
|
|
403
461
|
|
|
@@ -432,7 +490,7 @@ Worker.prototype.transform = async function (options) {
|
|
|
432
490
|
if (typeof f === 'function') {
|
|
433
491
|
f = new Transform({
|
|
434
492
|
objectMode: true,
|
|
435
|
-
transform: f
|
|
493
|
+
transform: f
|
|
436
494
|
});
|
|
437
495
|
}
|
|
438
496
|
|
|
@@ -441,7 +499,10 @@ Worker.prototype.transform = async function (options) {
|
|
|
441
499
|
|
|
442
500
|
const { targetFormat } = options;
|
|
443
501
|
|
|
444
|
-
if (
|
|
502
|
+
if (
|
|
503
|
+
!targetFormat &&
|
|
504
|
+
(filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
|
|
505
|
+
) {
|
|
445
506
|
options.targetFormat = 'csv';
|
|
446
507
|
}
|
|
447
508
|
|
|
@@ -453,33 +514,34 @@ Worker.prototype.transform.metadata = {
|
|
|
453
514
|
sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
|
|
454
515
|
encoding: { description: 'Manual override of source file encoding' },
|
|
455
516
|
names: { description: 'Target field names (e.g. my_new_field,x,y,z)' },
|
|
456
|
-
values: {
|
|
517
|
+
values: {
|
|
518
|
+
description:
|
|
519
|
+
"Comma delimited source field name, or Handlebars [[ ]] merge fields (e.g. 'my_field,x,y,z', '[[field1]]-[[field2]]', etc)"
|
|
520
|
+
},
|
|
457
521
|
targetFilename: { description: 'Custom name of the output file (default auto-generated)' },
|
|
458
522
|
targetFormat: { description: 'Output format -- csv supported, or none for txt (default)' },
|
|
459
523
|
targetRowDelimiter: { description: 'Row delimiter (default \n)' },
|
|
460
|
-
targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
|
|
461
|
-
}
|
|
524
|
+
targetFieldDelimiter: { description: 'Field delimiter (default \t or ,)' }
|
|
525
|
+
}
|
|
462
526
|
};
|
|
463
527
|
Worker.prototype.testTransform = async function (options) {
|
|
464
528
|
return this.transform({
|
|
465
529
|
...options,
|
|
466
|
-
transform(d, enc, cb) {
|
|
530
|
+
transform(d, enc, cb) {
|
|
531
|
+
d.transform_time = new Date();
|
|
532
|
+
cb(null, d);
|
|
533
|
+
}
|
|
467
534
|
});
|
|
468
535
|
};
|
|
469
536
|
Worker.prototype.testTransform.metadata = {
|
|
470
537
|
options: {
|
|
471
|
-
filename: true
|
|
472
|
-
}
|
|
538
|
+
filename: true
|
|
539
|
+
}
|
|
473
540
|
};
|
|
474
541
|
|
|
475
542
|
/* Get a stream from an actual stream, or an array, or a file */
|
|
476
|
-
Worker.prototype.stream = async function (
|
|
477
|
-
options
|
|
478
|
-
) {
|
|
479
|
-
const {
|
|
480
|
-
stream: inputStream, packet, type, columns, limit,
|
|
481
|
-
filename: filenameOpt,
|
|
482
|
-
} = options;
|
|
543
|
+
Worker.prototype.stream = async function (options) {
|
|
544
|
+
const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
|
|
483
545
|
let filename = filenameOpt;
|
|
484
546
|
|
|
485
547
|
if (inputStream) {
|
|
@@ -496,7 +558,8 @@ Worker.prototype.stream = async function (
|
|
|
496
558
|
} else {
|
|
497
559
|
// debug(`Not prepending filename:${filename}`);
|
|
498
560
|
}
|
|
499
|
-
let encoding;
|
|
561
|
+
let encoding;
|
|
562
|
+
let stream;
|
|
500
563
|
if (filename.slice(-8) === '.parquet') {
|
|
501
564
|
const pq = new ParquetWorker(this);
|
|
502
565
|
stream = (await pq.stream({ filename, columns, limit })).stream;
|
|
@@ -541,9 +604,8 @@ Worker.prototype.sample = async function (opts) {
|
|
|
541
604
|
};
|
|
542
605
|
Worker.prototype.sample.metadata = {
|
|
543
606
|
options: {
|
|
544
|
-
filename: {}
|
|
545
|
-
|
|
546
|
-
},
|
|
607
|
+
filename: {}
|
|
608
|
+
}
|
|
547
609
|
};
|
|
548
610
|
Worker.prototype.toArray = async function (opts) {
|
|
549
611
|
const { stream } = await this.fileToObjectStream(opts);
|
|
@@ -551,8 +613,8 @@ Worker.prototype.toArray = async function (opts) {
|
|
|
551
613
|
};
|
|
552
614
|
Worker.prototype.toArray.metadata = {
|
|
553
615
|
options: {
|
|
554
|
-
filename: {}
|
|
555
|
-
}
|
|
616
|
+
filename: {}
|
|
617
|
+
}
|
|
556
618
|
};
|
|
557
619
|
|
|
558
620
|
Worker.prototype.write = async function (opts) {
|
|
@@ -566,7 +628,7 @@ Worker.prototype.write = async function (opts) {
|
|
|
566
628
|
await worker.write({
|
|
567
629
|
directory,
|
|
568
630
|
file,
|
|
569
|
-
content
|
|
631
|
+
content
|
|
570
632
|
});
|
|
571
633
|
} else {
|
|
572
634
|
await fsp.writeFile(filename, content);
|
|
@@ -576,15 +638,14 @@ Worker.prototype.write = async function (opts) {
|
|
|
576
638
|
Worker.prototype.write.metadata = {
|
|
577
639
|
options: {
|
|
578
640
|
filename: { description: 'Location to write content to, can be local or s3:// or r2://' },
|
|
579
|
-
content: {}
|
|
580
|
-
}
|
|
641
|
+
content: {}
|
|
642
|
+
}
|
|
581
643
|
};
|
|
582
644
|
|
|
583
645
|
async function streamToString(stream) {
|
|
584
646
|
// lets have a ReadableStream as a stream variable
|
|
585
647
|
const chunks = [];
|
|
586
648
|
|
|
587
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
588
649
|
for await (const chunk of stream) {
|
|
589
650
|
chunks.push(Buffer.from(chunk));
|
|
590
651
|
}
|
|
@@ -606,47 +667,46 @@ Worker.prototype.json = async function (opts) {
|
|
|
606
667
|
};
|
|
607
668
|
Worker.prototype.json.metadata = {
|
|
608
669
|
options: {
|
|
609
|
-
filename: { description: 'Get a javascript object from a file' }
|
|
610
|
-
}
|
|
670
|
+
filename: { description: 'Get a javascript object from a file' }
|
|
671
|
+
}
|
|
611
672
|
};
|
|
612
673
|
|
|
613
|
-
Worker.prototype.list = async function ({ directory, start:s, end:e }) {
|
|
674
|
+
Worker.prototype.list = async function ({ directory, start: s, end: e }) {
|
|
614
675
|
if (!directory) throw new Error('directory is required');
|
|
615
|
-
let start=null;
|
|
616
|
-
let end=null;
|
|
617
|
-
if (s) start=relativeDate(s);
|
|
618
|
-
if (e) end=relativeDate(e);
|
|
619
|
-
|
|
676
|
+
let start = null;
|
|
677
|
+
let end = null;
|
|
678
|
+
if (s) start = relativeDate(s);
|
|
679
|
+
if (e) end = relativeDate(e);
|
|
680
|
+
|
|
620
681
|
if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
|
|
621
682
|
const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
|
|
622
683
|
return worker.list({ directory, start, end });
|
|
623
684
|
}
|
|
624
685
|
const a = await fsp.readdir(directory, { withFileTypes: true });
|
|
625
686
|
|
|
626
|
-
const withModified=[];
|
|
687
|
+
const withModified = [];
|
|
627
688
|
for (const file of a) {
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
689
|
+
const fullPath = path.join(directory, file.name);
|
|
690
|
+
const stats = await fsp.stat(fullPath);
|
|
691
|
+
if (start && stats.mtime < start.getTime()) {
|
|
692
|
+
//do not include
|
|
693
|
+
} else if (end && stats.mtime > end.getTime()) {
|
|
694
|
+
//do nothing
|
|
695
|
+
} else {
|
|
696
|
+
withModified.push({
|
|
697
|
+
name: file.name,
|
|
698
|
+
type: file.isDirectory() ? 'directory' : 'file',
|
|
699
|
+
modifiedAt: new Date(stats.mtime).toISOString()
|
|
700
|
+
});
|
|
701
|
+
}
|
|
641
702
|
}
|
|
642
|
-
|
|
703
|
+
|
|
643
704
|
return withModified;
|
|
644
|
-
|
|
645
705
|
};
|
|
646
706
|
Worker.prototype.list.metadata = {
|
|
647
707
|
options: {
|
|
648
|
-
directory: { required: true }
|
|
649
|
-
}
|
|
708
|
+
directory: { required: true }
|
|
709
|
+
}
|
|
650
710
|
};
|
|
651
711
|
|
|
652
712
|
Worker.prototype.listAll = async function ({ directory }) {
|
|
@@ -661,8 +721,8 @@ Worker.prototype.listAll = async function ({ directory }) {
|
|
|
661
721
|
};
|
|
662
722
|
Worker.prototype.listAll.metadata = {
|
|
663
723
|
options: {
|
|
664
|
-
directory: { required: true }
|
|
665
|
-
}
|
|
724
|
+
directory: { required: true }
|
|
725
|
+
}
|
|
666
726
|
};
|
|
667
727
|
|
|
668
728
|
Worker.prototype.empty = async function ({ directory }) {
|
|
@@ -672,7 +732,7 @@ Worker.prototype.empty = async function ({ directory }) {
|
|
|
672
732
|
throw new Error('Cannot empty an s3:// or r2:// directory');
|
|
673
733
|
}
|
|
674
734
|
const removed = [];
|
|
675
|
-
|
|
735
|
+
|
|
676
736
|
for (const file of await fsp.readdir(directory)) {
|
|
677
737
|
removed.push(file);
|
|
678
738
|
await fsp.unlink(path.join(directory, file));
|
|
@@ -681,8 +741,8 @@ Worker.prototype.empty = async function ({ directory }) {
|
|
|
681
741
|
};
|
|
682
742
|
Worker.prototype.empty.metadata = {
|
|
683
743
|
options: {
|
|
684
|
-
directory: { required: true }
|
|
685
|
-
}
|
|
744
|
+
directory: { required: true }
|
|
745
|
+
}
|
|
686
746
|
};
|
|
687
747
|
|
|
688
748
|
Worker.prototype.remove = async function ({ filename }) {
|
|
@@ -705,16 +765,18 @@ Worker.prototype.remove = async function ({ filename }) {
|
|
|
705
765
|
};
|
|
706
766
|
Worker.prototype.remove.metadata = {
|
|
707
767
|
options: {
|
|
708
|
-
filename: {}
|
|
709
|
-
}
|
|
768
|
+
filename: {}
|
|
769
|
+
}
|
|
710
770
|
};
|
|
711
771
|
|
|
712
772
|
Worker.prototype.move = async function ({ filename, target }) {
|
|
713
773
|
if (!target) throw new Error('target is required');
|
|
714
774
|
if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
|
|
715
775
|
if (target.startsWith('s3://') || target.startsWith('r2://')) {
|
|
716
|
-
if (
|
|
717
|
-
|
|
776
|
+
if (
|
|
777
|
+
(target.startsWith('s3://') && filename.startsWith('r2://')) ||
|
|
778
|
+
(target.startsWith('r2://') && filename.startsWith('s3://'))
|
|
779
|
+
) {
|
|
718
780
|
throw new Error('Cowardly not copying between services');
|
|
719
781
|
}
|
|
720
782
|
|
|
@@ -741,8 +803,8 @@ Worker.prototype.move = async function ({ filename, target }) {
|
|
|
741
803
|
Worker.prototype.move.metadata = {
|
|
742
804
|
options: {
|
|
743
805
|
filename: {},
|
|
744
|
-
target: {}
|
|
745
|
-
}
|
|
806
|
+
target: {}
|
|
807
|
+
}
|
|
746
808
|
};
|
|
747
809
|
|
|
748
810
|
Worker.prototype.stat = async function ({ filename }) {
|
|
@@ -751,11 +813,7 @@ Worker.prototype.stat = async function ({ filename }) {
|
|
|
751
813
|
const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
|
|
752
814
|
return worker.stat({ filename });
|
|
753
815
|
}
|
|
754
|
-
const {
|
|
755
|
-
ctime,
|
|
756
|
-
birthtime,
|
|
757
|
-
size,
|
|
758
|
-
} = await fsp.stat(filename);
|
|
816
|
+
const { ctime, birthtime, size } = await fsp.stat(filename);
|
|
759
817
|
const modifiedAt = new Date(ctime);
|
|
760
818
|
let createdAt = birthtime;
|
|
761
819
|
if (createdAt === 0 || !createdAt) createdAt = ctime;
|
|
@@ -763,13 +821,13 @@ Worker.prototype.stat = async function ({ filename }) {
|
|
|
763
821
|
return {
|
|
764
822
|
createdAt,
|
|
765
823
|
modifiedAt,
|
|
766
|
-
size
|
|
824
|
+
size
|
|
767
825
|
};
|
|
768
826
|
};
|
|
769
827
|
Worker.prototype.stat.metadata = {
|
|
770
828
|
options: {
|
|
771
|
-
filename: {}
|
|
772
|
-
}
|
|
829
|
+
filename: {}
|
|
830
|
+
}
|
|
773
831
|
};
|
|
774
832
|
|
|
775
833
|
Worker.prototype.download = async function ({ filename }) {
|
|
@@ -782,8 +840,8 @@ Worker.prototype.download = async function ({ filename }) {
|
|
|
782
840
|
};
|
|
783
841
|
Worker.prototype.download.metadata = {
|
|
784
842
|
options: {
|
|
785
|
-
filename: {}
|
|
786
|
-
}
|
|
843
|
+
filename: {}
|
|
844
|
+
}
|
|
787
845
|
};
|
|
788
846
|
|
|
789
847
|
Worker.prototype.head = async function (options) {
|
|
@@ -792,7 +850,7 @@ Worker.prototype.head = async function (options) {
|
|
|
792
850
|
const chunks = [];
|
|
793
851
|
|
|
794
852
|
let counter = 0;
|
|
795
|
-
|
|
853
|
+
|
|
796
854
|
for await (const chunk of stream) {
|
|
797
855
|
chunks.push(chunk);
|
|
798
856
|
counter += 1;
|
|
@@ -804,8 +862,8 @@ Worker.prototype.head = async function (options) {
|
|
|
804
862
|
|
|
805
863
|
Worker.prototype.head.metadata = {
|
|
806
864
|
options: {
|
|
807
|
-
filename: { required: true }
|
|
808
|
-
}
|
|
865
|
+
filename: { required: true }
|
|
866
|
+
}
|
|
809
867
|
};
|
|
810
868
|
|
|
811
869
|
Worker.prototype.count = async function (options) {
|
|
@@ -814,7 +872,7 @@ Worker.prototype.count = async function (options) {
|
|
|
814
872
|
|
|
815
873
|
const limit = options.limit || 5;
|
|
816
874
|
let records = 0;
|
|
817
|
-
|
|
875
|
+
|
|
818
876
|
for await (const chunk of stream) {
|
|
819
877
|
records += 1;
|
|
820
878
|
if (records < limit) {
|
|
@@ -827,8 +885,8 @@ Worker.prototype.count = async function (options) {
|
|
|
827
885
|
|
|
828
886
|
Worker.prototype.count.metadata = {
|
|
829
887
|
options: {
|
|
830
|
-
filename: { required: true }
|
|
831
|
-
}
|
|
888
|
+
filename: { required: true }
|
|
889
|
+
}
|
|
832
890
|
};
|
|
833
891
|
|
|
834
892
|
// Get a set of unique entries from a uniqueFunction
|
|
@@ -839,10 +897,10 @@ Worker.prototype.getUniqueSet = async function (options) {
|
|
|
839
897
|
|
|
840
898
|
let { uniqueFunction } = options;
|
|
841
899
|
if (!uniqueFunction) {
|
|
842
|
-
uniqueFunction = (
|
|
900
|
+
uniqueFunction = (o) => JSON.stringify(o);
|
|
843
901
|
}
|
|
844
902
|
const uniqueSet = new Set();
|
|
845
|
-
|
|
903
|
+
|
|
846
904
|
for (const filename of existingFiles) {
|
|
847
905
|
const { stream: existsStream } = await this.fileToObjectStream({ filename });
|
|
848
906
|
await pipeline(
|
|
@@ -856,14 +914,14 @@ Worker.prototype.getUniqueSet = async function (options) {
|
|
|
856
914
|
}
|
|
857
915
|
uniqueSet.add(v);
|
|
858
916
|
cb(null, d);
|
|
859
|
-
}
|
|
917
|
+
}
|
|
860
918
|
}),
|
|
861
919
|
new Writable({
|
|
862
920
|
objectMode: true,
|
|
863
921
|
write(d, enc, cb) {
|
|
864
922
|
cb();
|
|
865
|
-
}
|
|
866
|
-
})
|
|
923
|
+
}
|
|
924
|
+
})
|
|
867
925
|
);
|
|
868
926
|
debug(`Finished loading ${filename}`);
|
|
869
927
|
}
|
|
@@ -875,7 +933,7 @@ Worker.prototype.getUniqueStream = async function (options) {
|
|
|
875
933
|
|
|
876
934
|
const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
|
|
877
935
|
filenames: options.existingFiles,
|
|
878
|
-
uniqueFunction: options.uniqueFunction
|
|
936
|
+
uniqueFunction: options.uniqueFunction
|
|
879
937
|
});
|
|
880
938
|
|
|
881
939
|
const { stream: inStream } = await this.fileToObjectStream(options);
|
|
@@ -899,8 +957,8 @@ Worker.prototype.getUniqueStream = async function (options) {
|
|
|
899
957
|
}
|
|
900
958
|
cb(null, d);
|
|
901
959
|
}
|
|
902
|
-
}
|
|
903
|
-
})
|
|
960
|
+
}
|
|
961
|
+
})
|
|
904
962
|
);
|
|
905
963
|
return { stream: uniqueStream, sample };
|
|
906
964
|
};
|
|
@@ -912,9 +970,9 @@ Worker.prototype.getUniqueStream.metadata = {
|
|
|
912
970
|
filename: { description: 'Specify a source filename or a stream' },
|
|
913
971
|
stream: { description: 'Specify a source filename or a stream' },
|
|
914
972
|
includeDuplicateSourceRecords: {
|
|
915
|
-
description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
|
|
916
|
-
}
|
|
917
|
-
}
|
|
973
|
+
description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
|
|
974
|
+
}
|
|
975
|
+
}
|
|
918
976
|
};
|
|
919
977
|
Worker.prototype.getUniqueFile = async function (options) {
|
|
920
978
|
const { stream, sample } = await this.getUniqueStream(options);
|
|
@@ -929,9 +987,9 @@ Worker.prototype.getUniqueFile.metadata = {
|
|
|
929
987
|
filename: { description: 'Specify a source filename or a stream' },
|
|
930
988
|
stream: { description: 'Specify a source filename or a stream' },
|
|
931
989
|
includeDuplicateSourceRecords: {
|
|
932
|
-
description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
|
|
933
|
-
}
|
|
934
|
-
}
|
|
990
|
+
description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
|
|
991
|
+
}
|
|
992
|
+
}
|
|
935
993
|
};
|
|
936
994
|
|
|
937
995
|
/*
|
|
@@ -940,7 +998,11 @@ Requires 2 passes of the files,
|
|
|
940
998
|
but that's a better tradeoff than trying to store huge files in memory
|
|
941
999
|
*/
|
|
942
1000
|
Worker.prototype.diff = async function ({
|
|
943
|
-
fileA,
|
|
1001
|
+
fileA,
|
|
1002
|
+
fileB,
|
|
1003
|
+
uniqueFunction: ufOpt,
|
|
1004
|
+
fields,
|
|
1005
|
+
includeDuplicateSourceRecords
|
|
944
1006
|
}) {
|
|
945
1007
|
if (ufOpt && fields) throw new Error('fields and uniqueFunction cannot both be specified');
|
|
946
1008
|
let uniqueFunction = ufOpt;
|
|
@@ -953,17 +1015,18 @@ Worker.prototype.diff = async function ({
|
|
|
953
1015
|
existingFiles: [fileB],
|
|
954
1016
|
filename: fileA,
|
|
955
1017
|
uniqueFunction,
|
|
956
|
-
includeDuplicateSourceRecords
|
|
1018
|
+
includeDuplicateSourceRecords
|
|
957
1019
|
});
|
|
958
1020
|
const right = await this.getUniqueFile({
|
|
959
1021
|
existingFiles: [fileA],
|
|
960
1022
|
filename: fileB,
|
|
961
1023
|
uniqueFunction,
|
|
962
|
-
includeDuplicateSourceRecords
|
|
1024
|
+
includeDuplicateSourceRecords
|
|
963
1025
|
});
|
|
964
1026
|
|
|
965
1027
|
return {
|
|
966
|
-
left,
|
|
1028
|
+
left,
|
|
1029
|
+
right
|
|
967
1030
|
};
|
|
968
1031
|
};
|
|
969
1032
|
Worker.prototype.diff.metadata = {
|
|
@@ -973,9 +1036,9 @@ Worker.prototype.diff.metadata = {
|
|
|
973
1036
|
fields: { description: 'Fields to use for uniqueness -- aka primary key. Defaults to JSON of line' },
|
|
974
1037
|
uniqueFunction: {},
|
|
975
1038
|
includeDuplicateSourceRecords: {
|
|
976
|
-
description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
|
|
977
|
-
}
|
|
978
|
-
}
|
|
1039
|
+
description: 'Sometimes you want the output to include source dupes, sometimes not, default false'
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
979
1042
|
};
|
|
980
1043
|
|
|
981
1044
|
module.exports = Worker;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@engine9-io/input-tools",
|
|
3
|
-
"version": "1.7.9",
|
|
3
|
+
"version": "1.8.0",
|
|
4
4
|
"description": "Tools for dealing with Engine9 inputs",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"scripts": {
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
"throttle-debounce": "^5.0.2",
|
|
31
31
|
"unzipper": "^0.12.1",
|
|
32
32
|
"uuid": "^11.1.0",
|
|
33
|
+
"xlstream": "^2.5.5",
|
|
33
34
|
"yargs": "^17.7.2"
|
|
34
35
|
},
|
|
35
36
|
"directories": {
|