@engine9-io/input-tools 1.9.11 → 2.0.1
This diff shows the changes between publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.
- package/ForEachEntry.js +18 -45
- package/ValidatingReadable.js +3 -6
- package/buildSamplePackets.js +11 -16
- package/eslint.config.mjs +15 -11
- package/file/FileUtilities.js +29 -153
- package/file/GoogleDrive.js +32 -38
- package/file/Parquet.js +112 -124
- package/file/R2.js +27 -32
- package/file/S3.js +259 -293
- package/file/tools.js +33 -54
- package/index.js +59 -74
- package/package.json +2 -1
- package/test/cli.js +3 -4
- package/test/file.js +6 -7
- package/test/processing/bigDataMessage.js +8 -10
- package/test/processing/forEach.js +6 -8
- package/test/processing/forEachResume.js +6 -8
- package/test/processing/message.js +31 -39
- package/test/processing/zip.js +6 -7
- package/test/uuid.js +6 -11
- package/timelineTypes.js +2 -24
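The headline change in 2.x is a CommonJS-to-ESM migration of every module listed above: `require()` calls become `import` statements, relative requires gain explicit `.js` extensions, and `module.exports` becomes `export default`. A minimal sketch of the translation pattern, using names taken from the FileUtilities.js diff below (the `debug$0` alias is how the migration binds a default import before invoking it with arguments):

  // Before (1.9.11, CommonJS):
  //   const { pipeline } = require('node:stream/promises');
  //   const debug = require('debug')('@engine9-io/file');
  //   const S3Worker = require('./S3');

  // After (2.0.1, ESM) -- a default import is bound first, then
  // destructured or invoked on its own line:
  import promises from 'node:stream/promises';
  import debug$0 from 'debug';
  import S3Worker from './S3.js'; // relative specifiers need explicit extensions under ESM

  const { pipeline } = promises;
  const debug = debug$0('@engine9-io/file');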
package/file/FileUtilities.js
CHANGED
@@ -1,43 +1,32 @@
-
-
+import fs from 'node:fs';
+import path from 'node:path';
+import zlib from 'node:zlib';
+import nodestream from 'node:stream';
+import promises from 'node:stream/promises';
+import { parse, stringify } from 'csv';
+import debug$0 from 'debug';
+import xlstream from 'xlstream';
+import JSON5 from 'json5';
+import languageEncoding from 'detect-file-encoding-and-language';
+import R2Worker from './R2.js';
+import S3Worker from './S3.js';
+import ParquetWorker from './Parquet.js';
+import { bool, getTempFilename, getStringArray, getTempDir, makeStrings, streamPacket, relativeDate } from './tools.js';
 const fsp = fs.promises;
-const
-const
-const { Readable, Transform, PassThrough, Writable } = require('node:stream');
-const { pipeline } = require('node:stream/promises');
-const { stringify } = require('csv');
-
-const debug = require('debug')('@engine9-io/file');
-
-const { getXlsxStream } = require('xlstream');
-const csv = require('csv');
-const JSON5 = require('json5');
-
-const languageEncoding = require('detect-file-encoding-and-language');
-const R2Worker = require('./R2');
-const S3Worker = require('./S3');
-const ParquetWorker = require('./Parquet');
+const { Readable, Transform, PassThrough, Writable } = nodestream;
+const { pipeline } = promises;
 
-const
-
-  getTempFilename,
-  getStringArray,
-  getTempDir,
-  makeStrings,
-  streamPacket,
-  relativeDate
-} = require('./tools');
+const debug = debug$0('@engine9-io/file');
+const { getXlsxStream } = xlstream;
 
 function Worker({ accountId }) {
   this.accountId = accountId;
 }
-
 class LineReaderTransform extends Transform {
   constructor(options = {}) {
     super({ ...options, readableObjectMode: true });
     this.buffer = '';
   }
-
   _transform(chunk, encoding, callback) {
     this.buffer += chunk.toString();
     const lines = this.buffer.split(/\r?\n/);
@@ -45,7 +34,6 @@ class LineReaderTransform extends Transform {
     lines.forEach((line) => this.push(line));
     callback();
   }
-
   _flush(callback) {
     if (this.buffer) {
       this.push(this.buffer);
@@ -53,11 +41,9 @@ class LineReaderTransform extends Transform {
     callback();
   }
 }
-
 Worker.prototype.csvToObjectTransforms = function (options) {
   const transforms = [];
   const delimiter = options.delimiter || ',';
-
   const headerMapping =
     options.headerMapping ||
     function (d) {
@@ -65,7 +51,6 @@ Worker.prototype.csvToObjectTransforms = function (options) {
     };
   let lastLine = null;
   let head = null;
-
   const skipLinesWithError = bool(options.skip_lines_with_error, false);
   const parserOptions = {
     relax: true,
@@ -82,27 +67,23 @@ Worker.prototype.csvToObjectTransforms = function (options) {
   if (options.limit) {
     parserOptions.to = options.limit;
   }
-
   debug('Parser options=', parserOptions);
-  const parser =
+  const parser = parse(parserOptions);
   parser.on('error', (error) => {
     debug('fileToObjectStream: Error parsing csv file');
     debug(lastLine);
     throw new Error(error);
   });
-
   const blankAndHeaderCheck = new Transform({
     objectMode: true,
     transform(row, enc, cb) {
       // Blank rows
       if (row.length === 0) return cb();
       if (row.length === 1 && !row[0]) return cb();
-
       if (!head) {
         head = row.map(headerMapping);
         return cb();
       }
-
       const o = {};
       head.forEach((_h, i) => {
         const h = _h.trim();
@@ -110,18 +91,14 @@ Worker.prototype.csvToObjectTransforms = function (options) {
           o[h] = row[i];
         }
       });
-
       lastLine = row.join(delimiter);
       return cb(null, o);
     }
   });
-
   transforms.push(parser);
   transforms.push(blankAndHeaderCheck);
-
   return { transforms };
 };
-
 Worker.prototype.detectEncoding = async function (options) {
   if (options.encoding_override) return { encoding: options.encoding_override };
   // Limit to only the top N bytes -- for perfomance
@@ -154,19 +131,15 @@ Worker.prototype.detectEncoding = async function (options) {
       decompressStream.end();
     });
   }
-
   return languageEncoding(finalBuff);
 };
-
 Worker.prototype.detectEncoding.metadata = {
   options: {
     filename: { required: true }
   }
 };
-
 Worker.prototype.xlsxToObjectStream = async function (options) {
   let { filename } = options;
-
   if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
     // We need to copy and delete
     let worker = null;
@@ -176,7 +149,6 @@ Worker.prototype.xlsxToObjectStream = async function (options) {
       worker = new S3Worker(this);
     }
     const target = getTempFilename({ targetFilename: filename.split('/').pop() });
-
     await worker.copy({ filename, target });
     filename = target;
   }
@@ -202,27 +174,22 @@ Worker.prototype.xlsxToObjectStream = async function (options) {
       }
     })
   );
-
   return { stream };
 };
-
 Worker.prototype.getFormat = async function (options) {
   const { sourcePostfix, filename, format: formatOverride } = options;
   let postfix = sourcePostfix || filename.toLowerCase().split('.').pop();
-
   if (postfix === 'gz') {
     postfix = filename.toLowerCase().split('.');
     postfix = postfix[postfix.length - 2];
   }
   return formatOverride || postfix;
 };
-
 /*
   Commonly used method to transform a file into a stream of objects.
 */
 Worker.prototype.fileToObjectStream = async function (options) {
   const { filename, columns, limit: limitOption, format: formatOverride } = options;
-
   // handle stream item
   if (options.stream) {
     if (Array.isArray(options.stream)) {
@@ -243,7 +210,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
     debug('Invalid filename:', { filename });
     throw new Error('Cowardly refusing to turn a .zip file into an object stream, turn into a csv first');
   }
-
   const streamInfo = await this.stream({
     filename,
     columns,
@@ -256,11 +222,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
     // already an object
     return { stream };
   }
-
   let count = 0;
-
   let transforms = [];
-
   if (postfix === 'gz') {
     const gunzip = zlib.createGunzip();
     transforms.push(gunzip);
@@ -273,9 +236,7 @@ Worker.prototype.fileToObjectStream = async function (options) {
     stream.setEncoding(encoding);
   }
   let format = formatOverride || postfix;
-
   debug(`Reading file ${filename} with encoding: ${encoding} and format ${format}`);
-
   if (format === 'csv') {
     const csvTransforms = this.csvToObjectTransforms({ ...options });
     transforms = transforms.concat(csvTransforms.transforms);
@@ -284,12 +245,10 @@ Worker.prototype.fileToObjectStream = async function (options) {
     transforms = transforms.concat(csvTransforms.transforms);
   } else if (format === 'jsonl') {
     /* Type of JSON that has the names in an array in the first record,
-
-
+      and the values in JSON arrays thereafter
+    */
     let headers = null;
-
     const lineReader = new LineReaderTransform();
-
     const jsonlTransform = new Transform({
       objectMode: true,
       transform(d, enc, cb) {
@@ -303,8 +262,8 @@ Worker.prototype.fileToObjectStream = async function (options) {
           throw e;
         }
        /* JSONL could potentially start with an array of names,
-
-
+          in which case we need to map the subsequent values
+        */
        if (headers === null) {
          if (Array.isArray(obj)) {
            headers = obj;
@@ -324,7 +283,6 @@ Worker.prototype.fileToObjectStream = async function (options) {
        return cb();
      }
    });
-
    transforms.push(lineReader);
    transforms.push(jsonlTransform);
  } else {
@@ -348,20 +306,18 @@ Worker.prototype.fileToObjectStream = async function (options) {
      // Don't push dummy records anymore -- legacy cruft
      debug(`Completed reading file, records=${count}`);
      /* if (count === 0) {
-
-
-
-
-
+        const o = { _is_placeholder: true };
+
+        if (head) head.forEach((c) => { o[c] = null; });
+        this.push(o);
+      } */
      cb();
    }
  });
-
  transforms.push(countAndDebug);
  transforms.forEach((t) => {
    stream = stream.pipe(t);
  });
-
  return { stream };
};
Worker.prototype.getFileWriterStream = async function (options = {}) {
@@ -380,13 +336,10 @@ Worker.prototype.getFileWriterStream = async function (options = {}) {
  if (bool(options.gzip, false)) filename += '.gz';
  const stream = fs.createWriteStream(filename);
  debug('FileWriterStream writing to file ', filename);
-
  return { filename, stream };
};
-
Worker.prototype.getOutputStreams = async function (options) {
  const { filename, stream: fileWriterStream } = await this.getFileWriterStream(options);
-
  let { transform } = options;
  if (typeof options.transform === 'function') {
    if (options.transform.length === 3) {
@@ -409,7 +362,6 @@ Worker.prototype.getOutputStreams = async function (options) {
  }
  const { flatten } = options;
  let flattenTransform = null;
-
  if (bool(flatten, false)) {
    flattenTransform = new Transform({
      objectMode: true,
@@ -431,7 +383,6 @@ Worker.prototype.getOutputStreams = async function (options) {
      }
    });
  }
-
  const stats = {
    records: 0
  };
@@ -473,23 +424,17 @@ Worker.prototype.objectStreamToFile = async function (options) {
  await pipeline(streams);
  return { filename, records: stats.records };
};
-
Worker.prototype.transform = async function (options) {
  const worker = this;
-
  const { filename } = options;
-
  debug(`Transforming ${filename}`);
-
  options.filename = filename;
  let { stream } = await worker.fileToObjectStream(options);
  if (typeof stream.pipe !== 'function') {
    debug(stream);
    throw new Error('No pipe in stream');
  }
-
  let t = options.transform;
-
  // No longer need this
  delete options.transform;
  if (!t) {
@@ -498,7 +443,6 @@ Worker.prototype.transform = async function (options) {
      cb(null, d);
    };
  }
-
  if (!Array.isArray(t)) t = [t];
  Object.keys(t).forEach((key) => {
    let f = t[key];
@@ -508,22 +452,17 @@ Worker.prototype.transform = async function (options) {
        transform: f
      });
    }
-
    stream = stream.pipe(f);
  });
-
  const { targetFormat } = options;
-
  if (
    !targetFormat &&
    (filename.toLowerCase().slice(-4) === '.csv' || filename.toLowerCase().slice(-7) === '.csv.gz')
  ) {
    options.targetFormat = 'csv';
  }
-
  return worker.objectStreamToFile({ ...options, stream });
};
-
Worker.prototype.transform.metadata = {
  options: {
    sourcePostfix: { description: "Override the source postfix, if for example it's a csv" },
@@ -553,12 +492,10 @@ Worker.prototype.testTransform.metadata = {
    filename: true
  }
};
-
/* Get a stream from an actual stream, or an array, or a file */
Worker.prototype.stream = async function (options) {
  const { stream: inputStream, packet, type, columns, limit, filename: filenameOpt } = options;
  let filename = filenameOpt;
-
  if (inputStream) {
    if (Array.isArray(inputStream)) {
      return { stream: Readable.from(inputStream) };
@@ -611,7 +548,6 @@ Worker.prototype.stream = async function (options) {
    throw new Error('stream must be passed a stream, filename, or packet');
  }
};
-
Worker.prototype.sample = async function (opts) {
  opts.limit = opts.limit || 10;
  const { stream } = await this.fileToObjectStream(opts);
@@ -631,7 +567,6 @@ Worker.prototype.toArray.metadata = {
    filename: {}
  }
};
-
Worker.prototype.write = async function (opts) {
  const { filename, content } = opts;
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
@@ -658,15 +593,12 @@ Worker.prototype.write.metadata = {
    content: {}
  }
};
-
async function streamToString(stream) {
  // lets have a ReadableStream as a stream variable
  const chunks = [];
-
  for await (const chunk of stream) {
    chunks.push(Buffer.from(chunk));
  }
-
  return Buffer.concat(chunks).toString('utf-8');
}
/*
@@ -687,20 +619,17 @@ Worker.prototype.json.metadata = {
    filename: { description: 'Get a javascript object from a file' }
  }
};
-
Worker.prototype.list = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
  let start = null;
  let end = null;
  if (s) start = relativeDate(s);
  if (e) end = relativeDate(e);
-
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
    const worker = new (directory.startsWith('r2://') ? R2Worker : S3Worker)(this);
    return worker.list({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { withFileTypes: true });
-
  const withModified = [];
  for (const file of a) {
    const fullPath = path.join(directory, file.name);
@@ -717,7 +646,6 @@ Worker.prototype.list = async function ({ directory, start: s, end: e }) {
      });
    }
  }
-
  return withModified;
};
Worker.prototype.list.metadata = {
@@ -725,7 +653,6 @@ Worker.prototype.list.metadata = {
    directory: { required: true }
  }
};
-
Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
  if (!directory) throw new Error('directory is required');
  let start = null;
@@ -737,16 +664,13 @@ Worker.prototype.listAll = async function ({ directory, start: s, end: e }) {
    return worker.listAll({ directory, start, end });
  }
  const a = await fsp.readdir(directory, { recursive: true });
-
  let files = a.map((f) => `${directory}/${f}`);
  if (!start && !end) {
    return files;
  }
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
  const filesWithinLimit = [];
-
  await Promise.all(
    files.map((filename) =>
      limitedMethod(async () => {
@@ -774,7 +698,6 @@ Worker.prototype.listAll.metadata = {
    end: {}
  }
};
-
Worker.prototype.moveAll = async function (options) {
  const { directory, targetDirectory } = options;
  if (!directory) throw new Error('directory is required');
@@ -783,7 +706,6 @@ Worker.prototype.moveAll = async function (options) {
    return worker.moveAll(options);
  }
  const a = await this.listAll(options);
-
  let configs = a.map((f) => {
    let filename = typeof f === 'string' ? f : f.filename;
    return {
@@ -792,9 +714,7 @@ Worker.prototype.moveAll = async function (options) {
    };
  });
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
-
  return Promise.all(configs.map(({ filename, target }) => limitedMethod(async () => this.move({ filename, target }))));
};
Worker.prototype.moveAll.metadata = {
@@ -803,7 +723,6 @@ Worker.prototype.moveAll.metadata = {
    targetDirectory: { required: true }
  }
};
-
Worker.prototype.empty = async function ({ directory }) {
  if (!directory) throw new Error('directory is required');
  if (directory.startsWith('s3://') || directory.startsWith('r2://')) {
@@ -811,7 +730,6 @@ Worker.prototype.empty = async function ({ directory }) {
    throw new Error('Cannot empty an s3:// or r2:// directory');
  }
  const removed = [];
-
  for (const file of await fsp.readdir(directory)) {
    removed.push(file);
    await fsp.unlink(path.join(directory, file));
@@ -823,14 +741,10 @@ Worker.prototype.empty.metadata = {
    directory: { required: true }
  }
};
-
Worker.prototype.removeAll = async function (options) {
  const filenames = await this.listAll(options);
-
  const pLimit = await import('p-limit');
-
  const limitedMethod = pLimit.default(10);
-
  return Promise.all(filenames.map((filename) => limitedMethod(async () => this.remove({ filename }))));
};
Worker.prototype.removeAll.metadata = {
@@ -840,7 +754,6 @@ Worker.prototype.removeAll.metadata = {
    end: {}
  }
};
-
Worker.prototype.remove = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  if (typeof filename !== 'string') throw new Error(`filename isn't a string:${JSON.stringify(filename)}`);
@@ -851,12 +764,10 @@ Worker.prototype.remove = async function ({ filename }) {
    } else {
      worker = new S3Worker(this);
    }
-
    await worker.remove({ filename });
  } else {
    fsp.unlink(filename);
  }
-
  return { removed: filename };
};
Worker.prototype.remove.metadata = {
@@ -864,7 +775,6 @@ Worker.prototype.remove.metadata = {
    filename: {}
  }
};
-
Worker.prototype.move = async function ({ filename, target, remove = true }) {
  if (!target) throw new Error('target is required');
  if (typeof target !== 'string') throw new Error(`target isn't a string:${JSON.stringify(target)}`);
@@ -875,14 +785,12 @@ Worker.prototype.move = async function ({ filename, target, remove = true }) {
  ) {
    throw new Error('Cowardly not copying between services');
  }
-
  let worker = null;
  if (target.startsWith('r2://')) {
    worker = new R2Worker(this);
  } else {
    worker = new S3Worker(this);
  }
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
    // We need to copy and delete
    const output = await worker.copy({ filename, target });
@@ -913,7 +821,6 @@ Worker.prototype.move.metadata = {
    target: {}
  }
};
-
Worker.prototype.copy = async function (opts) {
  return this.move({ ...opts, remove: false });
};
@@ -923,17 +830,14 @@ Worker.prototype.copy.metadata = {
    target: {}
  }
};
-
Worker.prototype.stat = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  const output = {};
-
  if (filename.slice(-8) === '.parquet') {
    const pq = new ParquetWorker(this);
    output.schema = (await pq.schema({ filename }))?.schema;
    output.records = (await pq.meta({ filename }))?.records;
  }
-
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
    const worker = new (filename.startsWith('r2://') ? R2Worker : S3Worker)(this);
    Object.assign(output, await worker.stat({ filename }));
@@ -956,7 +860,6 @@ Worker.prototype.stat.metadata = {
    filename: {}
  }
};
-
Worker.prototype.download = async function ({ filename }) {
  if (!filename) throw new Error('filename is required');
  if (filename.startsWith('s3://') || filename.startsWith('r2://')) {
@@ -970,29 +873,23 @@ Worker.prototype.download.metadata = {
    filename: {}
  }
};
-
Worker.prototype.head = async function (options) {
  const limit = options.limit || 3;
  const { stream } = await this.fileToObjectStream({ ...options, limit });
  const chunks = [];
-
  let counter = 0;
-
  for await (const chunk of stream) {
    chunks.push(chunk);
    counter += 1;
    if (counter >= limit) break;
  }
-
  return chunks;
};
-
Worker.prototype.head.metadata = {
  options: {
    filename: { required: true }
  }
};
-
Worker.prototype.columns = async function (options) {
  const head = await this.head(options);
  if (head.length == 0) {
@@ -1002,7 +899,6 @@ Worker.prototype.columns = async function (options) {
      columns: []
    };
  }
-
  let likelyHeaderLines = 1;
  const columns = Object.keys(head[0]);
  let s = columns.join(',');
@@ -1014,48 +910,39 @@ Worker.prototype.columns = async function (options) {
    columns
  };
};
-
Worker.prototype.columns.metadata = {
  options: {
    filename: { required: true }
  }
};
-
Worker.prototype.count = async function (options) {
  const { stream } = await this.fileToObjectStream(options);
  const sample = [];
-
  const limit = options.limit || 5;
  let records = 0;
-
  for await (const chunk of stream) {
    records += 1;
    if (records < limit) {
      sample.push(chunk);
    }
  }
-
  return { sample, records };
};
-
Worker.prototype.count.metadata = {
  options: {
    filename: { required: true }
  }
};
-
// Get a set of unique entries from a uniqueFunction
// This could be large
Worker.prototype.getUniqueSet = async function (options) {
  const existingFiles = getStringArray(options.filenames);
  const sample = {};
-
  let { uniqueFunction } = options;
  if (!uniqueFunction) {
    uniqueFunction = (o) => JSON.stringify(o);
  }
  const uniqueSet = new Set();
-
  for (const filename of existingFiles) {
    const { stream: existsStream } = await this.fileToObjectStream({ filename });
    await pipeline(
@@ -1082,22 +969,18 @@ Worker.prototype.getUniqueSet = async function (options) {
  }
  return { uniqueFunction, uniqueSet, sample };
};
-
Worker.prototype.getUniqueStream = async function (options) {
  const includeDuplicateSourceRecords = bool(options.includeDuplicateSourceRecords, false);
-
  const { uniqueSet, uniqueFunction, sample } = await this.getUniqueSet({
    filenames: options.existingFiles,
    uniqueFunction: options.uniqueFunction
  });
-
  const { stream: inStream } = await this.fileToObjectStream(options);
  const uniqueStream = inStream.pipe(
    new Transform({
      objectMode: true,
      transform(d, enc, cb) {
        const v = uniqueFunction(makeStrings(d)) || '';
-
        if (!v) {
          // falsey unique function includes
          // by default
@@ -1117,7 +1000,6 @@ Worker.prototype.getUniqueStream = async function (options) {
  );
  return { stream: uniqueStream, sample };
};
-
Worker.prototype.getUniqueStream.metadata = {
  options: {
    existingFiles: {},
@@ -1134,7 +1016,6 @@ Worker.prototype.getUniqueFile = async function (options) {
  const { filename, records } = await this.objectStreamToFile({ stream });
  return { filename, records, sample };
};
-
Worker.prototype.getUniqueFile.metadata = {
  options: {
    existingFiles: {},
@@ -1146,7 +1027,6 @@ Worker.prototype.getUniqueFile.metadata = {
    }
  }
};
-
/*
diff that allows for unordered files, and doesn't store full objects in memory.
Requires 2 passes of the files,
@@ -1155,14 +1035,12 @@ but that's a better tradeoff than trying to store huge files in memory
Worker.prototype.diff = async function (options) {
  const { fileA, fileB, uniqueFunction: ufOpt, columns, includeDuplicateSourceRecords } = options;
  if (options.fields) throw new Error('fields is deprecated, use columns');
-
  if (ufOpt && columns) throw new Error('fields and uniqueFunction cannot both be specified');
  let uniqueFunction = ufOpt;
  if (!uniqueFunction && columns) {
    const farr = getStringArray(columns);
    uniqueFunction = (o) => farr.map((f) => o[f] || '').join('.');
  }
-
  const left = await this.getUniqueFile({
    existingFiles: [fileB],
    filename: fileA,
@@ -1175,7 +1053,6 @@ Worker.prototype.diff = async function (options) {
    uniqueFunction,
    includeDuplicateSourceRecords
  });
-
  return {
    left,
    right
@@ -1192,5 +1069,4 @@ Worker.prototype.diff.metadata = {
    }
  }
};
-
-module.exports = Worker;
+export default Worker;
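Because `module.exports = Worker` is now `export default Worker`, consumers of 2.x load this module as an ES module. A hedged sketch of the consumer-side change; the deep-import specifier is illustrative and depends on the package's exports map, and the +2/-1 change to package/package.json is consistent with, but does not by itself confirm, a `"type": "module"` switch:

  // ESM consumers import the default export directly:
  import FileWorker from '@engine9-io/input-tools/file/FileUtilities.js'; // illustrative specifier

  const worker = new FileWorker({ accountId: 'acme' }); // hypothetical accountId
  const { stream } = await worker.fileToObjectStream({ filename: 'contacts.csv' }); // hypothetical file

  // CommonJS consumers on most Node versions cannot require() an ES module;
  // they would switch to dynamic import(), with the worker on .default:
  //   const { default: FileWorker } = await import('@engine9-io/input-tools/file/FileUtilities.js');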